Actual source code: baij2.c

petsc-3.5.4 2015-05-23
Report Typos and Errors
  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <petsc-private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>
  6: #if defined(PETSC_THREADCOMM_ACTIVE)
  7: #include <petscthreadcomm.h>
  8: #endif

 12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 13: {
 14:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 16:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 17:   const PetscInt *idx;
 18:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 19:   PetscBT        table;

 22:   m  = a->mbs;
 23:   ai = a->i;
 24:   aj = a->j;
 25:   bs = A->rmap->bs;

 27:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 29:   PetscBTCreate(m,&table);
 30:   PetscMalloc1((m+1),&nidx);
 31:   PetscMalloc1((A->rmap->N+1),&nidx2);

 33:   for (i=0; i<is_max; i++) {
 34:     /* Initialise the two local arrays */
 35:     isz  = 0;
 36:     PetscBTMemzero(m,table);

 38:     /* Extract the indices, assume there can be duplicate entries */
 39:     ISGetIndices(is[i],&idx);
 40:     ISGetLocalSize(is[i],&n);

 42:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 43:     for (j=0; j<n; ++j) {
 44:       ival = idx[j]/bs; /* convert the indices into block indices */
 45:       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 46:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 47:     }
 48:     ISRestoreIndices(is[i],&idx);
 49:     ISDestroy(&is[i]);

 51:     k = 0;
 52:     for (j=0; j<ov; j++) { /* for each overlap*/
 53:       n = isz;
 54:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 55:         row   = nidx[k];
 56:         start = ai[row];
 57:         end   = ai[row+1];
 58:         for (l = start; l<end; l++) {
 59:           val = aj[l];
 60:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 61:         }
 62:       }
 63:     }
 64:     /* expand the Index Set */
 65:     for (j=0; j<isz; j++) {
 66:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 67:     }
 68:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 69:   }
 70:   PetscBTDestroy(&table);
 71:   PetscFree(nidx);
 72:   PetscFree(nidx2);
 73:   return(0);
 74: }

 78: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 79: {
 80:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 82:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 83:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 84:   const PetscInt *irow,*icol;
 85:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 86:   PetscInt       *aj = a->j,*ai = a->i;
 87:   MatScalar      *mat_a;
 88:   Mat            C;
 89:   PetscBool      flag,sorted;

 92:   ISSorted(iscol,&sorted);
 93:   if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 95:   ISGetIndices(isrow,&irow);
 96:   ISGetIndices(iscol,&icol);
 97:   ISGetLocalSize(isrow,&nrows);
 98:   ISGetLocalSize(iscol,&ncols);

100:   PetscCalloc1((1+oldcols),&smap);
101:   ssmap = smap;
102:   PetscMalloc1((1+nrows),&lens);
103:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104:   /* determine lens of each row */
105:   for (i=0; i<nrows; i++) {
106:     kstart  = ai[irow[i]];
107:     kend    = kstart + a->ilen[irow[i]];
108:     lens[i] = 0;
109:     for (k=kstart; k<kend; k++) {
110:       if (ssmap[aj[k]]) lens[i]++;
111:     }
112:   }
113:   /* Create and fill new matrix */
114:   if (scall == MAT_REUSE_MATRIX) {
115:     c = (Mat_SeqBAIJ*)((*B)->data);

117:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
121:     C    = *B;
122:   } else {
123:     MatCreate(PetscObjectComm((PetscObject)A),&C);
124:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
125:     MatSetType(C,((PetscObject)A)->type_name);
126:     MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
127:   }
128:   c = (Mat_SeqBAIJ*)(C->data);
129:   for (i=0; i<nrows; i++) {
130:     row      = irow[i];
131:     kstart   = ai[row];
132:     kend     = kstart + a->ilen[row];
133:     mat_i    = c->i[i];
134:     mat_j    = c->j + mat_i;
135:     mat_a    = c->a + mat_i*bs2;
136:     mat_ilen = c->ilen + i;
137:     for (k=kstart; k<kend; k++) {
138:       if ((tcol=ssmap[a->j[k]])) {
139:         *mat_j++ = tcol - 1;
140:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
141:         mat_a   += bs2;
142:         (*mat_ilen)++;
143:       }
144:     }
145:   }

147:   /* Free work space */
148:   ISRestoreIndices(iscol,&icol);
149:   PetscFree(smap);
150:   PetscFree(lens);
151:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
152:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

154:   ISRestoreIndices(isrow,&irow);
155:   *B   = C;
156:   return(0);
157: }

161: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
162: {
163:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
164:   IS             is1,is2;
166:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
167:   const PetscInt *irow,*icol;

170:   ISGetIndices(isrow,&irow);
171:   ISGetIndices(iscol,&icol);
172:   ISGetLocalSize(isrow,&nrows);
173:   ISGetLocalSize(iscol,&ncols);

175:   /* Verify if the indices corespond to each element in a block
176:    and form the IS with compressed IS */
177:   PetscMalloc2(a->mbs,&vary,a->mbs,&iary);
178:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
179:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
180:   count = 0;
181:   for (i=0; i<a->mbs; i++) {
182:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
183:     if (vary[i]==bs) iary[count++] = i;
184:   }
185:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

187:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
188:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
189:   count = 0;
190:   for (i=0; i<a->mbs; i++) {
191:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
192:     if (vary[i]==bs) iary[count++] = i;
193:   }
194:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
195:   ISRestoreIndices(isrow,&irow);
196:   ISRestoreIndices(iscol,&icol);
197:   PetscFree2(vary,iary);

199:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
200:   ISDestroy(&is1);
201:   ISDestroy(&is2);
202:   return(0);
203: }

207: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
208: {
210:   PetscInt       i;

213:   if (scall == MAT_INITIAL_MATRIX) {
214:     PetscMalloc1((n+1),B);
215:   }

217:   for (i=0; i<n; i++) {
218:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
219:   }
220:   return(0);
221: }


224: /* -------------------------------------------------------*/
225: /* Should check that shapes of vectors and matrices match */
226: /* -------------------------------------------------------*/

228: #if defined(PETSC_THREADCOMM_ACTIVE)
229: PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
230: {
231:   PetscErrorCode    ierr;
232:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
233:   PetscScalar       *z;
234:   const PetscScalar *x;
235:   const MatScalar   *aa;
236:   PetscInt          *trstarts=A->rmap->trstarts;
237:   PetscInt          n,start,end,i;
238:   const PetscInt    *aj,*ai;
239:   PetscScalar       sum;

241:   VecGetArrayRead(xx,&x);
242:   VecGetArray(zz,&z);
243:   start = trstarts[thread_id];
244:   end   = trstarts[thread_id+1];
245:   ai    = a->i;
246:   for (i=start; i<end; i++) {
247:     n   = ai[i+1] - ai[i];
248:     aj  = a->j + ai[i];
249:     aa  = a->a + ai[i];
250:     sum = 0.0;
251:     PetscSparseDensePlusDot(sum,x,aa,aj,n);
252:     z[i] = sum;
253:   }
254:   VecRestoreArrayRead(xx,&x);
255:   VecRestoreArray(zz,&z);
256:   return 0;
257: }

261: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
262: {
263:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
264:   PetscScalar       *z,sum;
265:   const PetscScalar *x;
266:   const MatScalar   *v;
267:   PetscErrorCode    ierr;
268:   PetscInt          mbs,i,n;
269:   const PetscInt    *idx,*ii,*ridx=NULL;
270:   PetscBool         usecprow=a->compressedrow.use;


274:   if (usecprow) {
275:     VecGetArrayRead(xx,&x);
276:     VecGetArray(zz,&z);
277:     mbs  = a->compressedrow.nrows;
278:     ii   = a->compressedrow.i;
279:     ridx = a->compressedrow.rindex;
280:     PetscMemzero(z,mbs*sizeof(PetscScalar));
281:     for (i=0; i<mbs; i++) {
282:       n   = ii[i+1] - ii[i];
283:       v   = a->a + ii[i];
284:       idx = a->j + ii[i];
285:       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
286:       PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
287:       sum = 0.0;
288:       PetscSparseDensePlusDot(sum,x,v,idx,n);
289:       z[ridx[i]] = sum;
290:     }
291:     VecRestoreArrayRead(xx,&x);
292:     VecRestoreArray(zz,&z);
293:   } else {
294:     PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);
295:   }
296:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
297:   return(0);
298: }
299: #else
302: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
303: {
304:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
305:   PetscScalar       *z,sum;
306:   const PetscScalar *x;
307:   const MatScalar   *v;
308:   PetscErrorCode    ierr;
309:   PetscInt          mbs,i,n;
310:   const PetscInt    *idx,*ii,*ridx=NULL;
311:   PetscBool         usecprow=a->compressedrow.use;

314:   VecGetArrayRead(xx,&x);
315:   VecGetArray(zz,&z);

317:   if (usecprow) {
318:     mbs  = a->compressedrow.nrows;
319:     ii   = a->compressedrow.i;
320:     ridx = a->compressedrow.rindex;
321:     PetscMemzero(z,mbs*sizeof(PetscScalar));
322:   } else {
323:     mbs = a->mbs;
324:     ii  = a->i;
325:   }

327:   for (i=0; i<mbs; i++) {
328:     n   = ii[1] - ii[0];
329:     v   = a->a + ii[0];
330:     idx = a->j + ii[0];
331:     ii++;
332:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
333:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
334:     sum = 0.0;
335:     PetscSparseDensePlusDot(sum,x,v,idx,n);
336:     if (usecprow) {
337:       z[ridx[i]] = sum;
338:     } else {
339:       z[i]        = sum;
340:     }
341:   }
342:   VecRestoreArrayRead(xx,&x);
343:   VecRestoreArray(zz,&z);
344:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
345:   return(0);
346: }
347: #endif

349: #if defined(PETSC_THREADCOMM_ACTIVE)
350: PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
351: {
352:   PetscErrorCode    ierr;
353:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
354:   PetscScalar       *z,x1,x2,sum1,sum2;
355:   const PetscScalar *x,*xb;
356:   const MatScalar   *aa;
357:   PetscInt          *trstarts=A->rmap->trstarts;
358:   PetscInt          n,start,end,i,j;
359:   const PetscInt    *aj,*ai;

361:   VecGetArrayRead(xx,&x);
362:   VecGetArray(zz,&z);
363:   start  = trstarts[thread_id] / 2;
364:   end    = trstarts[thread_id+1] / 2;
365:   ai     = a->i;
366:   for (i=start; i<end; i++) {
367:     n    = ai[i+1] - ai[i];
368:     aj   = a->j + ai[i];
369:     aa   = a->a + ai[i]*4;
370:     sum1 = 0.0; sum2 = 0.0;
371:     for (j=0; j<n; j++) {
372:       xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
373:       sum1 += aa[4*j]*x1   + aa[4*j+2]*x2;
374:       sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
375:     }
376:     z[2*i] = sum1; z[2*i+1] = sum2;
377:   }
378:   VecRestoreArrayRead(xx,&x);
379:   VecRestoreArray(zz,&z);
380:   return 0;
381: }

385: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
386: {
387:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
388:   PetscScalar       *z,x1,x2,sum1,sum2;
389:   const PetscScalar *x,*xb;
390:   const MatScalar   *v;
391:   PetscErrorCode    ierr;
392:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
393:   PetscBool         usecprow=a->compressedrow.use;


397:   if (usecprow) {
398:     VecGetArrayRead(xx,&x);
399:     VecGetArray(zz,&z);
400:     mbs  = a->compressedrow.nrows;
401:     ii   = a->compressedrow.i;
402:     ridx = a->compressedrow.rindex;
403:     for (i=0; i<mbs; i++) {
404:       n    = ii[i+1] - ii[i];
405:       idx  = a->j + ii[i];
406:       v    = a->a + ii[i]*4;
407:       sum1 = 0.0; sum2 = 0.0;
408:       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
409:       PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410:       for (j=0; j<n; j++) {
411:         xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
412:         sum1 += v[4*j]*x1   + v[4*j+2]*x2;
413:         sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
414:       }
415:       z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
416:     }
417:     VecRestoreArrayRead(xx,&x);
418:     VecRestoreArray(zz,&z);
419:   } else {
420:     PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);
421:   }
422:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
423:   return(0);
424: }
425: #else
428: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
429: {
430:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
431:   PetscScalar       *z = 0,sum1,sum2,*zarray;
432:   const PetscScalar *x,*xb;
433:   PetscScalar       x1,x2;
434:   const MatScalar   *v;
435:   PetscErrorCode    ierr;
436:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
437:   PetscBool         usecprow=a->compressedrow.use;

440:   VecGetArrayRead(xx,&x);
441:   VecGetArray(zz,&zarray);

443:   idx = a->j;
444:   v   = a->a;
445:   if (usecprow) {
446:     mbs  = a->compressedrow.nrows;
447:     ii   = a->compressedrow.i;
448:     ridx = a->compressedrow.rindex;
449:   } else {
450:     mbs = a->mbs;
451:     ii  = a->i;
452:     z   = zarray;
453:   }

455:   for (i=0; i<mbs; i++) {
456:     n           = ii[1] - ii[0]; ii++;
457:     sum1        = 0.0; sum2 = 0.0;
458:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
459:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
460:     for (j=0; j<n; j++) {
461:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
462:       sum1 += v[0]*x1 + v[2]*x2;
463:       sum2 += v[1]*x1 + v[3]*x2;
464:       v    += 4;
465:     }
466:     if (usecprow) z = zarray + 2*ridx[i];
467:     z[0] = sum1; z[1] = sum2;
468:     if (!usecprow) z += 2;
469:   }
470:   VecRestoreArrayRead(xx,&x);
471:   VecRestoreArray(zz,&zarray);
472:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
473:   return(0);
474: }
475: #endif

477: #if defined(PETSC_THREADCOMM_ACTIVE)
478: PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
479: {
480:   PetscErrorCode    ierr;
481:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
482:   PetscScalar       *z,x1,x2,x3,sum1,sum2,sum3;
483:   const PetscScalar *x,*xb;
484:   const MatScalar   *aa;
485:   PetscInt          *trstarts=A->rmap->trstarts;
486:   PetscInt          n,start,end,i,j;
487:   const PetscInt    *aj,*ai;

489:   VecGetArrayRead(xx,&x);
490:   VecGetArray(zz,&z);
491:   start  = trstarts[thread_id] / 3;
492:   end    = trstarts[thread_id+1] / 3;
493:   ai     = a->i;
494:   for (i=start; i<end; i++) {
495:     n    = ai[i+1] - ai[i];
496:     aj   = a->j + ai[i];
497:     aa   = a->a + ai[i]*9;
498:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
499:     for (j=0; j<n; j++) {
500:       xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
501:       sum1 += aa[9*j]*x1   + aa[9*j+3]*x2 + aa[9*j+6]*x3;
502:       sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
503:       sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
504:     }
505:     z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
506:   }
507:   VecRestoreArrayRead(xx,&x);
508:   VecRestoreArray(zz,&z);
509:   return 0;
510: }

514: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
515: {
516:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
517:   PetscScalar       *z,sum1,sum2,sum3,x1,x2,x3;
518:   const PetscScalar *x,*xb;
519:   const MatScalar   *v;
520:   PetscErrorCode    ierr;
521:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
522:   PetscBool         usecprow=a->compressedrow.use;


525: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
526: #pragma disjoint(*v,*z,*xb)
527: #endif


531:   if (usecprow) {
532:     VecGetArrayRead(xx,&x);
533:     VecGetArray(zz,&z);
534:     mbs  = a->compressedrow.nrows;
535:     ii   = a->compressedrow.i;
536:     ridx = a->compressedrow.rindex;
537:     for (i=0; i<mbs; i++) {
538:       n    = ii[i+1] - ii[i];
539:       idx  = a->j + ii[i];
540:       v    = a->a + ii[i]*9;
541:       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
542:       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
543:       PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
544:       for (j=0; j<n; j++) {
545:         xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
546:         sum1 += v[9*j]*x1   + v[9*j+3]*x2 + v[9*j+6]*x3;
547:         sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
548:         sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
549:       }
550:       z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
551:     }
552:     VecRestoreArrayRead(xx,&x);
553:     VecRestoreArray(zz,&z);
554:   } else {
555:     PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);
556:   }
557:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
558:   return(0);
559: }
560: #else
563: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
564: {
565:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
566:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
567:   const PetscScalar *x,*xb;
568:   const MatScalar   *v;
569:   PetscErrorCode    ierr;
570:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
571:   PetscBool         usecprow=a->compressedrow.use;


574: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
575: #pragma disjoint(*v,*z,*xb)
576: #endif

579:   VecGetArrayRead(xx,&x);
580:   VecGetArray(zz,&zarray);

582:   idx = a->j;
583:   v   = a->a;
584:   if (usecprow) {
585:     mbs  = a->compressedrow.nrows;
586:     ii   = a->compressedrow.i;
587:     ridx = a->compressedrow.rindex;
588:   } else {
589:     mbs = a->mbs;
590:     ii  = a->i;
591:     z   = zarray;
592:   }

594:   for (i=0; i<mbs; i++) {
595:     n           = ii[1] - ii[0]; ii++;
596:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
597:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
598:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
599:     for (j=0; j<n; j++) {
600:       xb = x + 3*(*idx++);
601:       x1 = xb[0];
602:       x2 = xb[1];
603:       x3 = xb[2];

605:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
606:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
607:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
608:       v    += 9;
609:     }
610:     if (usecprow) z = zarray + 3*ridx[i];
611:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
612:     if (!usecprow) z += 3;
613:   }
614:   VecRestoreArrayRead(xx,&x);
615:   VecRestoreArray(zz,&zarray);
616:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
617:   return(0);
618: }
619: #endif

621: #if defined(PETSC_THREADCOMM_ACTIVE)
622: PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
623: {
624:   PetscErrorCode    ierr;
625:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
626:   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
627:   const PetscScalar *x,*xb;
628:   const MatScalar   *aa;
629:   PetscInt          *trstarts=A->rmap->trstarts;
630:   PetscInt          n,start,end,i,j;
631:   const PetscInt    *aj,*ai;

633:   VecGetArrayRead(xx,&x);
634:   VecGetArray(zz,&z);
635:   start  = trstarts[thread_id] / 4;
636:   end    = trstarts[thread_id+1] / 4;
637:   ai     = a->i;
638:   for (i=start; i<end; i++) {
639:     n    = ai[i+1] - ai[i];
640:     aj   = a->j + ai[i];
641:     aa   = a->a + ai[i]*16;
642:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
643:     for (j=0; j<n; j++) {
644:       xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
645:       sum1 += aa[16*j]*x1   + aa[16*j+4]*x2 + aa[16*j+8]*x3  + aa[16*j+12]*x4;
646:       sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3  + aa[16*j+13]*x4;
647:       sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
648:       sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
649:     }
650:     z[4*i]   = sum1; z[4*i+1] = sum2;
651:     z[4*i+2] = sum3; z[4*i+3] = sum4;
652:   }
653:   VecRestoreArrayRead(xx,&x);
654:   VecRestoreArray(zz,&z);
655:   return 0;
656: }

660: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
661: {
662:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
663:   PetscScalar       *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
664:   const PetscScalar *x,*xb;
665:   const MatScalar   *v;
666:   PetscErrorCode    ierr;
667:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
668:   PetscBool         usecprow=a->compressedrow.use;


672:   if (usecprow) {
673:     VecGetArrayRead(xx,&x);
674:     VecGetArray(zz,&z);
675:     mbs  = a->compressedrow.nrows;
676:     ii   = a->compressedrow.i;
677:     ridx = a->compressedrow.rindex;
678:     for (i=0; i<mbs; i++) {
679:       n = ii[i+1] - ii[1];
680:       idx  = a->j + ii[i];
681:       v    = a->a + ii[i]*16;
682:       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
683:       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
684:       PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
685:       for (j=0; j<n; j++) {
686:         xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
687:         sum1 += v[16*j]*x1   + v[16*j+4]*x2 + v[16*j+8]*x3  + v[16*j+12]*x4;
688:         sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3  + v[16*j+13]*x4;
689:         sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
690:         sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
691:       }
692:       z[4*ridx[i]]   = sum1; z[4*ridx[i]+1] = sum2;
693:       z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
694:     }
695:     VecRestoreArrayRead(xx,&x);
696:     VecRestoreArray(zz,&z);
697:   } else {
698:     PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);
699:   }
700:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
701:   return(0);
702: }
703: #else
706: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
707: {
708:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
709:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
710:   const PetscScalar *x,*xb;
711:   const MatScalar   *v;
712:   PetscErrorCode    ierr;
713:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
714:   PetscBool         usecprow=a->compressedrow.use;

717:   VecGetArrayRead(xx,&x);
718:   VecGetArray(zz,&zarray);

720:   idx = a->j;
721:   v   = a->a;
722:   if (usecprow) {
723:     mbs  = a->compressedrow.nrows;
724:     ii   = a->compressedrow.i;
725:     ridx = a->compressedrow.rindex;
726:   } else {
727:     mbs = a->mbs;
728:     ii  = a->i;
729:     z   = zarray;
730:   }

732:   for (i=0; i<mbs; i++) {
733:     n = ii[1] - ii[0];
734:     ii++;
735:     sum1 = 0.0;
736:     sum2 = 0.0;
737:     sum3 = 0.0;
738:     sum4 = 0.0;

740:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
741:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
742:     for (j=0; j<n; j++) {
743:       xb    = x + 4*(*idx++);
744:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
745:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
746:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
747:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
748:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
749:       v    += 16;
750:     }
751:     if (usecprow) z = zarray + 4*ridx[i];
752:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
753:     if (!usecprow) z += 4;
754:   }
755:   VecRestoreArrayRead(xx,&x);
756:   VecRestoreArray(zz,&zarray);
757:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
758:   return(0);
759: }
760: #endif

762: #if defined(PETSC_THREADCOMM_ACTIVE)
763: PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
764: {
765:   PetscErrorCode    ierr;
766:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
767:   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
768:   const PetscScalar *x,*xb;
769:   const MatScalar   *aa;
770:   PetscInt          *trstarts=A->rmap->trstarts;
771:   PetscInt          n,start,end,i,j;
772:   const PetscInt    *aj,*ai;

774:   VecGetArrayRead(xx,&x);
775:   VecGetArray(zz,&z);
776:   start  = trstarts[thread_id] / 5;
777:   end    = trstarts[thread_id+1] / 5;
778:   ai     = a->i;
779:   for (i=start; i<end; i++) {
780:     n    = ai[i+1] - ai[i];
781:     aj   = a->j + ai[i];
782:     aa   = a->a + ai[i]*25;
783:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
784:     for (j=0; j<n; j++) {
785:       xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
786:       sum1 += aa[25*j]*x1   + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
787:       sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
788:       sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
789:       sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
790:       sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
791:     }
792:     z[5*i]   = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
793:     z[5*i+3] = sum4; z[5*i+4] = sum5;
794:   }
795:   VecRestoreArrayRead(xx,&x);
796:   VecRestoreArray(zz,&z);
797:   return 0;
798: }

802: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
803: {
804:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
805:   PetscScalar       *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
806:   const PetscScalar *xb,*x;
807:   const MatScalar   *v;
808:   PetscErrorCode    ierr;
809:   const PetscInt    *idx,*ii,*ridx=NULL;
810:   PetscInt          mbs,i,j,n;
811:   PetscBool         usecprow=a->compressedrow.use;


815:   if (usecprow) {
816:     VecGetArrayRead(xx,&x);
817:     VecGetArray(zz,&z);
818:     mbs  = a->compressedrow.nrows;
819:     ii   = a->compressedrow.i;
820:     ridx = a->compressedrow.rindex;
821:     for (i=0; i<mbs; i++) {
822:       n    = ii[i+1] - ii[i];
823:       idx  = a->j + ii[i];
824:       v    = a->a + ii[i]*25;
825:       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
826:       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
827:       PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
828:       for (j=0; j<n; j++) {
829:         xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
830:         sum1 += v[25*j]*x1   + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
831:         sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
832:         sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
833:         sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
834:         sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
835:       }
836:       z[5*ridx[i]]   = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
837:       z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
838:     }
839:     VecRestoreArrayRead(xx,&x);
840:     VecRestoreArray(zz,&z);
841:   } else {
842:     PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);
843:   }
844:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
845:   return(0);
846: }
847: #else
850: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
851: {
852:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
853:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
854:   const PetscScalar *xb,*x;
855:   const MatScalar   *v;
856:   PetscErrorCode    ierr;
857:   const PetscInt    *idx,*ii,*ridx=NULL;
858:   PetscInt          mbs,i,j,n;
859:   PetscBool         usecprow=a->compressedrow.use;

862:   VecGetArrayRead(xx,&x);
863:   VecGetArray(zz,&zarray);

865:   idx = a->j;
866:   v   = a->a;
867:   if (usecprow) {
868:     mbs  = a->compressedrow.nrows;
869:     ii   = a->compressedrow.i;
870:     ridx = a->compressedrow.rindex;
871:   } else {
872:     mbs = a->mbs;
873:     ii  = a->i;
874:     z   = zarray;
875:   }

877:   for (i=0; i<mbs; i++) {
878:     n           = ii[1] - ii[0]; ii++;
879:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
880:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
881:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
882:     for (j=0; j<n; j++) {
883:       xb    = x + 5*(*idx++);
884:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
885:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
886:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
887:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
888:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
889:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
890:       v    += 25;
891:     }
892:     if (usecprow) z = zarray + 5*ridx[i];
893:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
894:     if (!usecprow) z += 5;
895:   }
896:   VecRestoreArrayRead(xx,&x);
897:   VecRestoreArray(zz,&zarray);
898:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
899:   return(0);
900: }
901: #endif


904: #if defined(PETSC_THREADCOMM_ACTIVE)
/*
   MatMult_SeqBAIJ_6_Kernel - Computes one thread's share of zz = A*xx for 6x6 blocks.

   Input:
     thread_id - selects the range [trstarts[thread_id], trstarts[thread_id+1]) from A->rmap->trstarts
     A         - matrix whose data pointer is interpreted as Mat_SeqBAIJ
     xx        - input vector, accessed read-only

   Output:
     zz        - entries 6*start .. 6*end-1 are overwritten

   Entry (r,c) of the j-th block in a block row is read at aa[36*j + 6*c + r].
   Always returns 0.
*/
905: PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
906: {
907:   PetscErrorCode    ierr;
908:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
909:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
910:   const PetscScalar *x,*xb;
911:   const MatScalar   *aa;
912:   PetscInt          *trstarts=A->rmap->trstarts;
913:   PetscInt          n,start,end,i,j;
914:   const PetscInt    *aj,*ai;

916:   VecGetArrayRead(xx,&x);
917:   VecGetArray(zz,&z);
  /* NOTE(review): trstarts presumably holds scalar row offsets, hence the division by the block size - confirm */
918:   start  = trstarts[thread_id] / 6;
919:   end    = trstarts[thread_id+1] / 6;
920:   ai     = a->i;
921:   for (i=start; i<end; i++) {
    /* n blocks in block row i; aj/aa point at that row's column indices and values */
922:     n    = ai[i+1] - ai[i];
923:     aj   = a->j + ai[i];
924:     aa   = a->a + ai[i]*36;
925:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
926:     for (j=0; j<n; j++) {
      /* xb is the 6-entry slice of x matching block column aj[j] */
927:       xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
928:       sum1 += aa[36*j]*x1   + aa[36*j+6]*x2  + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
929:       sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2  + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
930:       sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2  + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
931:       sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2  + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
932:       sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
933:       sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
934:     }
935:     z[6*i]   = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
936:     z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
937:   }
938:   VecRestoreArrayRead(xx,&x);
939:   VecRestoreArray(zz,&z);
940:   return 0;
941: }

/*
   MatMult_SeqBAIJ_6 - zz = A*xx for block size 6 (variant compiled when PETSC_THREADCOMM_ACTIVE is defined).

   When a->compressedrow.use is set, the product is computed inline over the
   compressed-row list; otherwise the work is handed to MatMult_SeqBAIJ_6_Kernel
   through PetscThreadCommRunKernel() with the three arguments A, xx, zz.

   NOTE(review): the compressed-row branch writes only the block rows listed in
   rindex; confirm the remaining entries of zz are expected to be set elsewhere.
*/
945: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
946: {
947:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
948:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
949:   const PetscScalar *x,*xb;
950:   const MatScalar   *v;
951:   PetscErrorCode    ierr;
952:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
953:   PetscBool         usecprow=a->compressedrow.use;


957:   if (usecprow) {
958:     VecGetArrayRead(xx,&x);
959:     VecGetArray(zz,&z);
960:     mbs  = a->compressedrow.nrows;
961:     ii   = a->compressedrow.i;
962:     ridx = a->compressedrow.rindex;
963:     for (i=0; i<mbs; i++) {
      /* i-th compressed row: n blocks starting at offset ii[i] in a->j / a->a */
964:       n  = ii[i+1] - ii[i];
965:       idx  = a->j + ii[i];
966:       v    = a->a + ii[i]*36;
967:       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
968:       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
969:       PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
970:       for (j=0; j<n; j++) {
        /* block entry (r,c) is read at v[36*j + 6*c + r] */
971:         xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
972:         sum1 += v[36*j]*x1   + v[36*j+6]*x2  + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
973:         sum2 += v[36*j+1]*x1 + v[36*j+7]*x2  + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
974:         sum3 += v[36*j+2]*x1 + v[36*j+8]*x2  + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
975:         sum4 += v[36*j+3]*x1 + v[36*j+9]*x2  + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
976:         sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
977:         sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
978:       }
      /* ridx[i] maps the compressed row back to its block row in zz */
979:       z[6*ridx[i]]   = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
980:       z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
981:     }
982:     VecRestoreArrayRead(xx,&x);
983:     VecRestoreArray(zz,&z);
984:   } else {
985:     PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);
986:   }
  /* 2*36 flops per stored block, less 6 per row counted in nonzerorowcnt */
987:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
988:   return(0);
989: }
990: #else
/*
   MatMult_SeqBAIJ_6 - zz = A*xx for block size 6 (variant compiled when PETSC_THREADCOMM_ACTIVE is not defined).

   Walks the stored blocks once with running pointers idx (block column
   indices) and v (block values, 36 per block). Block entry (r,c) is read at
   v[6*c + r]. With compressed rows in use, only the block rows listed in
   a->compressedrow.rindex are written to zz.
*/
993: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
994: {
995:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
996:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
997:   const PetscScalar *x,*xb;
998:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
999:   const MatScalar   *v;
1000:   PetscErrorCode    ierr;
1001:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1002:   PetscBool         usecprow=a->compressedrow.use;

1005:   VecGetArrayRead(xx,&x);
1006:   VecGetArray(zz,&zarray);

1008:   idx = a->j;
1009:   v   = a->a;
1010:   if (usecprow) {
1011:     mbs  = a->compressedrow.nrows;
1012:     ii   = a->compressedrow.i;
1013:     ridx = a->compressedrow.rindex;
1014:   } else {
1015:     mbs = a->mbs;
1016:     ii  = a->i;
1017:     z   = zarray;
1018:   }

1020:   for (i=0; i<mbs; i++) {
    /* ii is advanced each iteration, so ii[1]-ii[0] is the block count of the current row */
1021:     n  = ii[1] - ii[0];
1022:     ii++;
1023:     sum1 = 0.0;
1024:     sum2 = 0.0;
1025:     sum3 = 0.0;
1026:     sum4 = 0.0;
1027:     sum5 = 0.0;
1028:     sum6 = 0.0;

1030:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1031:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1032:     for (j=0; j<n; j++) {
1033:       xb    = x + 6*(*idx++);
1034:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1035:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1036:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1037:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1038:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1039:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1040:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1041:       v    += 36;
1042:     }
    /* compressed rows: jump to the block row named by ridx[i]; otherwise z advances sequentially */
1043:     if (usecprow) z = zarray + 6*ridx[i];
1044:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1045:     if (!usecprow) z += 6;
1046:   }

1048:   VecRestoreArrayRead(xx,&x);
1049:   VecRestoreArray(zz,&zarray);
  /* 2*36 flops per stored block, less 6 per row counted in nonzerorowcnt */
1050:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
1051:   return(0);
1052: }
1053: #endif

1055: #if defined(PETSC_THREADCOMM_ACTIVE)
/*
   MatMult_SeqBAIJ_7_Kernel - Computes one thread's share of zz = A*xx for 7x7 blocks.

   Input:
     thread_id - selects the range [trstarts[thread_id], trstarts[thread_id+1]) from A->rmap->trstarts
     A         - matrix whose data pointer is interpreted as Mat_SeqBAIJ
     xx        - input vector, accessed read-only

   Output:
     zz        - entries 7*start .. 7*end-1 are overwritten

   Entry (r,c) of the j-th block in a block row is read at aa[49*j + 7*c + r].
   Always returns 0.
*/
1056: PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1057: {
1058:   PetscErrorCode    ierr;
1059:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1060:   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1061:   const PetscScalar *x,*xb;
1062:   const MatScalar   *aa;
1063:   PetscInt          *trstarts=A->rmap->trstarts;
1064:   PetscInt          n,start,end,i,j;
1065:   const PetscInt    *aj,*ai;

1067:   VecGetArrayRead(xx,&x);
1068:   VecGetArray(zz,&z);
  /* NOTE(review): trstarts presumably holds scalar row offsets, hence the division by the block size - confirm */
1069:   start  = trstarts[thread_id] / 7;
1070:   end    = trstarts[thread_id+1] / 7;
1071:   ai     = a->i;
1072:   for (i=start; i<end; i++) {
    /* n blocks in block row i; aj/aa point at that row's column indices and values */
1073:     n    = ai[i+1] - ai[i];
1074:     aj   = a->j + ai[i];
1075:     aa   = a->a + ai[i]*49;
1076:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1077:     for (j=0; j<n; j++) {
      /* xb is the 7-entry slice of x matching block column aj[j] */
1078:       xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1079:       sum1 += aa[49*j]*x1   + aa[49*j+7]*x2  + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1080:       sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2  + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1081:       sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2  + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1082:       sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1083:       sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1084:       sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1085:       sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1086:     }
1087:     z[7*i]   = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1088:     z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1089:   }
1090:   VecRestoreArrayRead(xx,&x);
1091:   VecRestoreArray(zz,&z);
1092:   return 0;
1093: }

/*
   MatMult_SeqBAIJ_7 - zz = A*xx for block size 7 (variant compiled when PETSC_THREADCOMM_ACTIVE is defined).

   When a->compressedrow.use is set, the product is computed inline over the
   compressed-row list; otherwise the work is handed to MatMult_SeqBAIJ_7_Kernel
   through PetscThreadCommRunKernel() with the three arguments A, xx, zz.

   Fix: the compressed-row branch obtains the output array in zarray but used to
   store results through z, which is still the null pointer it was initialised
   to. z is now pointed at zarray + 7*ridx[i] before the stores, matching the
   non-threaded variant of this routine.

   NOTE(review): the compressed-row branch writes only the block rows listed in
   rindex; confirm the remaining entries of zz are expected to be set elsewhere.
*/
1097: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1098: {
1099:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1100:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1101:   const PetscScalar *x,*xb;
1102:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1103:   const MatScalar   *v;
1104:   PetscErrorCode    ierr;
1105:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1106:   PetscBool         usecprow=a->compressedrow.use;


1110:   if (usecprow) {
1111:     VecGetArrayRead(xx,&x);
1112:     VecGetArray(zz,&zarray);
1113:     mbs  = a->compressedrow.nrows;
1114:     ii   = a->compressedrow.i;
1115:     ridx = a->compressedrow.rindex;
1116:     for (i=0; i<mbs; i++) {
      /* i-th compressed row: n blocks starting at offset ii[i] in a->j / a->a */
1117:       n    = ii[i+1] - ii[i];
1118:       idx  = a->j + ii[i];
1119:       v    = a->a + ii[i]*49;
1120:       sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1121:       PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1122:       PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1123:       for (j=0; j<n; j++) {
        /* block entry (r,c) is read at v[49*j + 7*c + r] */
1124:         xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1125:         sum1 += v[49*j]*x1   + v[49*j+7]*x2  + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1126:         sum2 += v[49*j+1]*x1 + v[49*j+8]*x2  + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1127:         sum3 += v[49*j+2]*x1 + v[49*j+9]*x2  + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1128:         sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1129:         sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1130:         sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1131:         sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1132:       }
      /* store through the array returned by VecGetArray(); ridx[i] is the block row in zz */
1133:       z    = zarray + 7*ridx[i];
1134:       z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1135:     }
1136:     VecRestoreArrayRead(xx,&x);
1137:     VecRestoreArray(zz,&zarray);
1138:   } else {
1139:     PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);
1140:   }
  /* 2*49 flops per stored block, less 7 per row counted in nonzerorowcnt */
1141:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
1142:   return(0);
1143: }
1144: #else
/*
   MatMult_SeqBAIJ_7 - zz = A*xx for block size 7 (variant compiled when PETSC_THREADCOMM_ACTIVE is not defined).

   Walks the stored blocks once with running pointers idx (block column
   indices) and v (block values, 49 per block). Block entry (r,c) is read at
   v[7*c + r]. With compressed rows in use, only the block rows listed in
   a->compressedrow.rindex are written to zz.
*/
1147: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1148: {
1149:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1150:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1151:   const PetscScalar *x,*xb;
1152:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
1153:   const MatScalar   *v;
1154:   PetscErrorCode    ierr;
1155:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
1156:   PetscBool         usecprow=a->compressedrow.use;

1159:   VecGetArrayRead(xx,&x);
1160:   VecGetArray(zz,&zarray);

1162:   idx = a->j;
1163:   v   = a->a;
1164:   if (usecprow) {
1165:     mbs  = a->compressedrow.nrows;
1166:     ii   = a->compressedrow.i;
1167:     ridx = a->compressedrow.rindex;
1168:   } else {
1169:     mbs = a->mbs;
1170:     ii  = a->i;
1171:     z   = zarray;
1172:   }

1174:   for (i=0; i<mbs; i++) {
    /* ii is advanced each iteration, so ii[1]-ii[0] is the block count of the current row */
1175:     n  = ii[1] - ii[0];
1176:     ii++;
1177:     sum1 = 0.0;
1178:     sum2 = 0.0;
1179:     sum3 = 0.0;
1180:     sum4 = 0.0;
1181:     sum5 = 0.0;
1182:     sum6 = 0.0;
1183:     sum7 = 0.0;

1185:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1186:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1187:     for (j=0; j<n; j++) {
1188:       xb    = x + 7*(*idx++);
1189:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1190:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1191:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1192:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1193:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1194:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1195:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1196:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1197:       v    += 49;
1198:     }
    /* compressed rows: jump to the block row named by ridx[i]; otherwise z advances sequentially */
1199:     if (usecprow) z = zarray + 7*ridx[i];
1200:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1201:     if (!usecprow) z += 7;
1202:   }

1204:   VecRestoreArrayRead(xx,&x);
1205:   VecRestoreArray(zz,&zarray);
  /* 2*49 flops per stored block, less 7 per row counted in nonzerorowcnt */
1206:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
1207:   return(0);
1208: }
1209: #endif

1211: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1212: /* Default MatMult for block size 15 */

/*
   MatMult_SeqBAIJ_15_ver1 - zz = A*xx for block size 15, one block column per inner iteration.

   For each stored block the k loop takes one entry xv of x and adds
   v[0..14]*xv into the 15 row accumulators, then advances v by 15. So block
   entry (r,c) is consumed at offset 15*c + r within the block's 225 values.
   v is a running pointer that is never reset per row, while idx is recomputed
   from ii[i] for each row.
*/
1216: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1217: {
1218:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1219:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1220:   const PetscScalar *x,*xb;
1221:   PetscScalar       *zarray,xv;
1222:   const MatScalar   *v;
1223:   PetscErrorCode    ierr;
1224:   const PetscInt    *ii,*ij=a->j,*idx;
1225:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1226:   PetscBool         usecprow=a->compressedrow.use;

1229:   VecGetArrayRead(xx,&x);
1230:   VecGetArray(zz,&zarray);

1232:   v = a->a;
1233:   if (usecprow) {
1234:     mbs  = a->compressedrow.nrows;
1235:     ii   = a->compressedrow.i;
1236:     ridx = a->compressedrow.rindex;
1237:   } else {
1238:     mbs = a->mbs;
1239:     ii  = a->i;
1240:     z   = zarray;
1241:   }

1243:   for (i=0; i<mbs; i++) {
1244:     n    = ii[i+1] - ii[i];
1245:     idx  = ij + ii[i];
1246:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1247:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1249:     for (j=0; j<n; j++) {
1250:       xb = x + 15*(idx[j]);

      /* one block column per pass: 15 values at v scaled by xb[k] */
1252:       for (k=0; k<15; k++) {
1253:         xv     =  xb[k];
1254:         sum1  += v[0]*xv;
1255:         sum2  += v[1]*xv;
1256:         sum3  += v[2]*xv;
1257:         sum4  += v[3]*xv;
1258:         sum5  += v[4]*xv;
1259:         sum6  += v[5]*xv;
1260:         sum7  += v[6]*xv;
1261:         sum8  += v[7]*xv;
1262:         sum9  += v[8]*xv;
1263:         sum10 += v[9]*xv;
1264:         sum11 += v[10]*xv;
1265:         sum12 += v[11]*xv;
1266:         sum13 += v[12]*xv;
1267:         sum14 += v[13]*xv;
1268:         sum15 += v[14]*xv;
1269:         v     += 15;
1270:       }
1271:     }
    /* compressed rows: jump to the block row named by ridx[i]; otherwise z advances sequentially */
1272:     if (usecprow) z = zarray + 15*ridx[i];
1273:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1274:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1276:     if (!usecprow) z += 15;
1277:   }

1279:   VecRestoreArrayRead(xx,&x);
1280:   VecRestoreArray(zz,&zarray);
  /* 2*225 flops per stored block, less 15 per row counted in nonzerorowcnt */
1281:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1282:   return(0);
1283: }

1285: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
/*
   MatMult_SeqBAIJ_15_ver2 - zz = A*xx for block size 15, block columns processed in groups of 4,4,4,3.

   Within a group, column c of the group is read at v[15*c + r] for row r.
   v is advanced by 60 after each 4-column group and by 45 after the final
   3-column group, i.e. 225 values per block in total. v is a running pointer
   that is never reset per row.
*/
1288: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1289: {
1290:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1291:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1292:   const PetscScalar *x,*xb;
1293:   PetscScalar       x1,x2,x3,x4,*zarray;
1294:   const MatScalar   *v;
1295:   PetscErrorCode    ierr;
1296:   const PetscInt    *ii,*ij=a->j,*idx;
1297:   PetscInt          mbs,i,j,n,*ridx=NULL;
1298:   PetscBool         usecprow=a->compressedrow.use;

1301:   VecGetArrayRead(xx,&x);
1302:   VecGetArray(zz,&zarray);

1304:   v = a->a;
1305:   if (usecprow) {
1306:     mbs  = a->compressedrow.nrows;
1307:     ii   = a->compressedrow.i;
1308:     ridx = a->compressedrow.rindex;
1309:   } else {
1310:     mbs = a->mbs;
1311:     ii  = a->i;
1312:     z   = zarray;
1313:   }

1315:   for (i=0; i<mbs; i++) {
1316:     n    = ii[i+1] - ii[i];
1317:     idx  = ij + ii[i];
1318:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1319:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1321:     for (j=0; j<n; j++) {
      /* block columns 0-3 */
1322:       xb = x + 15*(idx[j]);
1323:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1325:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1326:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1327:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1328:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1329:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1330:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1331:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1332:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1333:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1334:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1335:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1336:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1337:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1338:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1339:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

1341:       v += 60;

      /* block columns 4-7 */
1343:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1345:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1346:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1347:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1348:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1349:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1350:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1351:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1352:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1353:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1354:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1355:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1356:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1357:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1358:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1359:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1360:       v     += 60;

      /* block columns 8-11 */
1362:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1363:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1364:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1365:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1366:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1367:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1368:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1369:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1370:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1371:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1372:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1373:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1374:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1375:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1376:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1377:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1378:       v     += 60;

      /* block columns 12-14 */
1380:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1381:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1382:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1383:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1384:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1385:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1386:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1387:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1388:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1389:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1390:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1391:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1392:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1393:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1394:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1395:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1396:       v     += 45;
1397:     }
    /* compressed rows: jump to the block row named by ridx[i]; otherwise z advances sequentially */
1398:     if (usecprow) z = zarray + 15*ridx[i];
1399:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1400:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1402:     if (!usecprow) z += 15;
1403:   }

1405:   VecRestoreArrayRead(xx,&x);
1406:   VecRestoreArray(zz,&zarray);
  /* 2*225 flops per stored block, less 15 per row counted in nonzerorowcnt */
1407:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1408:   return(0);
1409: }

1411: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
/*
   MatMult_SeqBAIJ_15_ver3 - zz = A*xx for block size 15, block columns processed in groups of 8 and 7.

   Within a group, column c of the group is read at v[15*c + r] for row r.
   v is advanced by 120 after the 8-column group and by 105 after the 7-column
   group, i.e. 225 values per block in total. v is a running pointer that is
   never reset per row.
*/
1414: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1415: {
1416:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1417:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1418:   const PetscScalar *x,*xb;
1419:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1420:   const MatScalar   *v;
1421:   PetscErrorCode    ierr;
1422:   const PetscInt    *ii,*ij=a->j,*idx;
1423:   PetscInt          mbs,i,j,n,*ridx=NULL;
1424:   PetscBool         usecprow=a->compressedrow.use;

1427:   VecGetArrayRead(xx,&x);
1428:   VecGetArray(zz,&zarray);

1430:   v = a->a;
1431:   if (usecprow) {
1432:     mbs  = a->compressedrow.nrows;
1433:     ii   = a->compressedrow.i;
1434:     ridx = a->compressedrow.rindex;
1435:   } else {
1436:     mbs = a->mbs;
1437:     ii  = a->i;
1438:     z   = zarray;
1439:   }

1441:   for (i=0; i<mbs; i++) {
1442:     n    = ii[i+1] - ii[i];
1443:     idx  = ij + ii[i];
1444:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1445:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1447:     for (j=0; j<n; j++) {
      /* block columns 0-7 */
1448:       xb = x + 15*(idx[j]);
1449:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1450:       x8 = xb[7];

1452:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1453:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1454:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1455:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1456:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1457:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1458:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1459:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1460:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1461:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1462:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1463:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1464:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1465:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1466:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1467:       v     += 120;

      /* block columns 8-14 */
1469:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

1471:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1472:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1473:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1474:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1475:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1476:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1477:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1478:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1479:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1480:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1481:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1482:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1483:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1484:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1485:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1486:       v     += 105;
1487:     }
    /* compressed rows: jump to the block row named by ridx[i]; otherwise z advances sequentially */
1488:     if (usecprow) z = zarray + 15*ridx[i];
1489:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1490:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1492:     if (!usecprow) z += 15;
1493:   }

1495:   VecRestoreArrayRead(xx,&x);
1496:   VecRestoreArray(zz,&zarray);
  /* 2*225 flops per stored block, less 15 per row counted in nonzerorowcnt */
1497:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1498:   return(0);
1499: }

1501: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */

/*
   MatMult_SeqBAIJ_15_ver4 - zz = A*xx for block size 15, all 15 block columns in one expression per row.

   All 15 entries of the x slice are loaded into x1..x15 first. Block entry
   (r,c) is read at v[15*c + r], and v is advanced by 225 after each block.
   v is a running pointer that is never reset per row.
*/
1505: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1506: {
1507:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1508:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1509:   const PetscScalar *x,*xb;
1510:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1511:   const MatScalar   *v;
1512:   PetscErrorCode    ierr;
1513:   const PetscInt    *ii,*ij=a->j,*idx;
1514:   PetscInt          mbs,i,j,n,*ridx=NULL;
1515:   PetscBool         usecprow=a->compressedrow.use;

1518:   VecGetArrayRead(xx,&x);
1519:   VecGetArray(zz,&zarray);

1521:   v = a->a;
1522:   if (usecprow) {
1523:     mbs  = a->compressedrow.nrows;
1524:     ii   = a->compressedrow.i;
1525:     ridx = a->compressedrow.rindex;
1526:   } else {
1527:     mbs = a->mbs;
1528:     ii  = a->i;
1529:     z   = zarray;
1530:   }

1532:   for (i=0; i<mbs; i++) {
1533:     n    = ii[i+1] - ii[i];
1534:     idx  = ij + ii[i];
1535:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1536:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1538:     for (j=0; j<n; j++) {
1539:       xb = x + 15*(idx[j]);
1540:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1541:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

1543:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1544:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1545:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1546:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1547:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1548:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1549:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1550:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1551:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1552:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1553:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1554:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1555:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1556:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1557:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1558:       v     += 225;
1559:     }
    /* compressed rows: jump to the block row named by ridx[i]; otherwise z advances sequentially */
1560:     if (usecprow) z = zarray + 15*ridx[i];
1561:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1562:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1564:     if (!usecprow) z += 15;
1565:   }

1567:   VecRestoreArrayRead(xx,&x);
1568:   VecRestoreArray(zz,&zarray);
  /* 2*225 flops per stored block, less 15 per row counted in nonzerorowcnt */
1569:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1570:   return(0);
1571: }


1574: /*
1575:     This will not work with MatScalar == float because it calls the BLAS
1576: */
1579: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1580: {
1581:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1582:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1583:   MatScalar      *v;
1585:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1586:   PetscInt       ncols,k,*ridx=NULL;
1587:   PetscBool      usecprow=a->compressedrow.use;

1590:   VecGetArray(xx,&x);
1591:   VecGetArray(zz,&zarray);

1593:   idx = a->j;
1594:   v   = a->a;
1595:   if (usecprow) {
1596:     mbs  = a->compressedrow.nrows;
1597:     ii   = a->compressedrow.i;
1598:     ridx = a->compressedrow.rindex;
1599:   } else {
1600:     mbs = a->mbs;
1601:     ii  = a->i;
1602:     z   = zarray;
1603:   }

1605:   if (!a->mult_work) {
1606:     k    = PetscMax(A->rmap->n,A->cmap->n);
1607:     PetscMalloc1((k+1),&a->mult_work);
1608:   }
1609:   work = a->mult_work;
1610:   for (i=0; i<mbs; i++) {
1611:     n           = ii[1] - ii[0]; ii++;
1612:     ncols       = n*bs;
1613:     workt       = work;
1614:     for (j=0; j<n; j++) {
1615:       xb = x + bs*(*idx++);
1616:       for (k=0; k<bs; k++) workt[k] = xb[k];
1617:       workt += bs;
1618:     }
1619:     if (usecprow) z = zarray + bs*ridx[i];
1620:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1621:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1622:     v += n*bs2;
1623:     if (!usecprow) z += bs;
1624:   }
1625:   VecRestoreArray(xx,&x);
1626:   VecRestoreArray(zz,&zarray);
1627:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1628:   return(0);
1629: }

1633: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1634: {
1635:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1636:   const PetscScalar *x;
1637:   PetscScalar       *y,*z,sum;
1638:   const MatScalar   *v;
1639:   PetscErrorCode    ierr;
1640:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1641:   const PetscInt    *idx,*ii;
1642:   PetscBool         usecprow=a->compressedrow.use;

1645:   VecGetArrayRead(xx,&x);
1646:   VecGetArray(yy,&y);
1647:   if (zz != yy) {
1648:     VecGetArray(zz,&z);
1649:   } else {
1650:     z = y;
1651:   }

1653:   idx = a->j;
1654:   v   = a->a;
1655:   if (usecprow) {
1656:     if (zz != yy) {
1657:       PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1658:     }
1659:     mbs  = a->compressedrow.nrows;
1660:     ii   = a->compressedrow.i;
1661:     ridx = a->compressedrow.rindex;
1662:   } else {
1663:     ii = a->i;
1664:   }

1666:   for (i=0; i<mbs; i++) {
1667:     n = ii[1] - ii[0];
1668:     ii++;
1669:     if (!usecprow) {
1670:       sum         = y[i];
1671:     } else {
1672:       sum = y[ridx[i]];
1673:     }
1674:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1675:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1676:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1677:     v   += n;
1678:     idx += n;
1679:     if (usecprow) {
1680:       z[ridx[i]] = sum;
1681:     } else {
1682:       z[i] = sum;
1683:     }
1684:   }
1685:   VecRestoreArrayRead(xx,&x);
1686:   VecRestoreArray(yy,&y);
1687:   if (zz != yy) {
1688:     VecRestoreArray(zz,&z);
1689:   }
1690:   PetscLogFlops(2.0*a->nz);
1691:   return(0);
1692: }

1696: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1697: {
1698:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1699:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1700:   PetscScalar    x1,x2,*yarray,*zarray;
1701:   MatScalar      *v;
1703:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1704:   PetscBool      usecprow=a->compressedrow.use;

1707:   VecGetArray(xx,&x);
1708:   VecGetArray(yy,&yarray);
1709:   if (zz != yy) {
1710:     VecGetArray(zz,&zarray);
1711:   } else {
1712:     zarray = yarray;
1713:   }

1715:   idx = a->j;
1716:   v   = a->a;
1717:   if (usecprow) {
1718:     if (zz != yy) {
1719:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1720:     }
1721:     mbs  = a->compressedrow.nrows;
1722:     ii   = a->compressedrow.i;
1723:     ridx = a->compressedrow.rindex;
1724:     if (zz != yy) {
1725:       PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1726:     }
1727:   } else {
1728:     ii = a->i;
1729:     y  = yarray;
1730:     z  = zarray;
1731:   }

1733:   for (i=0; i<mbs; i++) {
1734:     n = ii[1] - ii[0]; ii++;
1735:     if (usecprow) {
1736:       z = zarray + 2*ridx[i];
1737:       y = yarray + 2*ridx[i];
1738:     }
1739:     sum1 = y[0]; sum2 = y[1];
1740:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1741:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1742:     for (j=0; j<n; j++) {
1743:       xb = x + 2*(*idx++);
1744:       x1 = xb[0];
1745:       x2 = xb[1];

1747:       sum1 += v[0]*x1 + v[2]*x2;
1748:       sum2 += v[1]*x1 + v[3]*x2;
1749:       v    += 4;
1750:     }
1751:     z[0] = sum1; z[1] = sum2;
1752:     if (!usecprow) {
1753:       z += 2; y += 2;
1754:     }
1755:   }
1756:   VecRestoreArray(xx,&x);
1757:   VecRestoreArray(yy,&yarray);
1758:   if (zz != yy) {
1759:     VecRestoreArray(zz,&zarray);
1760:   }
1761:   PetscLogFlops(4.0*a->nz);
1762:   return(0);
1763: }

1767: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1768: {
1769:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1770:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1771:   MatScalar      *v;
1773:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1774:   PetscBool      usecprow=a->compressedrow.use;

1777:   VecGetArray(xx,&x);
1778:   VecGetArray(yy,&yarray);
1779:   if (zz != yy) {
1780:     VecGetArray(zz,&zarray);
1781:   } else {
1782:     zarray = yarray;
1783:   }

1785:   idx = a->j;
1786:   v   = a->a;
1787:   if (usecprow) {
1788:     if (zz != yy) {
1789:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1790:     }
1791:     mbs  = a->compressedrow.nrows;
1792:     ii   = a->compressedrow.i;
1793:     ridx = a->compressedrow.rindex;
1794:   } else {
1795:     ii = a->i;
1796:     y  = yarray;
1797:     z  = zarray;
1798:   }

1800:   for (i=0; i<mbs; i++) {
1801:     n = ii[1] - ii[0]; ii++;
1802:     if (usecprow) {
1803:       z = zarray + 3*ridx[i];
1804:       y = yarray + 3*ridx[i];
1805:     }
1806:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1807:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1808:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1809:     for (j=0; j<n; j++) {
1810:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1811:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1812:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1813:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1814:       v    += 9;
1815:     }
1816:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1817:     if (!usecprow) {
1818:       z += 3; y += 3;
1819:     }
1820:   }
1821:   VecRestoreArray(xx,&x);
1822:   VecRestoreArray(yy,&yarray);
1823:   if (zz != yy) {
1824:     VecRestoreArray(zz,&zarray);
1825:   }
1826:   PetscLogFlops(18.0*a->nz);
1827:   return(0);
1828: }

1832: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1833: {
1834:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1835:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1836:   MatScalar      *v;
1838:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1839:   PetscBool      usecprow=a->compressedrow.use;

1842:   VecGetArray(xx,&x);
1843:   VecGetArray(yy,&yarray);
1844:   if (zz != yy) {
1845:     VecGetArray(zz,&zarray);
1846:   } else {
1847:     zarray = yarray;
1848:   }

1850:   idx = a->j;
1851:   v   = a->a;
1852:   if (usecprow) {
1853:     if (zz != yy) {
1854:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1855:     }
1856:     mbs  = a->compressedrow.nrows;
1857:     ii   = a->compressedrow.i;
1858:     ridx = a->compressedrow.rindex;
1859:   } else {
1860:     ii = a->i;
1861:     y  = yarray;
1862:     z  = zarray;
1863:   }

1865:   for (i=0; i<mbs; i++) {
1866:     n = ii[1] - ii[0]; ii++;
1867:     if (usecprow) {
1868:       z = zarray + 4*ridx[i];
1869:       y = yarray + 4*ridx[i];
1870:     }
1871:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1872:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1873:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1874:     for (j=0; j<n; j++) {
1875:       xb    = x + 4*(*idx++);
1876:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1877:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1878:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1879:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1880:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1881:       v    += 16;
1882:     }
1883:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1884:     if (!usecprow) {
1885:       z += 4; y += 4;
1886:     }
1887:   }
1888:   VecRestoreArray(xx,&x);
1889:   VecRestoreArray(yy,&yarray);
1890:   if (zz != yy) {
1891:     VecRestoreArray(zz,&zarray);
1892:   }
1893:   PetscLogFlops(32.0*a->nz);
1894:   return(0);
1895: }

1899: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1900: {
1901:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1902:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1903:   PetscScalar    *yarray,*zarray;
1904:   MatScalar      *v;
1906:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1907:   PetscBool      usecprow=a->compressedrow.use;

1910:   VecGetArray(xx,&x);
1911:   VecGetArray(yy,&yarray);
1912:   if (zz != yy) {
1913:     VecGetArray(zz,&zarray);
1914:   } else {
1915:     zarray = yarray;
1916:   }

1918:   idx = a->j;
1919:   v   = a->a;
1920:   if (usecprow) {
1921:     if (zz != yy) {
1922:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1923:     }
1924:     mbs  = a->compressedrow.nrows;
1925:     ii   = a->compressedrow.i;
1926:     ridx = a->compressedrow.rindex;
1927:   } else {
1928:     ii = a->i;
1929:     y  = yarray;
1930:     z  = zarray;
1931:   }

1933:   for (i=0; i<mbs; i++) {
1934:     n = ii[1] - ii[0]; ii++;
1935:     if (usecprow) {
1936:       z = zarray + 5*ridx[i];
1937:       y = yarray + 5*ridx[i];
1938:     }
1939:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1940:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1941:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1942:     for (j=0; j<n; j++) {
1943:       xb    = x + 5*(*idx++);
1944:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1945:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1946:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1947:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1948:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1949:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1950:       v    += 25;
1951:     }
1952:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1953:     if (!usecprow) {
1954:       z += 5; y += 5;
1955:     }
1956:   }
1957:   VecRestoreArray(xx,&x);
1958:   VecRestoreArray(yy,&yarray);
1959:   if (zz != yy) {
1960:     VecRestoreArray(zz,&zarray);
1961:   }
1962:   PetscLogFlops(50.0*a->nz);
1963:   return(0);
1964: }
1967: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1968: {
1969:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1970:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1971:   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1972:   MatScalar      *v;
1974:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1975:   PetscBool      usecprow=a->compressedrow.use;

1978:   VecGetArray(xx,&x);
1979:   VecGetArray(yy,&yarray);
1980:   if (zz != yy) {
1981:     VecGetArray(zz,&zarray);
1982:   } else {
1983:     zarray = yarray;
1984:   }

1986:   idx = a->j;
1987:   v   = a->a;
1988:   if (usecprow) {
1989:     if (zz != yy) {
1990:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1991:     }
1992:     mbs  = a->compressedrow.nrows;
1993:     ii   = a->compressedrow.i;
1994:     ridx = a->compressedrow.rindex;
1995:   } else {
1996:     ii = a->i;
1997:     y  = yarray;
1998:     z  = zarray;
1999:   }

2001:   for (i=0; i<mbs; i++) {
2002:     n = ii[1] - ii[0]; ii++;
2003:     if (usecprow) {
2004:       z = zarray + 6*ridx[i];
2005:       y = yarray + 6*ridx[i];
2006:     }
2007:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2008:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2009:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2010:     for (j=0; j<n; j++) {
2011:       xb    = x + 6*(*idx++);
2012:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2013:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2014:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2015:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2016:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2017:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2018:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2019:       v    += 36;
2020:     }
2021:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2022:     if (!usecprow) {
2023:       z += 6; y += 6;
2024:     }
2025:   }
2026:   VecRestoreArray(xx,&x);
2027:   VecRestoreArray(yy,&yarray);
2028:   if (zz != yy) {
2029:     VecRestoreArray(zz,&zarray);
2030:   }
2031:   PetscLogFlops(72.0*a->nz);
2032:   return(0);
2033: }

2037: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2038: {
2039:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2040:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2041:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2042:   MatScalar      *v;
2044:   PetscInt       mbs     =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
2045:   PetscBool      usecprow=a->compressedrow.use;

2048:   VecGetArray(xx,&x);
2049:   VecGetArray(yy,&yarray);
2050:   if (zz != yy) {
2051:     VecGetArray(zz,&zarray);
2052:   } else {
2053:     zarray = yarray;
2054:   }

2056:   idx = a->j;
2057:   v   = a->a;
2058:   if (usecprow) {
2059:     if (zz != yy) {
2060:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
2061:     }
2062:     mbs  = a->compressedrow.nrows;
2063:     ii   = a->compressedrow.i;
2064:     ridx = a->compressedrow.rindex;
2065:   } else {
2066:     ii = a->i;
2067:     y  = yarray;
2068:     z  = zarray;
2069:   }

2071:   for (i=0; i<mbs; i++) {
2072:     n = ii[1] - ii[0]; ii++;
2073:     if (usecprow) {
2074:       z = zarray + 7*ridx[i];
2075:       y = yarray + 7*ridx[i];
2076:     }
2077:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2078:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2079:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2080:     for (j=0; j<n; j++) {
2081:       xb    = x + 7*(*idx++);
2082:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2083:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2084:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2085:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2086:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2087:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2088:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2089:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2090:       v    += 49;
2091:     }
2092:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2093:     if (!usecprow) {
2094:       z += 7; y += 7;
2095:     }
2096:   }
2097:   VecRestoreArray(xx,&x);
2098:   VecRestoreArray(yy,&yarray);
2099:   if (zz != yy) {
2100:     VecRestoreArray(zz,&zarray);
2101:   }
2102:   PetscLogFlops(98.0*a->nz);
2103:   return(0);
2104: }

2108: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2109: {
2110:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2111:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
2112:   MatScalar      *v;
2114:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
2115:   PetscInt       ncols,k,*ridx=NULL;
2116:   PetscBool      usecprow=a->compressedrow.use;

2119:   VecCopy(yy,zz);
2120:   VecGetArray(xx,&x);
2121:   VecGetArray(zz,&zarray);

2123:   idx = a->j;
2124:   v   = a->a;
2125:   if (usecprow) {
2126:     mbs  = a->compressedrow.nrows;
2127:     ii   = a->compressedrow.i;
2128:     ridx = a->compressedrow.rindex;
2129:   } else {
2130:     mbs = a->mbs;
2131:     ii  = a->i;
2132:     z   = zarray;
2133:   }

2135:   if (!a->mult_work) {
2136:     k    = PetscMax(A->rmap->n,A->cmap->n);
2137:     PetscMalloc1((k+1),&a->mult_work);
2138:   }
2139:   work = a->mult_work;
2140:   for (i=0; i<mbs; i++) {
2141:     n     = ii[1] - ii[0]; ii++;
2142:     ncols = n*bs;
2143:     workt = work;
2144:     for (j=0; j<n; j++) {
2145:       xb = x + bs*(*idx++);
2146:       for (k=0; k<bs; k++) workt[k] = xb[k];
2147:       workt += bs;
2148:     }
2149:     if (usecprow) z = zarray + bs*ridx[i];
2150:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2151:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
2152:     v += n*bs2;
2153:     if (!usecprow) z += bs;
2154:   }
2155:   VecRestoreArray(xx,&x);
2156:   VecRestoreArray(zz,&zarray);
2157:   PetscLogFlops(2.0*a->nz*bs2);
2158:   return(0);
2159: }

2163: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2164: {
2165:   PetscScalar    zero = 0.0;

2169:   VecSet(zz,zero);
2170:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2171:   return(0);
2172: }

2176: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2177: {
2178:   PetscScalar    zero = 0.0;

2182:   VecSet(zz,zero);
2183:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2184:   return(0);
2185: }

2189: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2190: {
2191:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2192:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2193:   MatScalar         *v;
2194:   PetscErrorCode    ierr;
2195:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2196:   Mat_CompressedRow cprow   = a->compressedrow;
2197:   PetscBool         usecprow=cprow.use;

2200:   if (yy != zz) { VecCopy(yy,zz); }
2201:   VecGetArray(xx,&x);
2202:   VecGetArray(zz,&z);

2204:   idx = a->j;
2205:   v   = a->a;
2206:   if (usecprow) {
2207:     mbs  = cprow.nrows;
2208:     ii   = cprow.i;
2209:     ridx = cprow.rindex;
2210:   } else {
2211:     mbs=a->mbs;
2212:     ii = a->i;
2213:     xb = x;
2214:   }

2216:   switch (bs) {
2217:   case 1:
2218:     for (i=0; i<mbs; i++) {
2219:       if (usecprow) xb = x + ridx[i];
2220:       x1 = xb[0];
2221:       ib = idx + ii[0];
2222:       n  = ii[1] - ii[0]; ii++;
2223:       for (j=0; j<n; j++) {
2224:         rval     = ib[j];
2225:         z[rval] += PetscConj(*v) * x1;
2226:         v++;
2227:       }
2228:       if (!usecprow) xb++;
2229:     }
2230:     break;
2231:   case 2:
2232:     for (i=0; i<mbs; i++) {
2233:       if (usecprow) xb = x + 2*ridx[i];
2234:       x1 = xb[0]; x2 = xb[1];
2235:       ib = idx + ii[0];
2236:       n  = ii[1] - ii[0]; ii++;
2237:       for (j=0; j<n; j++) {
2238:         rval       = ib[j]*2;
2239:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2240:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2241:         v         += 4;
2242:       }
2243:       if (!usecprow) xb += 2;
2244:     }
2245:     break;
2246:   case 3:
2247:     for (i=0; i<mbs; i++) {
2248:       if (usecprow) xb = x + 3*ridx[i];
2249:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2250:       ib = idx + ii[0];
2251:       n  = ii[1] - ii[0]; ii++;
2252:       for (j=0; j<n; j++) {
2253:         rval       = ib[j]*3;
2254:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2255:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2256:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2257:         v         += 9;
2258:       }
2259:       if (!usecprow) xb += 3;
2260:     }
2261:     break;
2262:   case 4:
2263:     for (i=0; i<mbs; i++) {
2264:       if (usecprow) xb = x + 4*ridx[i];
2265:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2266:       ib = idx + ii[0];
2267:       n  = ii[1] - ii[0]; ii++;
2268:       for (j=0; j<n; j++) {
2269:         rval       = ib[j]*4;
2270:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2271:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2272:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2273:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2274:         v         += 16;
2275:       }
2276:       if (!usecprow) xb += 4;
2277:     }
2278:     break;
2279:   case 5:
2280:     for (i=0; i<mbs; i++) {
2281:       if (usecprow) xb = x + 5*ridx[i];
2282:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2283:       x4 = xb[3]; x5 = xb[4];
2284:       ib = idx + ii[0];
2285:       n  = ii[1] - ii[0]; ii++;
2286:       for (j=0; j<n; j++) {
2287:         rval       = ib[j]*5;
2288:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2289:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2290:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2291:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2292:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2293:         v         += 25;
2294:       }
2295:       if (!usecprow) xb += 5;
2296:     }
2297:     break;
2298:   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
2299:     PetscInt    ncols,k;
2300:     PetscScalar *work,*workt,*xtmp;

2302:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2303:     if (!a->mult_work) {
2304:       k    = PetscMax(A->rmap->n,A->cmap->n);
2305:       PetscMalloc1((k+1),&a->mult_work);
2306:     }
2307:     work = a->mult_work;
2308:     xtmp = x;
2309:     for (i=0; i<mbs; i++) {
2310:       n     = ii[1] - ii[0]; ii++;
2311:       ncols = n*bs;
2312:       PetscMemzero(work,ncols*sizeof(PetscScalar));
2313:       if (usecprow) xtmp = x + bs*ridx[i];
2314:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2315:       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2316:       v += n*bs2;
2317:       if (!usecprow) xtmp += bs;
2318:       workt = work;
2319:       for (j=0; j<n; j++) {
2320:         zb = z + bs*(*idx++);
2321:         for (k=0; k<bs; k++) zb[k] += workt[k] ;
2322:         workt += bs;
2323:       }
2324:     }
2325:     }
2326:   }
2327:   VecRestoreArray(xx,&x);
2328:   VecRestoreArray(zz,&z);
2329:   PetscLogFlops(2.0*a->nz*a->bs2);
2330:   return(0);
2331: }

2335: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2336: {
2337:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2338:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2339:   MatScalar         *v;
2340:   PetscErrorCode    ierr;
2341:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2342:   Mat_CompressedRow cprow   = a->compressedrow;
2343:   PetscBool         usecprow=cprow.use;

2346:   if (yy != zz) { VecCopy(yy,zz); }
2347:   VecGetArray(xx,&x);
2348:   VecGetArray(zz,&z);

2350:   idx = a->j;
2351:   v   = a->a;
2352:   if (usecprow) {
2353:     mbs  = cprow.nrows;
2354:     ii   = cprow.i;
2355:     ridx = cprow.rindex;
2356:   } else {
2357:     mbs=a->mbs;
2358:     ii = a->i;
2359:     xb = x;
2360:   }

2362:   switch (bs) {
2363:   case 1:
2364:     for (i=0; i<mbs; i++) {
2365:       if (usecprow) xb = x + ridx[i];
2366:       x1 = xb[0];
2367:       ib = idx + ii[0];
2368:       n  = ii[1] - ii[0]; ii++;
2369:       for (j=0; j<n; j++) {
2370:         rval     = ib[j];
2371:         z[rval] += *v * x1;
2372:         v++;
2373:       }
2374:       if (!usecprow) xb++;
2375:     }
2376:     break;
2377:   case 2:
2378:     for (i=0; i<mbs; i++) {
2379:       if (usecprow) xb = x + 2*ridx[i];
2380:       x1 = xb[0]; x2 = xb[1];
2381:       ib = idx + ii[0];
2382:       n  = ii[1] - ii[0]; ii++;
2383:       for (j=0; j<n; j++) {
2384:         rval       = ib[j]*2;
2385:         z[rval++] += v[0]*x1 + v[1]*x2;
2386:         z[rval++] += v[2]*x1 + v[3]*x2;
2387:         v         += 4;
2388:       }
2389:       if (!usecprow) xb += 2;
2390:     }
2391:     break;
2392:   case 3:
2393:     for (i=0; i<mbs; i++) {
2394:       if (usecprow) xb = x + 3*ridx[i];
2395:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2396:       ib = idx + ii[0];
2397:       n  = ii[1] - ii[0]; ii++;
2398:       for (j=0; j<n; j++) {
2399:         rval       = ib[j]*3;
2400:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2401:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2402:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2403:         v         += 9;
2404:       }
2405:       if (!usecprow) xb += 3;
2406:     }
2407:     break;
2408:   case 4:
2409:     for (i=0; i<mbs; i++) {
2410:       if (usecprow) xb = x + 4*ridx[i];
2411:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2412:       ib = idx + ii[0];
2413:       n  = ii[1] - ii[0]; ii++;
2414:       for (j=0; j<n; j++) {
2415:         rval       = ib[j]*4;
2416:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2417:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2418:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2419:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2420:         v         += 16;
2421:       }
2422:       if (!usecprow) xb += 4;
2423:     }
2424:     break;
2425:   case 5:
2426:     for (i=0; i<mbs; i++) {
2427:       if (usecprow) xb = x + 5*ridx[i];
2428:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2429:       x4 = xb[3]; x5 = xb[4];
2430:       ib = idx + ii[0];
2431:       n  = ii[1] - ii[0]; ii++;
2432:       for (j=0; j<n; j++) {
2433:         rval       = ib[j]*5;
2434:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2435:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2436:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2437:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2438:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2439:         v         += 25;
2440:       }
2441:       if (!usecprow) xb += 5;
2442:     }
2443:     break;
2444:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2445:     PetscInt    ncols,k;
2446:     PetscScalar *work,*workt,*xtmp;

2448:     if (!a->mult_work) {
2449:       k    = PetscMax(A->rmap->n,A->cmap->n);
2450:       PetscMalloc1((k+1),&a->mult_work);
2451:     }
2452:     work = a->mult_work;
2453:     xtmp = x;
2454:     for (i=0; i<mbs; i++) {
2455:       n     = ii[1] - ii[0]; ii++;
2456:       ncols = n*bs;
2457:       PetscMemzero(work,ncols*sizeof(PetscScalar));
2458:       if (usecprow) xtmp = x + bs*ridx[i];
2459:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2460:       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2461:       v += n*bs2;
2462:       if (!usecprow) xtmp += bs;
2463:       workt = work;
2464:       for (j=0; j<n; j++) {
2465:         zb = z + bs*(*idx++);
2466:         for (k=0; k<bs; k++) zb[k] += workt[k];
2467:         workt += bs;
2468:       }
2469:     }
2470:     }
2471:   }
2472:   VecRestoreArray(xx,&x);
2473:   VecRestoreArray(zz,&z);
2474:   PetscLogFlops(2.0*a->nz*a->bs2);
2475:   return(0);
2476: }

2480: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2481: {
2482:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2483:   PetscInt       totalnz = a->bs2*a->nz;
2484:   PetscScalar    oalpha  = alpha;
2486:   PetscBLASInt   one = 1,tnz;

2489:   PetscBLASIntCast(totalnz,&tnz);
2490:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2491:   PetscLogFlops(totalnz);
2492:   return(0);
2493: }

2497: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2498: {
2500:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2501:   MatScalar      *v  = a->a;
2502:   PetscReal      sum = 0.0;
2503:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

2506:   if (type == NORM_FROBENIUS) {
2507:     for (i=0; i< bs2*nz; i++) {
2508:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2509:     }
2510:     *norm = PetscSqrtReal(sum);
2511:   } else if (type == NORM_1) { /* maximum column sum */
2512:     PetscReal *tmp;
2513:     PetscInt  *bcol = a->j;
2514:     PetscCalloc1((A->cmap->n+1),&tmp);
2515:     for (i=0; i<nz; i++) {
2516:       for (j=0; j<bs; j++) {
2517:         k1 = bs*(*bcol) + j; /* column index */
2518:         for (k=0; k<bs; k++) {
2519:           tmp[k1] += PetscAbsScalar(*v); v++;
2520:         }
2521:       }
2522:       bcol++;
2523:     }
2524:     *norm = 0.0;
2525:     for (j=0; j<A->cmap->n; j++) {
2526:       if (tmp[j] > *norm) *norm = tmp[j];
2527:     }
2528:     PetscFree(tmp);
2529:   } else if (type == NORM_INFINITY) { /* maximum row sum */
2530:     *norm = 0.0;
2531:     for (k=0; k<bs; k++) {
2532:       for (j=0; j<a->mbs; j++) {
2533:         v   = a->a + bs2*a->i[j] + k;
2534:         sum = 0.0;
2535:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2536:           for (k1=0; k1<bs; k1++) {
2537:             sum += PetscAbsScalar(*v);
2538:             v   += bs;
2539:           }
2540:         }
2541:         if (sum > *norm) *norm = sum;
2542:       }
2543:     }
2544:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2545:   return(0);
2546: }


2551: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2552: {
2553:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

2557:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2558:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2559:     *flg = PETSC_FALSE;
2560:     return(0);
2561:   }

2563:   /* if the a->i are the same */
2564:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
2565:   if (!*flg) return(0);

2567:   /* if a->j are the same */
2568:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
2569:   if (!*flg) return(0);

2571:   /* if a->a are the same */
2572:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
2573:   return(0);

2575: }

2579: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2580: {
2581:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2583:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2584:   PetscScalar    *x,zero = 0.0;
2585:   MatScalar      *aa,*aa_j;

2588:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2589:   bs   = A->rmap->bs;
2590:   aa   = a->a;
2591:   ai   = a->i;
2592:   aj   = a->j;
2593:   ambs = a->mbs;
2594:   bs2  = a->bs2;

2596:   VecSet(v,zero);
2597:   VecGetArray(v,&x);
2598:   VecGetLocalSize(v,&n);
2599:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2600:   for (i=0; i<ambs; i++) {
2601:     for (j=ai[i]; j<ai[i+1]; j++) {
2602:       if (aj[j] == i) {
2603:         row  = i*bs;
2604:         aa_j = aa+j*bs2;
2605:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2606:         break;
2607:       }
2608:     }
2609:   }
2610:   VecRestoreArray(v,&x);
2611:   return(0);
2612: }

2616: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2617: {
2618:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2619:   const PetscScalar *l,*r,*li,*ri;
2620:   PetscScalar       x;
2621:   MatScalar         *aa, *v;
2622:   PetscErrorCode    ierr;
2623:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2624:   const PetscInt    *ai,*aj;

2627:   ai  = a->i;
2628:   aj  = a->j;
2629:   aa  = a->a;
2630:   m   = A->rmap->n;
2631:   n   = A->cmap->n;
2632:   bs  = A->rmap->bs;
2633:   mbs = a->mbs;
2634:   bs2 = a->bs2;
2635:   if (ll) {
2636:     VecGetArrayRead(ll,&l);
2637:     VecGetLocalSize(ll,&lm);
2638:     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2639:     for (i=0; i<mbs; i++) { /* for each block row */
2640:       M  = ai[i+1] - ai[i];
2641:       li = l + i*bs;
2642:       v  = aa + bs2*ai[i];
2643:       for (j=0; j<M; j++) { /* for each block */
2644:         for (k=0; k<bs2; k++) {
2645:           (*v++) *= li[k%bs];
2646:         }
2647:       }
2648:     }
2649:     VecRestoreArrayRead(ll,&l);
2650:     PetscLogFlops(a->nz);
2651:   }

2653:   if (rr) {
2654:     VecGetArrayRead(rr,&r);
2655:     VecGetLocalSize(rr,&rn);
2656:     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2657:     for (i=0; i<mbs; i++) { /* for each block row */
2658:       iai = ai[i];
2659:       M   = ai[i+1] - iai;
2660:       v   = aa + bs2*iai;
2661:       for (j=0; j<M; j++) { /* for each block */
2662:         ri = r + bs*aj[iai+j];
2663:         for (k=0; k<bs; k++) {
2664:           x = ri[k];
2665:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2666:           v += bs;
2667:         }
2668:       }
2669:     }
2670:     VecRestoreArrayRead(rr,&r);
2671:     PetscLogFlops(a->nz);
2672:   }
2673:   return(0);
2674: }


2679: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2680: {
2681:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2684:   info->block_size   = a->bs2;
2685:   info->nz_allocated = a->bs2*a->maxnz;
2686:   info->nz_used      = a->bs2*a->nz;
2687:   info->nz_unneeded  = (double)(info->nz_allocated - info->nz_used);
2688:   info->assemblies   = A->num_ass;
2689:   info->mallocs      = A->info.mallocs;
2690:   info->memory       = ((PetscObject)A)->mem;
2691:   if (A->factortype) {
2692:     info->fill_ratio_given  = A->info.fill_ratio_given;
2693:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2694:     info->factor_mallocs    = A->info.factor_mallocs;
2695:   } else {
2696:     info->fill_ratio_given  = 0;
2697:     info->fill_ratio_needed = 0;
2698:     info->factor_mallocs    = 0;
2699:   }
2700:   return(0);
2701: }


2704: #if defined(PETSC_THREADCOMM_ACTIVE)
2705: PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
2706: {
2708:   PetscInt       *trstarts=A->rmap->trstarts;
2709:   PetscInt       n,start,end;
2710:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;

2712:   start = trstarts[thread_id];
2713:   end   = trstarts[thread_id+1];
2714:   n     = a->i[end] - a->i[start];
2715:   PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(PetscScalar));
2716:   return 0;
2717: }

2721: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2722: {

2725: 
2727:   PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);
2728:   return(0);
2729: }
2730: #else
2733: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2734: {
2735:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2739:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2740:   return(0);
2741: }
2742: #endif