Actual source code: baij2.c

  1: /*$Id: baij2.c,v 1.72 2001/03/23 23:22:07 balay Exp $*/

 3:  #include petscsys.h
 4:  #include src/mat/impls/baij/seq/baij.h
 5:  #include src/vec/vecimpl.h
 6:  #include src/inline/spops.h
 7:  #include src/inline/ilu.h
 8:  #include petscbt.h

 10: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
 11: {
 12:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
 13:   int         row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
 14:   int         start,end,*ai,*aj,bs,*nidx2;
 15:   PetscBT     table;

 18:   m     = a->mbs;
 19:   ai    = a->i;
 20:   aj    = a->j;
 21:   bs    = a->bs;

 23:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 25:   PetscBTCreate(m,table);
 26:   PetscMalloc((m+1)*sizeof(int),&nidx);
 27:   PetscMalloc((A->m+1)*sizeof(int),&nidx2);

 29:   for (i=0; i<is_max; i++) {
 30:     /* Initialise the two local arrays */
 31:     isz  = 0;
 32:     PetscBTMemzero(m,table);
 33: 
 34:     /* Extract the indices, assume there can be duplicate entries */
 35:     ISGetIndices(is[i],&idx);
 36:     ISGetLocalSize(is[i],&n);

 38:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 39:     for (j=0; j<n ; ++j){
 40:       ival = idx[j]/bs; /* convert the indices into block indices */
 41:       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 42:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 43:     }
 44:     ISRestoreIndices(is[i],&idx);
 45:     ISDestroy(is[i]);
 46: 
 47:     k = 0;
 48:     for (j=0; j<ov; j++){ /* for each overlap*/
 49:       n = isz;
 50:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 51:         row   = nidx[k];
 52:         start = ai[row];
 53:         end   = ai[row+1];
 54:         for (l = start; l<end ; l++){
 55:           val = aj[l];
 56:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 57:         }
 58:       }
 59:     }
 60:     /* expand the Index Set */
 61:     for (j=0; j<isz; j++) {
 62:       for (k=0; k<bs; k++)
 63:         nidx2[j*bs+k] = nidx[j]*bs+k;
 64:     }
 65:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 66:   }
 67:   PetscBTDestroy(table);
 68:   PetscFree(nidx);
 69:   PetscFree(nidx2);
 70:   return(0);
 71: }

 73: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
 74: {
 75:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data,*c;
 76:   int          *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
 77:   int          row,mat_i,*mat_j,tcol,*mat_ilen;
 78:   int          *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
 79:   int          *aj = a->j,*ai = a->i;
 80:   MatScalar    *mat_a;
 81:   Mat          C;
 82:   PetscTruth   flag;

 85:   ISSorted(iscol,(PetscTruth*)&i);
 86:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 88:   ISGetIndices(isrow,&irow);
 89:   ISGetIndices(iscol,&icol);
 90:   ISGetLocalSize(isrow,&nrows);
 91:   ISGetLocalSize(iscol,&ncols);

 93:   PetscMalloc((1+oldcols)*sizeof(int),&smap);
 94:   ssmap = smap;
 95:   PetscMalloc((1+nrows)*sizeof(int),&lens);
 96:   ierr  = PetscMemzero(smap,oldcols*sizeof(int));
 97:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 98:   /* determine lens of each row */
 99:   for (i=0; i<nrows; i++) {
100:     kstart  = ai[irow[i]];
101:     kend    = kstart + a->ilen[irow[i]];
102:     lens[i] = 0;
103:       for (k=kstart; k<kend; k++) {
104:         if (ssmap[aj[k]]) {
105:           lens[i]++;
106:         }
107:       }
108:     }
109:   /* Create and fill new matrix */
110:   if (scall == MAT_REUSE_MATRIX) {
111:     c = (Mat_SeqBAIJ *)((*B)->data);

113:     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
114:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
115:     if (flag == PETSC_FALSE) {
116:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
117:     }
118:     PetscMemzero(c->ilen,c->mbs*sizeof(int));
119:     C = *B;
120:   } else {
121:     MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);
122:   }
123:   c = (Mat_SeqBAIJ *)(C->data);
124:   for (i=0; i<nrows; i++) {
125:     row    = irow[i];
126:     kstart = ai[row];
127:     kend   = kstart + a->ilen[row];
128:     mat_i  = c->i[i];
129:     mat_j  = c->j + mat_i;
130:     mat_a  = c->a + mat_i*bs2;
131:     mat_ilen = c->ilen + i;
132:     for (k=kstart; k<kend; k++) {
133:       if ((tcol=ssmap[a->j[k]])) {
134:         *mat_j++ = tcol - 1;
135:         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
136:         mat_a   += bs2;
137:         (*mat_ilen)++;
138:       }
139:     }
140:   }
141: 
142:   /* Free work space */
143:   ISRestoreIndices(iscol,&icol);
144:   PetscFree(smap);
145:   PetscFree(lens);
146:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
147:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
148: 
149:   ISRestoreIndices(isrow,&irow);
150:   *B = C;
151:   return(0);
152: }

154: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
155: {
156:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
157:   IS          is1,is2;
158:   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;

161:   ISGetIndices(isrow,&irow);
162:   ISGetIndices(iscol,&icol);
163:   ISGetLocalSize(isrow,&nrows);
164:   ISGetLocalSize(iscol,&ncols);
165: 
166:   /* Verify if the indices corespond to each element in a block 
167:    and form the IS with compressed IS */
168:   PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
169:   iary = vary + a->mbs;
170:   PetscMemzero(vary,(a->mbs)*sizeof(int));
171:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
172:   count = 0;
173:   for (i=0; i<a->mbs; i++) {
174:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
175:     if (vary[i]==bs) iary[count++] = i;
176:   }
177:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
178: 
179:   PetscMemzero(vary,(a->mbs)*sizeof(int));
180:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
181:   count = 0;
182:   for (i=0; i<a->mbs; i++) {
183:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
184:     if (vary[i]==bs) iary[count++] = i;
185:   }
186:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
187:   ISRestoreIndices(isrow,&irow);
188:   ISRestoreIndices(iscol,&icol);
189:   PetscFree(vary);

191:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
192:   ISDestroy(is1);
193:   ISDestroy(is2);
194:   return(0);
195: }

197: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
198: {
199:   int ierr,i;

202:   if (scall == MAT_INITIAL_MATRIX) {
203:     PetscMalloc((n+1)*sizeof(Mat),B);
204:   }

206:   for (i=0; i<n; i++) {
207:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
208:   }
209:   return(0);
210: }


213: /* -------------------------------------------------------*/
214: /* Should check that shapes of vectors and matrices match */
215: /* -------------------------------------------------------*/
216:  #include petscblaslapack.h

218: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
219: {
220:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
221:   Scalar          *x,*z,sum;
222:   MatScalar       *v;
223:   int             mbs=a->mbs,i,*idx,*ii,n,ierr;

226:   VecGetArray(xx,&x);
227:   VecGetArray(zz,&z);

229:   idx   = a->j;
230:   v     = a->a;
231:   ii    = a->i;

233:   for (i=0; i<mbs; i++) {
234:     n    = ii[1] - ii[0]; ii++;
235:     sum  = 0.0;
236:     while (n--) sum += *v++ * x[*idx++];
237:     z[i] = sum;
238:   }
239:   VecRestoreArray(xx,&x);
240:   VecRestoreArray(zz,&z);
241:   PetscLogFlops(2*a->nz - A->m);
242:   return(0);
243: }

245: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
246: {
247:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
248:   Scalar          *x,*z,*xb,sum1,sum2;
249:   Scalar          x1,x2;
250:   MatScalar       *v;
251:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

254:   VecGetArray(xx,&x);
255:   VecGetArray(zz,&z);

257:   idx   = a->j;
258:   v     = a->a;
259:   ii    = a->i;

261:   for (i=0; i<mbs; i++) {
262:     n  = ii[1] - ii[0]; ii++;
263:     sum1 = 0.0; sum2 = 0.0;
264:     for (j=0; j<n; j++) {
265:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
266:       sum1 += v[0]*x1 + v[2]*x2;
267:       sum2 += v[1]*x1 + v[3]*x2;
268:       v += 4;
269:     }
270:     z[0] = sum1; z[1] = sum2;
271:     z += 2;
272:   }
273:   VecRestoreArray(xx,&x);
274:   VecRestoreArray(zz,&z);
275:   PetscLogFlops(8*a->nz - A->m);
276:   return(0);
277: }

279: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
280: {
281:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
282:   Scalar       *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
283:   MatScalar    *v;
284:   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;

286: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
287: #pragma disjoint(*v,*z,*xb)
288: #endif

291:   VecGetArray(xx,&x);
292:   VecGetArray(zz,&z);

294:   idx   = a->j;
295:   v     = a->a;
296:   ii    = a->i;

298:   for (i=0; i<mbs; i++) {
299:     n  = ii[1] - ii[0]; ii++;
300:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
301:     for (j=0; j<n; j++) {
302:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
303:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
304:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
305:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
306:       v += 9;
307:     }
308:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
309:     z += 3;
310:   }
311:   VecRestoreArray(xx,&x);
312:   VecRestoreArray(zz,&z);
313:   PetscLogFlops(18*a->nz - A->m);
314:   return(0);
315: }

317: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
318: {
319:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
320:   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
321:   MatScalar       *v;
322:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

325:   VecGetArray(xx,&x);
326:   VecGetArray(zz,&z);

328:   idx   = a->j;
329:   v     = a->a;
330:   ii    = a->i;

332:   for (i=0; i<mbs; i++) {
333:     n  = ii[1] - ii[0]; ii++;
334:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
335:     for (j=0; j<n; j++) {
336:       xb = x + 4*(*idx++);
337:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
338:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
339:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
340:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
341:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
342:       v += 16;
343:     }
344:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
345:     z += 4;
346:   }
347:   VecRestoreArray(xx,&x);
348:   VecRestoreArray(zz,&z);
349:   PetscLogFlops(32*a->nz - A->m);
350:   return(0);
351: }

353: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
354: {
355:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
356:   Scalar          sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
357:   MatScalar       *v;
358:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

361:   VecGetArray(xx,&x);
362:   VecGetArray(zz,&z);

364:   idx   = a->j;
365:   v     = a->a;
366:   ii    = a->i;

368:   for (i=0; i<mbs; i++) {
369:     n  = ii[1] - ii[0]; ii++;
370:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
371:     for (j=0; j<n; j++) {
372:       xb = x + 5*(*idx++);
373:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
374:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
375:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
376:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
377:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
378:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
379:       v += 25;
380:     }
381:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
382:     z += 5;
383:   }
384:   VecRestoreArray(xx,&x);
385:   VecRestoreArray(zz,&z);
386:   PetscLogFlops(50*a->nz - A->m);
387:   return(0);
388: }


391: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
392: {
393:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
394:   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
395:   Scalar          x1,x2,x3,x4,x5,x6;
396:   MatScalar       *v;
397:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

400:   VecGetArray(xx,&x);
401:   VecGetArray(zz,&z);

403:   idx   = a->j;
404:   v     = a->a;
405:   ii    = a->i;

407:   for (i=0; i<mbs; i++) {
408:     n  = ii[1] - ii[0]; ii++;
409:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
410:     for (j=0; j<n; j++) {
411:       xb = x + 6*(*idx++);
412:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
413:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
414:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
415:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
416:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
417:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
418:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
419:       v += 36;
420:     }
421:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
422:     z += 6;
423:   }

425:   VecRestoreArray(xx,&x);
426:   VecRestoreArray(zz,&z);
427:   PetscLogFlops(72*a->nz - A->m);
428:   return(0);
429: }
430: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
431: {
432:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
433:   Scalar          *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
434:   Scalar          x1,x2,x3,x4,x5,x6,x7;
435:   MatScalar       *v;
436:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

439:   VecGetArray(xx,&x);
440:   VecGetArray(zz,&z);

442:   idx   = a->j;
443:   v     = a->a;
444:   ii    = a->i;

446:   for (i=0; i<mbs; i++) {
447:     n  = ii[1] - ii[0]; ii++;
448:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
449:     for (j=0; j<n; j++) {
450:       xb = x + 7*(*idx++);
451:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
452:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
453:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
454:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
455:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
456:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
457:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
458:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
459:       v += 49;
460:     }
461:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
462:     z += 7;
463:   }

465:   VecRestoreArray(xx,&x);
466:   VecRestoreArray(zz,&z);
467:   PetscLogFlops(98*a->nz - A->m);
468:   return(0);
469: }

471: /*
472:     This will not work with MatScalar == float because it calls the BLAS
473: */
474: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
475: {
476:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
477:   Scalar          *x,*z,*xb,*work,*workt;
478:   MatScalar       *v;
479:   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
480:   int             ncols,k;

483:   VecGetArray(xx,&x);
484:   VecGetArray(zz,&z);

486:   idx   = a->j;
487:   v     = a->a;
488:   ii    = a->i;


491:   if (!a->mult_work) {
492:     k    = PetscMax(A->m,A->n);
493:     PetscMalloc((k+1)*sizeof(Scalar),&a->mult_work);
494:   }
495:   work = a->mult_work;
496:   for (i=0; i<mbs; i++) {
497:     n     = ii[1] - ii[0]; ii++;
498:     ncols = n*bs;
499:     workt = work;
500:     for (j=0; j<n; j++) {
501:       xb = x + bs*(*idx++);
502:       for (k=0; k<bs; k++) workt[k] = xb[k];
503:       workt += bs;
504:     }
505:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
506:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
507:     v += n*bs2;
508:     z += bs;
509:   }
510:   VecRestoreArray(xx,&x);
511:   VecRestoreArray(zz,&z);
512:   PetscLogFlops(2*a->nz*bs2 - A->m);
513:   return(0);
514: }

516: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
517: {
518:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
519:   Scalar          *x,*y,*z,sum;
520:   MatScalar       *v;
521:   int             ierr,mbs=a->mbs,i,*idx,*ii,n;

524:   VecGetArray(xx,&x);
525:   VecGetArray(yy,&y);
526:   if (zz != yy) {
527:     VecGetArray(zz,&z);
528:   } else {
529:     z = y;
530:   }

532:   idx   = a->j;
533:   v     = a->a;
534:   ii    = a->i;

536:   for (i=0; i<mbs; i++) {
537:     n    = ii[1] - ii[0]; ii++;
538:     sum  = y[i];
539:     while (n--) sum += *v++ * x[*idx++];
540:     z[i] = sum;
541:   }
542:   VecRestoreArray(xx,&x);
543:   VecRestoreArray(yy,&y);
544:   if (zz != yy) {
545:     VecRestoreArray(zz,&z);
546:   }
547:   PetscLogFlops(2*a->nz);
548:   return(0);
549: }

551: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
552: {
553:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
554:   Scalar          *x,*y,*z,*xb,sum1,sum2;
555:   Scalar          x1,x2;
556:   MatScalar       *v;
557:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

560:   VecGetArray(xx,&x);
561:   VecGetArray(yy,&y);
562:   if (zz != yy) {
563:     VecGetArray(zz,&z);
564:   } else {
565:     z = y;
566:   }

568:   idx   = a->j;
569:   v     = a->a;
570:   ii    = a->i;

572:   for (i=0; i<mbs; i++) {
573:     n  = ii[1] - ii[0]; ii++;
574:     sum1 = y[0]; sum2 = y[1];
575:     for (j=0; j<n; j++) {
576:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
577:       sum1 += v[0]*x1 + v[2]*x2;
578:       sum2 += v[1]*x1 + v[3]*x2;
579:       v += 4;
580:     }
581:     z[0] = sum1; z[1] = sum2;
582:     z += 2; y += 2;
583:   }
584:   VecRestoreArray(xx,&x);
585:   VecRestoreArray(yy,&y);
586:   if (zz != yy) {
587:     VecRestoreArray(zz,&z);
588:   }
589:   PetscLogFlops(4*a->nz);
590:   return(0);
591: }

593: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
594: {
595:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
596:   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
597:   MatScalar       *v;
598:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

601:   VecGetArray(xx,&x);
602:   VecGetArray(yy,&y);
603:   if (zz != yy) {
604:     VecGetArray(zz,&z);
605:   } else {
606:     z = y;
607:   }

609:   idx   = a->j;
610:   v     = a->a;
611:   ii    = a->i;

613:   for (i=0; i<mbs; i++) {
614:     n  = ii[1] - ii[0]; ii++;
615:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
616:     for (j=0; j<n; j++) {
617:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
618:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
619:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
620:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
621:       v += 9;
622:     }
623:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
624:     z += 3; y += 3;
625:   }
626:   VecRestoreArray(xx,&x);
627:   VecRestoreArray(yy,&y);
628:   if (zz != yy) {
629:     VecRestoreArray(zz,&z);
630:   }
631:   PetscLogFlops(18*a->nz);
632:   return(0);
633: }

635: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
636: {
637:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
638:   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
639:   MatScalar       *v;
640:   int             ierr,mbs=a->mbs,i,*idx,*ii;
641:   int             j,n;

644:   VecGetArray(xx,&x);
645:   VecGetArray(yy,&y);
646:   if (zz != yy) {
647:     VecGetArray(zz,&z);
648:   } else {
649:     z = y;
650:   }

652:   idx   = a->j;
653:   v     = a->a;
654:   ii    = a->i;

656:   for (i=0; i<mbs; i++) {
657:     n  = ii[1] - ii[0]; ii++;
658:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
659:     for (j=0; j<n; j++) {
660:       xb = x + 4*(*idx++);
661:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
662:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
663:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
664:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
665:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
666:       v += 16;
667:     }
668:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
669:     z += 4; y += 4;
670:   }
671:   VecRestoreArray(xx,&x);
672:   VecRestoreArray(yy,&y);
673:   if (zz != yy) {
674:     VecRestoreArray(zz,&z);
675:   }
676:   PetscLogFlops(32*a->nz);
677:   return(0);
678: }

680: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
681: {
682:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
683:   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
684:   MatScalar       *v;
685:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

688:   VecGetArray(xx,&x);
689:   VecGetArray(yy,&y);
690:   if (zz != yy) {
691:     VecGetArray(zz,&z);
692:   } else {
693:     z = y;
694:   }

696:   idx   = a->j;
697:   v     = a->a;
698:   ii    = a->i;

700:   for (i=0; i<mbs; i++) {
701:     n  = ii[1] - ii[0]; ii++;
702:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
703:     for (j=0; j<n; j++) {
704:       xb = x + 5*(*idx++);
705:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
706:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
707:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
708:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
709:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
710:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
711:       v += 25;
712:     }
713:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
714:     z += 5; y += 5;
715:   }
716:   VecRestoreArray(xx,&x);
717:   VecRestoreArray(yy,&y);
718:   if (zz != yy) {
719:     VecRestoreArray(zz,&z);
720:   }
721:   PetscLogFlops(50*a->nz);
722:   return(0);
723: }
724: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
725: {
726:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
727:   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
728:   Scalar          x1,x2,x3,x4,x5,x6;
729:   MatScalar       *v;
730:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

733:   VecGetArray(xx,&x);
734:   VecGetArray(yy,&y);
735:   if (zz != yy) {
736:     VecGetArray(zz,&z);
737:   } else {
738:     z = y;
739:   }

741:   idx   = a->j;
742:   v     = a->a;
743:   ii    = a->i;

745:   for (i=0; i<mbs; i++) {
746:     n  = ii[1] - ii[0]; ii++;
747:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
748:     for (j=0; j<n; j++) {
749:       xb = x + 6*(*idx++);
750:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
751:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
752:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
753:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
754:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
755:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
756:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
757:       v += 36;
758:     }
759:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
760:     z += 6; y += 6;
761:   }
762:   VecRestoreArray(xx,&x);
763:   VecRestoreArray(yy,&y);
764:   if (zz != yy) {
765:     VecRestoreArray(zz,&z);
766:   }
767:   PetscLogFlops(72*a->nz);
768:   return(0);
769: }

771: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
772: {
773:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
774:   Scalar          *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
775:   Scalar          x1,x2,x3,x4,x5,x6,x7;
776:   MatScalar       *v;
777:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

780:   VecGetArray(xx,&x);
781:   VecGetArray(yy,&y);
782:   if (zz != yy) {
783:     VecGetArray(zz,&z);
784:   } else {
785:     z = y;
786:   }

788:   idx   = a->j;
789:   v     = a->a;
790:   ii    = a->i;

792:   for (i=0; i<mbs; i++) {
793:     n  = ii[1] - ii[0]; ii++;
794:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
795:     for (j=0; j<n; j++) {
796:       xb = x + 7*(*idx++);
797:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
798:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
799:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
800:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
801:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
802:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
803:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
804:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
805:       v += 49;
806:     }
807:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
808:     z += 7; y += 7;
809:   }
810:   VecRestoreArray(xx,&x);
811:   VecRestoreArray(yy,&y);
812:   if (zz != yy) {
813:     VecRestoreArray(zz,&z);
814:   }
815:   PetscLogFlops(98*a->nz);
816:   return(0);
817: }

819: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
820: {
821:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
822:   Scalar         *x,*z,*xb,*work,*workt;
823:   MatScalar      *v;
824:   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
825:   int            ncols,k;

828:   if (xx != yy) { VecCopy(yy,zz); }

830:   VecGetArray(xx,&x);
831:   VecGetArray(zz,&z);
832: 
833:   idx   = a->j;
834:   v     = a->a;
835:   ii    = a->i;


838:   if (!a->mult_work) {
839:     k    = PetscMax(A->m,A->n);
840:     PetscMalloc((k+1)*sizeof(Scalar),&a->mult_work);
841:   }
842:   work = a->mult_work;
843:   for (i=0; i<mbs; i++) {
844:     n     = ii[1] - ii[0]; ii++;
845:     ncols = n*bs;
846:     workt = work;
847:     for (j=0; j<n; j++) {
848:       xb = x + bs*(*idx++);
849:       for (k=0; k<bs; k++) workt[k] = xb[k];
850:       workt += bs;
851:     }
852:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
853:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
854:     v += n*bs2;
855:     z += bs;
856:   }
857:   VecRestoreArray(xx,&x);
858:   VecRestoreArray(zz,&z);
859:   PetscLogFlops(2*a->nz*bs2);
860:   return(0);
861: }

863: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
864: {
865:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
866:   Scalar          *xg,*zg,*zb,zero = 0.0;
867:   Scalar          *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
868:   MatScalar       *v;
869:   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
870:   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;

873:   VecSet(&zero,zz);
874:   VecGetArray(xx,&xg); x = xg;
875:   VecGetArray(zz,&zg); z = zg;

877:   idx   = a->j;
878:   v     = a->a;
879:   ii    = a->i;
880:   xb    = x;
881:   switch (bs) {
882:   case 1:
883:     for (i=0; i<mbs; i++) {
884:       n  = ii[1] - ii[0]; ii++;
885:       x1 = xb[0];
886:       ib = idx + ai[i];
887:       for (j=0; j<n; j++) {
888:         rval    = ib[j];
889:         z[rval] += *v * x1;
890:         v++;
891:       }
892:       xb++;
893:     }
894:     break;
895:   case 2:
896:     for (i=0; i<mbs; i++) {
897:       n  = ii[1] - ii[0]; ii++;
898:       x1 = xb[0]; x2 = xb[1];
899:       ib = idx + ai[i];
900:       for (j=0; j<n; j++) {
901:         rval      = ib[j]*2;
902:         z[rval++] += v[0]*x1 + v[1]*x2;
903:         z[rval]   += v[2]*x1 + v[3]*x2;
904:         v  += 4;
905:       }
906:       xb += 2;
907:     }
908:     break;
909:   case 3:
910:     for (i=0; i<mbs; i++) {
911:       n  = ii[1] - ii[0]; ii++;
912:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
913:       ib = idx + ai[i];
914:       for (j=0; j<n; j++) {
915:         rval      = ib[j]*3;
916:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
917:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
918:         z[rval]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
919:         v  += 9;
920:       }
921:       xb += 3;
922:     }
923:     break;
924:   case 4:
925:     for (i=0; i<mbs; i++) {
926:       n  = ii[1] - ii[0]; ii++;
927:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
928:       ib = idx + ai[i];
929:       for (j=0; j<n; j++) {
930:         rval      = ib[j]*4;
931:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
932:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
933:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
934:         z[rval]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
935:         v  += 16;
936:       }
937:       xb += 4;
938:     }
939:     break;
940:   case 5:
941:     for (i=0; i<mbs; i++) {
942:       n  = ii[1] - ii[0]; ii++;
943:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
944:       x4 = xb[3]; x5 = xb[4];
945:       ib = idx + ai[i];
946:       for (j=0; j<n; j++) {
947:         rval      = ib[j]*5;
948:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
949:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
950:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
951:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
952:         z[rval]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
953:         v  += 25;
954:       }
955:       xb += 5;
956:     }
957:     break;
958:   case 6:
959:     for (i=0; i<mbs; i++) {
960:       n  = ii[1] - ii[0]; ii++;
961:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
962:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
963:       ib = idx + ai[i];
964:       for (j=0; j<n; j++) {
965:         rval      = ib[j]*6;
966:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6;
967:         z[rval++] +=  v[6]*x1 +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
968:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
969:         z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
970:         z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
971:         z[rval]   += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
972:         v  += 36;
973:       }
974:       xb += 6;
975:     }
976:     break;
977:   case 7:
978:     for (i=0; i<mbs; i++) {
979:       n  = ii[1] - ii[0]; ii++;
980:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
981:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
982:       ib = idx + ai[i];
983:       for (j=0; j<n; j++) {
984:         rval      = ib[j]*7;
985:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
986:         z[rval++] +=  v[7]*x1 +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
987:         z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
988:         z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
989:         z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
990:         z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
991:         z[rval]   += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
992:         v  += 49;
993:       }
994:       xb += 7;
995:     }
996:     break;
997:   default: {       /* block sizes larger then 7 by 7 are handled by BLAS */
998:       int       ncols,k;
999:       Scalar    *work,*workt;

1001:       if (!a->mult_work) {
1002:         k = PetscMax(A->m,A->n);
1003:         PetscMalloc((k+1)*sizeof(Scalar),&a->mult_work);
1004:       }
1005:       work = a->mult_work;
1006:       for (i=0; i<mbs; i++) {
1007:         n     = ii[1] - ii[0]; ii++;
1008:         ncols = n*bs;
1009:         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));
1010:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1011:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1012:         v += n*bs2;
1013:         x += bs;
1014:         workt = work;
1015:         for (j=0; j<n; j++) {
1016:           zb = z + bs*(*idx++);
1017:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1018:           workt += bs;
1019:         }
1020:       }
1021:     }
1022:   }
1023:   VecRestoreArray(xx,&xg);
1024:   VecRestoreArray(zz,&zg);
1025:   PetscLogFlops(2*a->nz*a->bs2 - A->n);
1026:   return(0);
1027: }

1029: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1031: {
1032:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1033:   Scalar          *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1034:   MatScalar       *v;
1035:   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;

1038:   VecGetArray(xx,&xg); x = xg;
1039:   VecGetArray(zz,&zg); z = zg;

1041:   if (yy != zz) { VecCopy(yy,zz); }

1043:   idx   = a->j;
1044:   v     = a->a;
1045:   ii    = a->i;
1046:   xb    = x;

1048:   switch (bs) {
1049:   case 1:
1050:     for (i=0; i<mbs; i++) {
1051:       n  = ii[1] - ii[0]; ii++;
1052:       x1 = xb[0];
1053:       ib = idx + ai[i];
1054:       for (j=0; j<n; j++) {
1055:         rval    = ib[j];
1056:         z[rval] += *v * x1;
1057:         v++;
1058:       }
1059:       xb++;
1060:     }
1061:     break;
1062:   case 2:
1063:     for (i=0; i<mbs; i++) {
1064:       n  = ii[1] - ii[0]; ii++;
1065:       x1 = xb[0]; x2 = xb[1];
1066:       ib = idx + ai[i];
1067:       for (j=0; j<n; j++) {
1068:         rval      = ib[j]*2;
1069:         z[rval++] += v[0]*x1 + v[1]*x2;
1070:         z[rval++] += v[2]*x1 + v[3]*x2;
1071:         v  += 4;
1072:       }
1073:       xb += 2;
1074:     }
1075:     break;
1076:   case 3:
1077:     for (i=0; i<mbs; i++) {
1078:       n  = ii[1] - ii[0]; ii++;
1079:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1080:       ib = idx + ai[i];
1081:       for (j=0; j<n; j++) {
1082:         rval      = ib[j]*3;
1083:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1084:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1085:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1086:         v  += 9;
1087:       }
1088:       xb += 3;
1089:     }
1090:     break;
1091:   case 4:
1092:     for (i=0; i<mbs; i++) {
1093:       n  = ii[1] - ii[0]; ii++;
1094:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1095:       ib = idx + ai[i];
1096:       for (j=0; j<n; j++) {
1097:         rval      = ib[j]*4;
1098:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1099:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1100:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1101:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1102:         v  += 16;
1103:       }
1104:       xb += 4;
1105:     }
1106:     break;
1107:   case 5:
1108:     for (i=0; i<mbs; i++) {
1109:       n  = ii[1] - ii[0]; ii++;
1110:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1111:       x4 = xb[3]; x5 = xb[4];
1112:       ib = idx + ai[i];
1113:       for (j=0; j<n; j++) {
1114:         rval      = ib[j]*5;
1115:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1116:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1117:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1118:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1119:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1120:         v  += 25;
1121:       }
1122:       xb += 5;
1123:     }
1124:     break;
1125:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1126:       int       ncols,k;
1127:       Scalar    *work,*workt;

1129:       if (!a->mult_work) {
1130:         k = PetscMax(A->m,A->n);
1131:         PetscMalloc((k+1)*sizeof(Scalar),&a->mult_work);
1132:       }
1133:       work = a->mult_work;
1134:       for (i=0; i<mbs; i++) {
1135:         n     = ii[1] - ii[0]; ii++;
1136:         ncols = n*bs;
1137:         ierr  = PetscMemzero(work,ncols*sizeof(Scalar));
1138:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1139:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1140:         v += n*bs2;
1141:         x += bs;
1142:         workt = work;
1143:         for (j=0; j<n; j++) {
1144:           zb = z + bs*(*idx++);
1145:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1146:           workt += bs;
1147:         }
1148:       }
1149:     }
1150:   }
1151:   VecRestoreArray(xx,&xg);
1152:   VecRestoreArray(zz,&zg);
1153:   PetscLogFlops(2*a->nz*a->bs2);
1154:   return(0);
1155: }

1157: int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1158: {
1159:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1160:   int         totalnz = a->bs2*a->nz;
1161: #if defined(PETSC_USE_MAT_SINGLE)
1162:   int         i;
1163: #else
1164:   int         one = 1;
1165: #endif

1168: #if defined(PETSC_USE_MAT_SINGLE)
1169:   for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1170: #else
1171:   BLscal_(&totalnz,alpha,a->a,&one);
1172: #endif
1173:   PetscLogFlops(totalnz);
1174:   return(0);
1175: }

1177: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1178: {
1179:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1180:   MatScalar   *v = a->a;
1181:   PetscReal   sum = 0.0;
1182:   int         i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;

1185:   if (type == NORM_FROBENIUS) {
1186:     for (i=0; i< bs2*nz; i++) {
1187: #if defined(PETSC_USE_COMPLEX)
1188:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1189: #else
1190:       sum += (*v)*(*v); v++;
1191: #endif
1192:     }
1193:     *norm = sqrt(sum);
1194:   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1195:     *norm = 0.0;
1196:     for (k=0; k<bs; k++) {
1197:       for (j=0; j<a->mbs; j++) {
1198:         v = a->a + bs2*a->i[j] + k;
1199:         sum = 0.0;
1200:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1201:           for (k1=0; k1<bs; k1++){
1202:             sum += PetscAbsScalar(*v);
1203:             v   += bs;
1204:           }
1205:         }
1206:         if (sum > *norm) *norm = sum;
1207:       }
1208:     }
1209:   } else {
1210:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1211:   }
1212:   return(0);
1213: }


1216: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1217: {
1218:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1219:   int         ierr;
1220:   PetscTruth  flag;

1223:   PetscTypeCompare((PetscObject)B,MATSEQBAIJ,&flag);
1224:   if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");

1226:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1227:   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1228:     *flg = PETSC_FALSE;
1229:     return(0);
1230:   }
1231: 
1232:   /* if the a->i are the same */
1233:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1234:   if (*flg == PETSC_FALSE) {
1235:     return(0);
1236:   }
1237: 
1238:   /* if a->j are the same */
1239:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1240:   if (*flg == PETSC_FALSE) {
1241:     return(0);
1242:   }
1243:   /* if a->a are the same */
1244:   PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);
1245:   return(0);
1246: 
1247: }

1249: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1250: {
1251:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1252:   int         ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1253:   Scalar      *x,zero = 0.0;
1254:   MatScalar   *aa,*aa_j;

1257:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1258:   bs   = a->bs;
1259:   aa   = a->a;
1260:   ai   = a->i;
1261:   aj   = a->j;
1262:   ambs = a->mbs;
1263:   bs2  = a->bs2;

1265:   VecSet(&zero,v);
1266:   VecGetArray(v,&x);
1267:   VecGetLocalSize(v,&n);
1268:   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1269:   for (i=0; i<ambs; i++) {
1270:     for (j=ai[i]; j<ai[i+1]; j++) {
1271:       if (aj[j] == i) {
1272:         row  = i*bs;
1273:         aa_j = aa+j*bs2;
1274:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1275:         break;
1276:       }
1277:     }
1278:   }
1279:   VecRestoreArray(v,&x);
1280:   return(0);
1281: }

1283: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1284: {
1285:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1286:   Scalar      *l,*r,x,*li,*ri;
1287:   MatScalar   *aa,*v;
1288:   int         ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1291:   ai  = a->i;
1292:   aj  = a->j;
1293:   aa  = a->a;
1294:   m   = A->m;
1295:   n   = A->n;
1296:   bs  = a->bs;
1297:   mbs = a->mbs;
1298:   bs2 = a->bs2;
1299:   if (ll) {
1300:     VecGetArray(ll,&l);
1301:     VecGetLocalSize(ll,&lm);
1302:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1303:     for (i=0; i<mbs; i++) { /* for each block row */
1304:       M  = ai[i+1] - ai[i];
1305:       li = l + i*bs;
1306:       v  = aa + bs2*ai[i];
1307:       for (j=0; j<M; j++) { /* for each block */
1308:         for (k=0; k<bs2; k++) {
1309:           (*v++) *= li[k%bs];
1310:         }
1311:       }
1312:     }
1313:     VecRestoreArray(ll,&l);
1314:     PetscLogFlops(a->nz);
1315:   }
1316: 
1317:   if (rr) {
1318:     VecGetArray(rr,&r);
1319:     VecGetLocalSize(rr,&rn);
1320:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1321:     for (i=0; i<mbs; i++) { /* for each block row */
1322:       M  = ai[i+1] - ai[i];
1323:       v  = aa + bs2*ai[i];
1324:       for (j=0; j<M; j++) { /* for each block */
1325:         ri = r + bs*aj[ai[i]+j];
1326:         for (k=0; k<bs; k++) {
1327:           x = ri[k];
1328:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1329:         }
1330:       }
1331:     }
1332:     VecRestoreArray(rr,&r);
1333:     PetscLogFlops(a->nz);
1334:   }
1335:   return(0);
1336: }


1339: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1340: {
1341:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1344:   info->rows_global    = (double)A->m;
1345:   info->columns_global = (double)A->n;
1346:   info->rows_local     = (double)A->m;
1347:   info->columns_local  = (double)A->n;
1348:   info->block_size     = a->bs2;
1349:   info->nz_allocated   = a->maxnz;
1350:   info->nz_used        = a->bs2*a->nz;
1351:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1352:   info->assemblies   = A->num_ass;
1353:   info->mallocs      = a->reallocs;
1354:   info->memory       = A->mem;
1355:   if (A->factor) {
1356:     info->fill_ratio_given  = A->info.fill_ratio_given;
1357:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1358:     info->factor_mallocs    = A->info.factor_mallocs;
1359:   } else {
1360:     info->fill_ratio_given  = 0;
1361:     info->fill_ratio_needed = 0;
1362:     info->factor_mallocs    = 0;
1363:   }
1364:   return(0);
1365: }


1368: int MatZeroEntries_SeqBAIJ(Mat A)
1369: {
1370:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1371:   int         ierr;

1374:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1375:   return(0);
1376: }