Actual source code: baij2.c

  1: /*$Id: baij2.c,v 1.75 2001/09/07 20:09:49 bsmith Exp $*/

 3:  #include src/mat/impls/baij/seq/baij.h
 4:  #include src/vec/vecimpl.h
 5:  #include src/inline/spops.h
 6:  #include src/inline/ilu.h
 7:  #include petscbt.h

  9: #undef __FUNCT__  
 11: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
 12: {
 13:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
 14:   int         row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
 15:   int         start,end,*ai,*aj,bs,*nidx2;
 16:   PetscBT     table;

 19:   m     = a->mbs;
 20:   ai    = a->i;
 21:   aj    = a->j;
 22:   bs    = a->bs;

 24:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 26:   PetscBTCreate(m,table);
 27:   PetscMalloc((m+1)*sizeof(int),&nidx);
 28:   PetscMalloc((A->m+1)*sizeof(int),&nidx2);

 30:   for (i=0; i<is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz  = 0;
 33:     PetscBTMemzero(m,table);
 34: 
 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     ISGetIndices(is[i],&idx);
 37:     ISGetLocalSize(is[i],&n);

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j=0; j<n ; ++j){
 41:       ival = idx[j]/bs; /* convert the indices into block indices */
 42:       if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 43:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 44:     }
 45:     ISRestoreIndices(is[i],&idx);
 46:     ISDestroy(is[i]);
 47: 
 48:     k = 0;
 49:     for (j=0; j<ov; j++){ /* for each overlap*/
 50:       n = isz;
 51:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row+1];
 55:         for (l = start; l<end ; l++){
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 58:         }
 59:       }
 60:     }
 61:     /* expand the Index Set */
 62:     for (j=0; j<isz; j++) {
 63:       for (k=0; k<bs; k++)
 64:         nidx2[j*bs+k] = nidx[j]*bs+k;
 65:     }
 66:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 67:   }
 68:   PetscBTDestroy(table);
 69:   PetscFree(nidx);
 70:   PetscFree(nidx2);
 71:   return(0);
 72: }

 74: #undef __FUNCT__  
 76: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
 77: {
 78:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data,*c;
 79:   int          *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
 80:   int          row,mat_i,*mat_j,tcol,*mat_ilen;
 81:   int          *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
 82:   int          *aj = a->j,*ai = a->i;
 83:   MatScalar    *mat_a;
 84:   Mat          C;
 85:   PetscTruth   flag;

 88:   ISSorted(iscol,(PetscTruth*)&i);
 89:   if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 91:   ISGetIndices(isrow,&irow);
 92:   ISGetIndices(iscol,&icol);
 93:   ISGetLocalSize(isrow,&nrows);
 94:   ISGetLocalSize(iscol,&ncols);

 96:   PetscMalloc((1+oldcols)*sizeof(int),&smap);
 97:   ssmap = smap;
 98:   PetscMalloc((1+nrows)*sizeof(int),&lens);
 99:   ierr  = PetscMemzero(smap,oldcols*sizeof(int));
100:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
101:   /* determine lens of each row */
102:   for (i=0; i<nrows; i++) {
103:     kstart  = ai[irow[i]];
104:     kend    = kstart + a->ilen[irow[i]];
105:     lens[i] = 0;
106:       for (k=kstart; k<kend; k++) {
107:         if (ssmap[aj[k]]) {
108:           lens[i]++;
109:         }
110:       }
111:     }
112:   /* Create and fill new matrix */
113:   if (scall == MAT_REUSE_MATRIX) {
114:     c = (Mat_SeqBAIJ *)((*B)->data);

116:     if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
117:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
118:     if (flag == PETSC_FALSE) {
119:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120:     }
121:     PetscMemzero(c->ilen,c->mbs*sizeof(int));
122:     C = *B;
123:   } else {
124:     MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);
125:   }
126:   c = (Mat_SeqBAIJ *)(C->data);
127:   for (i=0; i<nrows; i++) {
128:     row    = irow[i];
129:     kstart = ai[row];
130:     kend   = kstart + a->ilen[row];
131:     mat_i  = c->i[i];
132:     mat_j  = c->j + mat_i;
133:     mat_a  = c->a + mat_i*bs2;
134:     mat_ilen = c->ilen + i;
135:     for (k=kstart; k<kend; k++) {
136:       if ((tcol=ssmap[a->j[k]])) {
137:         *mat_j++ = tcol - 1;
138:         ierr     = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
139:         mat_a   += bs2;
140:         (*mat_ilen)++;
141:       }
142:     }
143:   }
144: 
145:   /* Free work space */
146:   ISRestoreIndices(iscol,&icol);
147:   PetscFree(smap);
148:   PetscFree(lens);
149:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
150:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
151: 
152:   ISRestoreIndices(isrow,&irow);
153:   *B = C;
154:   return(0);
155: }

157: #undef __FUNCT__  
159: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
160: {
161:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
162:   IS          is1,is2;
163:   int         *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;

166:   ISGetIndices(isrow,&irow);
167:   ISGetIndices(iscol,&icol);
168:   ISGetLocalSize(isrow,&nrows);
169:   ISGetLocalSize(iscol,&ncols);
170: 
171:   /* Verify if the indices corespond to each element in a block 
172:    and form the IS with compressed IS */
173:   PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
174:   iary = vary + a->mbs;
175:   PetscMemzero(vary,(a->mbs)*sizeof(int));
176:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
177:   count = 0;
178:   for (i=0; i<a->mbs; i++) {
179:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
180:     if (vary[i]==bs) iary[count++] = i;
181:   }
182:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
183: 
184:   PetscMemzero(vary,(a->mbs)*sizeof(int));
185:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
186:   count = 0;
187:   for (i=0; i<a->mbs; i++) {
188:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
189:     if (vary[i]==bs) iary[count++] = i;
190:   }
191:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
192:   ISRestoreIndices(isrow,&irow);
193:   ISRestoreIndices(iscol,&icol);
194:   PetscFree(vary);

196:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
197:   ISDestroy(is1);
198:   ISDestroy(is2);
199:   return(0);
200: }

202: #undef __FUNCT__  
204: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
205: {
206:   int ierr,i;

209:   if (scall == MAT_INITIAL_MATRIX) {
210:     PetscMalloc((n+1)*sizeof(Mat),B);
211:   }

213:   for (i=0; i<n; i++) {
214:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
215:   }
216:   return(0);
217: }


220: /* -------------------------------------------------------*/
221: /* Should check that shapes of vectors and matrices match */
222: /* -------------------------------------------------------*/
223:  #include petscblaslapack.h

225: #undef __FUNCT__  
227: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
228: {
229:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
230:   PetscScalar     *x,*z,sum;
231:   MatScalar       *v;
232:   int             mbs=a->mbs,i,*idx,*ii,n,ierr;

235:   VecGetArray(xx,&x);
236:   VecGetArray(zz,&z);

238:   idx   = a->j;
239:   v     = a->a;
240:   ii    = a->i;

242:   for (i=0; i<mbs; i++) {
243:     n    = ii[1] - ii[0]; ii++;
244:     sum  = 0.0;
245:     while (n--) sum += *v++ * x[*idx++];
246:     z[i] = sum;
247:   }
248:   VecRestoreArray(xx,&x);
249:   VecRestoreArray(zz,&z);
250:   PetscLogFlops(2*a->nz - A->m);
251:   return(0);
252: }

254: #undef __FUNCT__  
256: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
257: {
258:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
259:   PetscScalar     *x,*z,*xb,sum1,sum2;
260:   PetscScalar     x1,x2;
261:   MatScalar       *v;
262:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

265:   VecGetArray(xx,&x);
266:   VecGetArray(zz,&z);

268:   idx   = a->j;
269:   v     = a->a;
270:   ii    = a->i;

272:   for (i=0; i<mbs; i++) {
273:     n  = ii[1] - ii[0]; ii++;
274:     sum1 = 0.0; sum2 = 0.0;
275:     for (j=0; j<n; j++) {
276:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
277:       sum1 += v[0]*x1 + v[2]*x2;
278:       sum2 += v[1]*x1 + v[3]*x2;
279:       v += 4;
280:     }
281:     z[0] = sum1; z[1] = sum2;
282:     z += 2;
283:   }
284:   VecRestoreArray(xx,&x);
285:   VecRestoreArray(zz,&z);
286:   PetscLogFlops(8*a->nz - A->m);
287:   return(0);
288: }

290: #undef __FUNCT__  
292: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
293: {
294:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
295:   PetscScalar  *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
296:   MatScalar    *v;
297:   int          ierr,mbs=a->mbs,i,*idx,*ii,j,n;

299: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
300: #pragma disjoint(*v,*z,*xb)
301: #endif

304:   VecGetArray(xx,&x);
305:   VecGetArray(zz,&z);

307:   idx   = a->j;
308:   v     = a->a;
309:   ii    = a->i;

311:   for (i=0; i<mbs; i++) {
312:     n  = ii[1] - ii[0]; ii++;
313:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
314:     for (j=0; j<n; j++) {
315:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
316:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
317:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
318:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
319:       v += 9;
320:     }
321:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
322:     z += 3;
323:   }
324:   VecRestoreArray(xx,&x);
325:   VecRestoreArray(zz,&z);
326:   PetscLogFlops(18*a->nz - A->m);
327:   return(0);
328: }

330: #undef __FUNCT__  
332: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
333: {
334:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
335:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
336:   MatScalar       *v;
337:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

340:   VecGetArray(xx,&x);
341:   VecGetArray(zz,&z);

343:   idx   = a->j;
344:   v     = a->a;
345:   ii    = a->i;

347:   for (i=0; i<mbs; i++) {
348:     n  = ii[1] - ii[0]; ii++;
349:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
350:     for (j=0; j<n; j++) {
351:       xb = x + 4*(*idx++);
352:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
353:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
354:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
355:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
356:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
357:       v += 16;
358:     }
359:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
360:     z += 4;
361:   }
362:   VecRestoreArray(xx,&x);
363:   VecRestoreArray(zz,&z);
364:   PetscLogFlops(32*a->nz - A->m);
365:   return(0);
366: }

368: #undef __FUNCT__  
370: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
371: {
372:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
373:   PetscScalar     sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
374:   MatScalar       *v;
375:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

378:   VecGetArray(xx,&x);
379:   VecGetArray(zz,&z);

381:   idx   = a->j;
382:   v     = a->a;
383:   ii    = a->i;

385:   for (i=0; i<mbs; i++) {
386:     n  = ii[1] - ii[0]; ii++;
387:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
388:     for (j=0; j<n; j++) {
389:       xb = x + 5*(*idx++);
390:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
391:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
392:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
393:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
394:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
395:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
396:       v += 25;
397:     }
398:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
399:     z += 5;
400:   }
401:   VecRestoreArray(xx,&x);
402:   VecRestoreArray(zz,&z);
403:   PetscLogFlops(50*a->nz - A->m);
404:   return(0);
405: }


408: #undef __FUNCT__  
410: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
411: {
412:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
413:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
414:   PetscScalar     x1,x2,x3,x4,x5,x6;
415:   MatScalar       *v;
416:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

419:   VecGetArray(xx,&x);
420:   VecGetArray(zz,&z);

422:   idx   = a->j;
423:   v     = a->a;
424:   ii    = a->i;

426:   for (i=0; i<mbs; i++) {
427:     n  = ii[1] - ii[0]; ii++;
428:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
429:     for (j=0; j<n; j++) {
430:       xb = x + 6*(*idx++);
431:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
432:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
433:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
434:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
435:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
436:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
437:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
438:       v += 36;
439:     }
440:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
441:     z += 6;
442:   }

444:   VecRestoreArray(xx,&x);
445:   VecRestoreArray(zz,&z);
446:   PetscLogFlops(72*a->nz - A->m);
447:   return(0);
448: }
449: #undef __FUNCT__  
451: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
452: {
453:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
454:   PetscScalar     *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
455:   PetscScalar     x1,x2,x3,x4,x5,x6,x7;
456:   MatScalar       *v;
457:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

460:   VecGetArray(xx,&x);
461:   VecGetArray(zz,&z);

463:   idx   = a->j;
464:   v     = a->a;
465:   ii    = a->i;

467:   for (i=0; i<mbs; i++) {
468:     n  = ii[1] - ii[0]; ii++;
469:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
470:     for (j=0; j<n; j++) {
471:       xb = x + 7*(*idx++);
472:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
473:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
474:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
475:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
476:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
477:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
478:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
479:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
480:       v += 49;
481:     }
482:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
483:     z += 7;
484:   }

486:   VecRestoreArray(xx,&x);
487:   VecRestoreArray(zz,&z);
488:   PetscLogFlops(98*a->nz - A->m);
489:   return(0);
490: }

492: /*
493:     This will not work with MatScalar == float because it calls the BLAS
494: */
495: #undef __FUNCT__  
497: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
498: {
499:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
500:   PetscScalar     *x,*z,*xb,*work,*workt;
501:   MatScalar       *v;
502:   int             ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
503:   int             ncols,k;

506:   VecGetArray(xx,&x);
507:   VecGetArray(zz,&z);

509:   idx   = a->j;
510:   v     = a->a;
511:   ii    = a->i;


514:   if (!a->mult_work) {
515:     k    = PetscMax(A->m,A->n);
516:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
517:   }
518:   work = a->mult_work;
519:   for (i=0; i<mbs; i++) {
520:     n     = ii[1] - ii[0]; ii++;
521:     ncols = n*bs;
522:     workt = work;
523:     for (j=0; j<n; j++) {
524:       xb = x + bs*(*idx++);
525:       for (k=0; k<bs; k++) workt[k] = xb[k];
526:       workt += bs;
527:     }
528:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
529:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
530:     v += n*bs2;
531:     z += bs;
532:   }
533:   VecRestoreArray(xx,&x);
534:   VecRestoreArray(zz,&z);
535:   PetscLogFlops(2*a->nz*bs2 - A->m);
536:   return(0);
537: }

539: #undef __FUNCT__  
541: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
542: {
543:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
544:   PetscScalar     *x,*y,*z,sum;
545:   MatScalar       *v;
546:   int             ierr,mbs=a->mbs,i,*idx,*ii,n;

549:   VecGetArray(xx,&x);
550:   VecGetArray(yy,&y);
551:   if (zz != yy) {
552:     VecGetArray(zz,&z);
553:   } else {
554:     z = y;
555:   }

557:   idx   = a->j;
558:   v     = a->a;
559:   ii    = a->i;

561:   for (i=0; i<mbs; i++) {
562:     n    = ii[1] - ii[0]; ii++;
563:     sum  = y[i];
564:     while (n--) sum += *v++ * x[*idx++];
565:     z[i] = sum;
566:   }
567:   VecRestoreArray(xx,&x);
568:   VecRestoreArray(yy,&y);
569:   if (zz != yy) {
570:     VecRestoreArray(zz,&z);
571:   }
572:   PetscLogFlops(2*a->nz);
573:   return(0);
574: }

576: #undef __FUNCT__  
578: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
579: {
580:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
581:   PetscScalar     *x,*y,*z,*xb,sum1,sum2;
582:   PetscScalar     x1,x2;
583:   MatScalar       *v;
584:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

587:   VecGetArray(xx,&x);
588:   VecGetArray(yy,&y);
589:   if (zz != yy) {
590:     VecGetArray(zz,&z);
591:   } else {
592:     z = y;
593:   }

595:   idx   = a->j;
596:   v     = a->a;
597:   ii    = a->i;

599:   for (i=0; i<mbs; i++) {
600:     n  = ii[1] - ii[0]; ii++;
601:     sum1 = y[0]; sum2 = y[1];
602:     for (j=0; j<n; j++) {
603:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
604:       sum1 += v[0]*x1 + v[2]*x2;
605:       sum2 += v[1]*x1 + v[3]*x2;
606:       v += 4;
607:     }
608:     z[0] = sum1; z[1] = sum2;
609:     z += 2; y += 2;
610:   }
611:   VecRestoreArray(xx,&x);
612:   VecRestoreArray(yy,&y);
613:   if (zz != yy) {
614:     VecRestoreArray(zz,&z);
615:   }
616:   PetscLogFlops(4*a->nz);
617:   return(0);
618: }

620: #undef __FUNCT__  
622: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
623: {
624:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
625:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
626:   MatScalar       *v;
627:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

630:   VecGetArray(xx,&x);
631:   VecGetArray(yy,&y);
632:   if (zz != yy) {
633:     VecGetArray(zz,&z);
634:   } else {
635:     z = y;
636:   }

638:   idx   = a->j;
639:   v     = a->a;
640:   ii    = a->i;

642:   for (i=0; i<mbs; i++) {
643:     n  = ii[1] - ii[0]; ii++;
644:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
645:     for (j=0; j<n; j++) {
646:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
647:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
648:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
649:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
650:       v += 9;
651:     }
652:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
653:     z += 3; y += 3;
654:   }
655:   VecRestoreArray(xx,&x);
656:   VecRestoreArray(yy,&y);
657:   if (zz != yy) {
658:     VecRestoreArray(zz,&z);
659:   }
660:   PetscLogFlops(18*a->nz);
661:   return(0);
662: }

664: #undef __FUNCT__  
666: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
667: {
668:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
669:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
670:   MatScalar       *v;
671:   int             ierr,mbs=a->mbs,i,*idx,*ii;
672:   int             j,n;

675:   VecGetArray(xx,&x);
676:   VecGetArray(yy,&y);
677:   if (zz != yy) {
678:     VecGetArray(zz,&z);
679:   } else {
680:     z = y;
681:   }

683:   idx   = a->j;
684:   v     = a->a;
685:   ii    = a->i;

687:   for (i=0; i<mbs; i++) {
688:     n  = ii[1] - ii[0]; ii++;
689:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
690:     for (j=0; j<n; j++) {
691:       xb = x + 4*(*idx++);
692:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
693:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
694:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
695:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
696:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
697:       v += 16;
698:     }
699:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
700:     z += 4; y += 4;
701:   }
702:   VecRestoreArray(xx,&x);
703:   VecRestoreArray(yy,&y);
704:   if (zz != yy) {
705:     VecRestoreArray(zz,&z);
706:   }
707:   PetscLogFlops(32*a->nz);
708:   return(0);
709: }

711: #undef __FUNCT__  
713: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
714: {
715:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
716:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
717:   MatScalar       *v;
718:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

721:   VecGetArray(xx,&x);
722:   VecGetArray(yy,&y);
723:   if (zz != yy) {
724:     VecGetArray(zz,&z);
725:   } else {
726:     z = y;
727:   }

729:   idx   = a->j;
730:   v     = a->a;
731:   ii    = a->i;

733:   for (i=0; i<mbs; i++) {
734:     n  = ii[1] - ii[0]; ii++;
735:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
736:     for (j=0; j<n; j++) {
737:       xb = x + 5*(*idx++);
738:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
739:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
740:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
741:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
742:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
743:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
744:       v += 25;
745:     }
746:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
747:     z += 5; y += 5;
748:   }
749:   VecRestoreArray(xx,&x);
750:   VecRestoreArray(yy,&y);
751:   if (zz != yy) {
752:     VecRestoreArray(zz,&z);
753:   }
754:   PetscLogFlops(50*a->nz);
755:   return(0);
756: }
757: #undef __FUNCT__  
759: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
760: {
761:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
762:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
763:   PetscScalar     x1,x2,x3,x4,x5,x6;
764:   MatScalar       *v;
765:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

768:   VecGetArray(xx,&x);
769:   VecGetArray(yy,&y);
770:   if (zz != yy) {
771:     VecGetArray(zz,&z);
772:   } else {
773:     z = y;
774:   }

776:   idx   = a->j;
777:   v     = a->a;
778:   ii    = a->i;

780:   for (i=0; i<mbs; i++) {
781:     n  = ii[1] - ii[0]; ii++;
782:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
783:     for (j=0; j<n; j++) {
784:       xb = x + 6*(*idx++);
785:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
786:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
787:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
788:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
789:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
790:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
791:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
792:       v += 36;
793:     }
794:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
795:     z += 6; y += 6;
796:   }
797:   VecRestoreArray(xx,&x);
798:   VecRestoreArray(yy,&y);
799:   if (zz != yy) {
800:     VecRestoreArray(zz,&z);
801:   }
802:   PetscLogFlops(72*a->nz);
803:   return(0);
804: }

806: #undef __FUNCT__  
808: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
809: {
810:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
811:   PetscScalar     *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
812:   PetscScalar     x1,x2,x3,x4,x5,x6,x7;
813:   MatScalar       *v;
814:   int             ierr,mbs=a->mbs,i,*idx,*ii,j,n;

817:   VecGetArray(xx,&x);
818:   VecGetArray(yy,&y);
819:   if (zz != yy) {
820:     VecGetArray(zz,&z);
821:   } else {
822:     z = y;
823:   }

825:   idx   = a->j;
826:   v     = a->a;
827:   ii    = a->i;

829:   for (i=0; i<mbs; i++) {
830:     n  = ii[1] - ii[0]; ii++;
831:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
832:     for (j=0; j<n; j++) {
833:       xb = x + 7*(*idx++);
834:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
835:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
836:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
837:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
838:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
839:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
840:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
841:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
842:       v += 49;
843:     }
844:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
845:     z += 7; y += 7;
846:   }
847:   VecRestoreArray(xx,&x);
848:   VecRestoreArray(yy,&y);
849:   if (zz != yy) {
850:     VecRestoreArray(zz,&z);
851:   }
852:   PetscLogFlops(98*a->nz);
853:   return(0);
854: }

856: #undef __FUNCT__  
858: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
859: {
860:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
861:   PetscScalar    *x,*z,*xb,*work,*workt,*y;
862:   MatScalar      *v;
863:   int            mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
864:   int            ncols,k;

867:   VecGetArray(xx,&x);
868:   VecGetArray(zz,&z);
869:   if (zz != yy) {
870:     VecGetArrayFast(yy,&y);
871:     PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
872:     VecRestoreArrayFast(yy,&y);
873:   }

875:   idx   = a->j;
876:   v     = a->a;
877:   ii    = a->i;


880:   if (!a->mult_work) {
881:     k    = PetscMax(A->m,A->n);
882:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
883:   }
884:   work = a->mult_work;
885:   for (i=0; i<mbs; i++) {
886:     n     = ii[1] - ii[0]; ii++;
887:     ncols = n*bs;
888:     workt = work;
889:     for (j=0; j<n; j++) {
890:       xb = x + bs*(*idx++);
891:       for (k=0; k<bs; k++) workt[k] = xb[k];
892:       workt += bs;
893:     }
894:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
895:     /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
896:     v += n*bs2;
897:     z += bs;
898:   }
899:   VecRestoreArray(xx,&x);
900:   VecRestoreArray(zz,&z);
901:   PetscLogFlops(2*a->nz*bs2);
902:   return(0);
903: }

905: #undef __FUNCT__  
907: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
908: {
909:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
910:   PetscScalar     *xg,*zg,*zb,zero = 0.0;
911:   PetscScalar     *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
912:   MatScalar       *v;
913:   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
914:   int             bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;

917:   VecSet(&zero,zz);
918:   VecGetArray(xx,&xg); x = xg;
919:   VecGetArray(zz,&zg); z = zg;

921:   idx   = a->j;
922:   v     = a->a;
923:   ii    = a->i;
924:   xb    = x;
925:   switch (bs) {
926:   case 1:
927:     for (i=0; i<mbs; i++) {
928:       n  = ii[1] - ii[0]; ii++;
929:       x1 = xb[0];
930:       ib = idx + ai[i];
931:       for (j=0; j<n; j++) {
932:         rval    = ib[j];
933:         z[rval] += *v * x1;
934:         v++;
935:       }
936:       xb++;
937:     }
938:     break;
939:   case 2:
940:     for (i=0; i<mbs; i++) {
941:       n  = ii[1] - ii[0]; ii++;
942:       x1 = xb[0]; x2 = xb[1];
943:       ib = idx + ai[i];
944:       for (j=0; j<n; j++) {
945:         rval      = ib[j]*2;
946:         z[rval++] += v[0]*x1 + v[1]*x2;
947:         z[rval]   += v[2]*x1 + v[3]*x2;
948:         v  += 4;
949:       }
950:       xb += 2;
951:     }
952:     break;
953:   case 3:
954:     for (i=0; i<mbs; i++) {
955:       n  = ii[1] - ii[0]; ii++;
956:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
957:       ib = idx + ai[i];
958:       for (j=0; j<n; j++) {
959:         rval      = ib[j]*3;
960:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
961:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
962:         z[rval]   += v[6]*x1 + v[7]*x2 + v[8]*x3;
963:         v  += 9;
964:       }
965:       xb += 3;
966:     }
967:     break;
968:   case 4:
969:     for (i=0; i<mbs; i++) {
970:       n  = ii[1] - ii[0]; ii++;
971:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
972:       ib = idx + ai[i];
973:       for (j=0; j<n; j++) {
974:         rval      = ib[j]*4;
975:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
976:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
977:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
978:         z[rval]   += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
979:         v  += 16;
980:       }
981:       xb += 4;
982:     }
983:     break;
984:   case 5:
985:     for (i=0; i<mbs; i++) {
986:       n  = ii[1] - ii[0]; ii++;
987:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
988:       x4 = xb[3]; x5 = xb[4];
989:       ib = idx + ai[i];
990:       for (j=0; j<n; j++) {
991:         rval      = ib[j]*5;
992:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
993:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
994:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
995:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
996:         z[rval]   += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
997:         v  += 25;
998:       }
999:       xb += 5;
1000:     }
1001:     break;
1002:   case 6:
1003:     for (i=0; i<mbs; i++) {
1004:       n  = ii[1] - ii[0]; ii++;
1005:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1006:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1007:       ib = idx + ai[i];
1008:       for (j=0; j<n; j++) {
1009:         rval      = ib[j]*6;
1010:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6;
1011:         z[rval++] +=  v[6]*x1 +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1012:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1013:         z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1014:         z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1015:         z[rval]   += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1016:         v  += 36;
1017:       }
1018:       xb += 6;
1019:     }
1020:     break;
1021:   case 7:
1022:     for (i=0; i<mbs; i++) {
1023:       n  = ii[1] - ii[0]; ii++;
1024:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1025:       x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1026:       ib = idx + ai[i];
1027:       for (j=0; j<n; j++) {
1028:         rval      = ib[j]*7;
1029:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1030:         z[rval++] +=  v[7]*x1 +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1031:         z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1032:         z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1033:         z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1034:         z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1035:         z[rval]   += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1036:         v  += 49;
1037:       }
1038:       xb += 7;
1039:     }
1040:     break;
1041:   default: {       /* block sizes larger then 7 by 7 are handled by BLAS */
1042:       int          ncols,k;
1043:       PetscScalar  *work,*workt;

1045:       if (!a->mult_work) {
1046:         k = PetscMax(A->m,A->n);
1047:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1048:       }
1049:       work = a->mult_work;
1050:       for (i=0; i<mbs; i++) {
1051:         n     = ii[1] - ii[0]; ii++;
1052:         ncols = n*bs;
1053:         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));
1054:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1055:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1056:         v += n*bs2;
1057:         x += bs;
1058:         workt = work;
1059:         for (j=0; j<n; j++) {
1060:           zb = z + bs*(*idx++);
1061:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1062:           workt += bs;
1063:         }
1064:       }
1065:     }
1066:   }
1067:   VecRestoreArray(xx,&xg);
1068:   VecRestoreArray(zz,&zg);
1069:   PetscLogFlops(2*a->nz*a->bs2 - A->n);
1070:   return(0);
1071: }

1073: #undef __FUNCT__  
1075: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1077: {
1078:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ*)A->data;
1079:   PetscScalar     *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1080:   MatScalar       *v;
1081:   int             mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;

1084:   if (yy != zz) { VecCopy(yy,zz); }
1085:   VecGetArray(xx,&xg); x = xg;
1086:   VecGetArray(zz,&zg); z = zg;


1089:   idx   = a->j;
1090:   v     = a->a;
1091:   ii    = a->i;
1092:   xb    = x;

1094:   switch (bs) {
1095:   case 1:
1096:     for (i=0; i<mbs; i++) {
1097:       n  = ii[1] - ii[0]; ii++;
1098:       x1 = xb[0];
1099:       ib = idx + ai[i];
1100:       for (j=0; j<n; j++) {
1101:         rval    = ib[j];
1102:         z[rval] += *v * x1;
1103:         v++;
1104:       }
1105:       xb++;
1106:     }
1107:     break;
1108:   case 2:
1109:     for (i=0; i<mbs; i++) {
1110:       n  = ii[1] - ii[0]; ii++;
1111:       x1 = xb[0]; x2 = xb[1];
1112:       ib = idx + ai[i];
1113:       for (j=0; j<n; j++) {
1114:         rval      = ib[j]*2;
1115:         z[rval++] += v[0]*x1 + v[1]*x2;
1116:         z[rval++] += v[2]*x1 + v[3]*x2;
1117:         v  += 4;
1118:       }
1119:       xb += 2;
1120:     }
1121:     break;
1122:   case 3:
1123:     for (i=0; i<mbs; i++) {
1124:       n  = ii[1] - ii[0]; ii++;
1125:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1126:       ib = idx + ai[i];
1127:       for (j=0; j<n; j++) {
1128:         rval      = ib[j]*3;
1129:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1130:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1131:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1132:         v  += 9;
1133:       }
1134:       xb += 3;
1135:     }
1136:     break;
1137:   case 4:
1138:     for (i=0; i<mbs; i++) {
1139:       n  = ii[1] - ii[0]; ii++;
1140:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1141:       ib = idx + ai[i];
1142:       for (j=0; j<n; j++) {
1143:         rval      = ib[j]*4;
1144:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1145:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1146:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1147:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1148:         v  += 16;
1149:       }
1150:       xb += 4;
1151:     }
1152:     break;
1153:   case 5:
1154:     for (i=0; i<mbs; i++) {
1155:       n  = ii[1] - ii[0]; ii++;
1156:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1157:       x4 = xb[3]; x5 = xb[4];
1158:       ib = idx + ai[i];
1159:       for (j=0; j<n; j++) {
1160:         rval      = ib[j]*5;
1161:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1162:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1163:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1164:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1165:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1166:         v  += 25;
1167:       }
1168:       xb += 5;
1169:     }
1170:     break;
1171:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1172:       int          ncols,k;
1173:       PetscScalar  *work,*workt;

1175:       if (!a->mult_work) {
1176:         k = PetscMax(A->m,A->n);
1177:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1178:       }
1179:       work = a->mult_work;
1180:       for (i=0; i<mbs; i++) {
1181:         n     = ii[1] - ii[0]; ii++;
1182:         ncols = n*bs;
1183:         ierr  = PetscMemzero(work,ncols*sizeof(PetscScalar));
1184:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1185:         /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1186:         v += n*bs2;
1187:         x += bs;
1188:         workt = work;
1189:         for (j=0; j<n; j++) {
1190:           zb = z + bs*(*idx++);
1191:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1192:           workt += bs;
1193:         }
1194:       }
1195:     }
1196:   }
1197:   VecRestoreArray(xx,&xg);
1198:   VecRestoreArray(zz,&zg);
1199:   PetscLogFlops(2*a->nz*a->bs2);
1200:   return(0);
1201: }

1203: #undef __FUNCT__  
1205: int MatScale_SeqBAIJ(PetscScalar *alpha,Mat inA)
1206: {
1207:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1208:   int         totalnz = a->bs2*a->nz;
1209: #if defined(PETSC_USE_MAT_SINGLE)
1210:   int         i;
1211: #else
1212:   int         one = 1;
1213: #endif

1216: #if defined(PETSC_USE_MAT_SINGLE)
1217:   for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1218: #else
1219:   BLscal_(&totalnz,alpha,a->a,&one);
1220: #endif
1221:   PetscLogFlops(totalnz);
1222:   return(0);
1223: }

1225: #undef __FUNCT__  
1227: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1228: {
1229:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1230:   MatScalar   *v = a->a;
1231:   PetscReal   sum = 0.0;
1232:   int         i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;

1235:   if (type == NORM_FROBENIUS) {
1236:     for (i=0; i< bs2*nz; i++) {
1237: #if defined(PETSC_USE_COMPLEX)
1238:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1239: #else
1240:       sum += (*v)*(*v); v++;
1241: #endif
1242:     }
1243:     *norm = sqrt(sum);
1244:   }  else if (type == NORM_INFINITY) { /* maximum row sum */
1245:     *norm = 0.0;
1246:     for (k=0; k<bs; k++) {
1247:       for (j=0; j<a->mbs; j++) {
1248:         v = a->a + bs2*a->i[j] + k;
1249:         sum = 0.0;
1250:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1251:           for (k1=0; k1<bs; k1++){
1252:             sum += PetscAbsScalar(*v);
1253:             v   += bs;
1254:           }
1255:         }
1256:         if (sum > *norm) *norm = sum;
1257:       }
1258:     }
1259:   } else {
1260:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1261:   }
1262:   return(0);
1263: }


1266: #undef __FUNCT__  
1268: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1269: {
1270:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1271:   int         ierr;
1272:   PetscTruth  flag;

1275:   PetscTypeCompare((PetscObject)B,MATSEQBAIJ,&flag);
1276:   if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");

1278:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1279:   if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1280:     *flg = PETSC_FALSE;
1281:     return(0);
1282:   }
1283: 
1284:   /* if the a->i are the same */
1285:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1286:   if (*flg == PETSC_FALSE) {
1287:     return(0);
1288:   }
1289: 
1290:   /* if a->j are the same */
1291:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1292:   if (*flg == PETSC_FALSE) {
1293:     return(0);
1294:   }
1295:   /* if a->a are the same */
1296:   PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1297:   return(0);
1298: 
1299: }

1301: #undef __FUNCT__  
1303: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1304: {
1305:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
1306:   int          ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1307:   PetscScalar  *x,zero = 0.0;
1308:   MatScalar    *aa,*aa_j;

1311:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1312:   bs   = a->bs;
1313:   aa   = a->a;
1314:   ai   = a->i;
1315:   aj   = a->j;
1316:   ambs = a->mbs;
1317:   bs2  = a->bs2;

1319:   VecSet(&zero,v);
1320:   VecGetArray(v,&x);
1321:   VecGetLocalSize(v,&n);
1322:   if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1323:   for (i=0; i<ambs; i++) {
1324:     for (j=ai[i]; j<ai[i+1]; j++) {
1325:       if (aj[j] == i) {
1326:         row  = i*bs;
1327:         aa_j = aa+j*bs2;
1328:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1329:         break;
1330:       }
1331:     }
1332:   }
1333:   VecRestoreArray(v,&x);
1334:   return(0);
1335: }

1337: #undef __FUNCT__  
1339: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1340: {
1341:   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data;
1342:   PetscScalar  *l,*r,x,*li,*ri;
1343:   MatScalar    *aa,*v;
1344:   int          ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

1347:   ai  = a->i;
1348:   aj  = a->j;
1349:   aa  = a->a;
1350:   m   = A->m;
1351:   n   = A->n;
1352:   bs  = a->bs;
1353:   mbs = a->mbs;
1354:   bs2 = a->bs2;
1355:   if (ll) {
1356:     VecGetArray(ll,&l);
1357:     VecGetLocalSize(ll,&lm);
1358:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1359:     for (i=0; i<mbs; i++) { /* for each block row */
1360:       M  = ai[i+1] - ai[i];
1361:       li = l + i*bs;
1362:       v  = aa + bs2*ai[i];
1363:       for (j=0; j<M; j++) { /* for each block */
1364:         for (k=0; k<bs2; k++) {
1365:           (*v++) *= li[k%bs];
1366:         }
1367:       }
1368:     }
1369:     VecRestoreArray(ll,&l);
1370:     PetscLogFlops(a->nz);
1371:   }
1372: 
1373:   if (rr) {
1374:     VecGetArray(rr,&r);
1375:     VecGetLocalSize(rr,&rn);
1376:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1377:     for (i=0; i<mbs; i++) { /* for each block row */
1378:       M  = ai[i+1] - ai[i];
1379:       v  = aa + bs2*ai[i];
1380:       for (j=0; j<M; j++) { /* for each block */
1381:         ri = r + bs*aj[ai[i]+j];
1382:         for (k=0; k<bs; k++) {
1383:           x = ri[k];
1384:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1385:         }
1386:       }
1387:     }
1388:     VecRestoreArray(rr,&r);
1389:     PetscLogFlops(a->nz);
1390:   }
1391:   return(0);
1392: }


1395: #undef __FUNCT__  
1397: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1398: {
1399:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

1402:   info->rows_global    = (double)A->m;
1403:   info->columns_global = (double)A->n;
1404:   info->rows_local     = (double)A->m;
1405:   info->columns_local  = (double)A->n;
1406:   info->block_size     = a->bs2;
1407:   info->nz_allocated   = a->maxnz;
1408:   info->nz_used        = a->bs2*a->nz;
1409:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
1410:   info->assemblies   = A->num_ass;
1411:   info->mallocs      = a->reallocs;
1412:   info->memory       = A->mem;
1413:   if (A->factor) {
1414:     info->fill_ratio_given  = A->info.fill_ratio_given;
1415:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1416:     info->factor_mallocs    = A->info.factor_mallocs;
1417:   } else {
1418:     info->fill_ratio_given  = 0;
1419:     info->fill_ratio_needed = 0;
1420:     info->factor_mallocs    = 0;
1421:   }
1422:   return(0);
1423: }


1426: #undef __FUNCT__  
1428: int MatZeroEntries_SeqBAIJ(Mat A)
1429: {
1430:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1431:   int         ierr;

1434:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1435:   return(0);
1436: }