Actual source code: baij2.c
1: /*$Id: baij2.c,v 1.72 2001/03/23 23:22:07 balay Exp $*/
3: #include "petscsys.h"
4: #include "src/mat/impls/baij/seq/baij.h"
5: #include "src/vec/vecimpl.h"
6: #include "src/inline/spops.h"
7: #include "src/inline/ilu.h"
8: #include "petscbt.h"
10: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
13: int row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
14: int start,end,*ai,*aj,bs,*nidx2;
15: PetscBT table;
18: m = a->mbs;
19: ai = a->i;
20: aj = a->j;
21: bs = a->bs;
23: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25: PetscBTCreate(m,table);
26: PetscMalloc((m+1)*sizeof(int),&nidx);
27: PetscMalloc((A->m+1)*sizeof(int),&nidx2);
29: for (i=0; i<is_max; i++) {
30: /* Initialise the two local arrays */
31: isz = 0;
32: PetscBTMemzero(m,table);
33:
34: /* Extract the indices, assume there can be duplicate entries */
35: ISGetIndices(is[i],&idx);
36: ISGetLocalSize(is[i],&n);
38: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
39: for (j=0; j<n ; ++j){
40: ival = idx[j]/bs; /* convert the indices into block indices */
41: if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
43: }
44: ISRestoreIndices(is[i],&idx);
45: ISDestroy(is[i]);
46:
47: k = 0;
48: for (j=0; j<ov; j++){ /* for each overlap*/
49: n = isz;
50: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
51: row = nidx[k];
52: start = ai[row];
53: end = ai[row+1];
54: for (l = start; l<end ; l++){
55: val = aj[l];
56: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
57: }
58: }
59: }
60: /* expand the Index Set */
61: for (j=0; j<isz; j++) {
62: for (k=0; k<bs; k++)
63: nidx2[j*bs+k] = nidx[j]*bs+k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
66: }
67: PetscBTDestroy(table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return(0);
71: }
73: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
74: {
75: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
76: int *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
77: int row,mat_i,*mat_j,tcol,*mat_ilen;
78: int *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
79: int *aj = a->j,*ai = a->i;
80: MatScalar *mat_a;
81: Mat C;
82: PetscTruth flag;
85: ISSorted(iscol,(PetscTruth*)&i);
86: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
88: ISGetIndices(isrow,&irow);
89: ISGetIndices(iscol,&icol);
90: ISGetLocalSize(isrow,&nrows);
91: ISGetLocalSize(iscol,&ncols);
93: PetscMalloc((1+oldcols)*sizeof(int),&smap);
94: ssmap = smap;
95: PetscMalloc((1+nrows)*sizeof(int),&lens);
96: ierr = PetscMemzero(smap,oldcols*sizeof(int));
97: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98: /* determine lens of each row */
99: for (i=0; i<nrows; i++) {
100: kstart = ai[irow[i]];
101: kend = kstart + a->ilen[irow[i]];
102: lens[i] = 0;
103: for (k=kstart; k<kend; k++) {
104: if (ssmap[aj[k]]) {
105: lens[i]++;
106: }
107: }
108: }
109: /* Create and fill new matrix */
110: if (scall == MAT_REUSE_MATRIX) {
111: c = (Mat_SeqBAIJ *)((*B)->data);
113: if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
114: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
115: if (flag == PETSC_FALSE) {
116: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
117: }
118: PetscMemzero(c->ilen,c->mbs*sizeof(int));
119: C = *B;
120: } else {
121: MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);
122: }
123: c = (Mat_SeqBAIJ *)(C->data);
124: for (i=0; i<nrows; i++) {
125: row = irow[i];
126: kstart = ai[row];
127: kend = kstart + a->ilen[row];
128: mat_i = c->i[i];
129: mat_j = c->j + mat_i;
130: mat_a = c->a + mat_i*bs2;
131: mat_ilen = c->ilen + i;
132: for (k=kstart; k<kend; k++) {
133: if ((tcol=ssmap[a->j[k]])) {
134: *mat_j++ = tcol - 1;
135: ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
136: mat_a += bs2;
137: (*mat_ilen)++;
138: }
139: }
140: }
141:
142: /* Free work space */
143: ISRestoreIndices(iscol,&icol);
144: PetscFree(smap);
145: PetscFree(lens);
146: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
147: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
148:
149: ISRestoreIndices(isrow,&irow);
150: *B = C;
151: return(0);
152: }
154: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
155: {
156: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
157: IS is1,is2;
158: int *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
161: ISGetIndices(isrow,&irow);
162: ISGetIndices(iscol,&icol);
163: ISGetLocalSize(isrow,&nrows);
164: ISGetLocalSize(iscol,&ncols);
165:
166: /* Verify if the indices corespond to each element in a block
167: and form the IS with compressed IS */
168: PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
169: iary = vary + a->mbs;
170: PetscMemzero(vary,(a->mbs)*sizeof(int));
171: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
172: count = 0;
173: for (i=0; i<a->mbs; i++) {
174: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
175: if (vary[i]==bs) iary[count++] = i;
176: }
177: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
178:
179: PetscMemzero(vary,(a->mbs)*sizeof(int));
180: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
181: count = 0;
182: for (i=0; i<a->mbs; i++) {
183: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
184: if (vary[i]==bs) iary[count++] = i;
185: }
186: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
187: ISRestoreIndices(isrow,&irow);
188: ISRestoreIndices(iscol,&icol);
189: PetscFree(vary);
191: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
192: ISDestroy(is1);
193: ISDestroy(is2);
194: return(0);
195: }
197: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
198: {
199: int ierr,i;
202: if (scall == MAT_INITIAL_MATRIX) {
203: PetscMalloc((n+1)*sizeof(Mat),B);
204: }
206: for (i=0; i<n; i++) {
207: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
208: }
209: return(0);
210: }
213: /* -------------------------------------------------------*/
214: /* Should check that shapes of vectors and matrices match */
215: /* -------------------------------------------------------*/
216: #include "petscblaslapack.h"
218: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
219: {
220: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
221: Scalar *x,*z,sum;
222: MatScalar *v;
223: int mbs=a->mbs,i,*idx,*ii,n,ierr;
226: VecGetArray(xx,&x);
227: VecGetArray(zz,&z);
229: idx = a->j;
230: v = a->a;
231: ii = a->i;
233: for (i=0; i<mbs; i++) {
234: n = ii[1] - ii[0]; ii++;
235: sum = 0.0;
236: while (n--) sum += *v++ * x[*idx++];
237: z[i] = sum;
238: }
239: VecRestoreArray(xx,&x);
240: VecRestoreArray(zz,&z);
241: PetscLogFlops(2*a->nz - A->m);
242: return(0);
243: }
245: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
246: {
247: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
248: Scalar *x,*z,*xb,sum1,sum2;
249: Scalar x1,x2;
250: MatScalar *v;
251: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
254: VecGetArray(xx,&x);
255: VecGetArray(zz,&z);
257: idx = a->j;
258: v = a->a;
259: ii = a->i;
261: for (i=0; i<mbs; i++) {
262: n = ii[1] - ii[0]; ii++;
263: sum1 = 0.0; sum2 = 0.0;
264: for (j=0; j<n; j++) {
265: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
266: sum1 += v[0]*x1 + v[2]*x2;
267: sum2 += v[1]*x1 + v[3]*x2;
268: v += 4;
269: }
270: z[0] = sum1; z[1] = sum2;
271: z += 2;
272: }
273: VecRestoreArray(xx,&x);
274: VecRestoreArray(zz,&z);
275: PetscLogFlops(8*a->nz - A->m);
276: return(0);
277: }
279: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
280: {
281: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
282: Scalar *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
283: MatScalar *v;
284: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
286: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
287: #pragma disjoint(*v,*z,*xb)
288: #endif
291: VecGetArray(xx,&x);
292: VecGetArray(zz,&z);
294: idx = a->j;
295: v = a->a;
296: ii = a->i;
298: for (i=0; i<mbs; i++) {
299: n = ii[1] - ii[0]; ii++;
300: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
301: for (j=0; j<n; j++) {
302: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
303: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
304: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
305: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
306: v += 9;
307: }
308: z[0] = sum1; z[1] = sum2; z[2] = sum3;
309: z += 3;
310: }
311: VecRestoreArray(xx,&x);
312: VecRestoreArray(zz,&z);
313: PetscLogFlops(18*a->nz - A->m);
314: return(0);
315: }
317: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
318: {
319: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
320: Scalar *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
321: MatScalar *v;
322: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
325: VecGetArray(xx,&x);
326: VecGetArray(zz,&z);
328: idx = a->j;
329: v = a->a;
330: ii = a->i;
332: for (i=0; i<mbs; i++) {
333: n = ii[1] - ii[0]; ii++;
334: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
335: for (j=0; j<n; j++) {
336: xb = x + 4*(*idx++);
337: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
338: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
339: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
340: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
341: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
342: v += 16;
343: }
344: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
345: z += 4;
346: }
347: VecRestoreArray(xx,&x);
348: VecRestoreArray(zz,&z);
349: PetscLogFlops(32*a->nz - A->m);
350: return(0);
351: }
353: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
354: {
355: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
356: Scalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
357: MatScalar *v;
358: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
361: VecGetArray(xx,&x);
362: VecGetArray(zz,&z);
364: idx = a->j;
365: v = a->a;
366: ii = a->i;
368: for (i=0; i<mbs; i++) {
369: n = ii[1] - ii[0]; ii++;
370: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
371: for (j=0; j<n; j++) {
372: xb = x + 5*(*idx++);
373: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
374: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
375: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
376: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
377: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
378: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
379: v += 25;
380: }
381: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
382: z += 5;
383: }
384: VecRestoreArray(xx,&x);
385: VecRestoreArray(zz,&z);
386: PetscLogFlops(50*a->nz - A->m);
387: return(0);
388: }
391: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
392: {
393: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
394: Scalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
395: Scalar x1,x2,x3,x4,x5,x6;
396: MatScalar *v;
397: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
400: VecGetArray(xx,&x);
401: VecGetArray(zz,&z);
403: idx = a->j;
404: v = a->a;
405: ii = a->i;
407: for (i=0; i<mbs; i++) {
408: n = ii[1] - ii[0]; ii++;
409: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
410: for (j=0; j<n; j++) {
411: xb = x + 6*(*idx++);
412: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
413: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
414: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
415: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
416: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
417: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
418: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
419: v += 36;
420: }
421: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
422: z += 6;
423: }
425: VecRestoreArray(xx,&x);
426: VecRestoreArray(zz,&z);
427: PetscLogFlops(72*a->nz - A->m);
428: return(0);
429: }
430: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
431: {
432: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
433: Scalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
434: Scalar x1,x2,x3,x4,x5,x6,x7;
435: MatScalar *v;
436: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
439: VecGetArray(xx,&x);
440: VecGetArray(zz,&z);
442: idx = a->j;
443: v = a->a;
444: ii = a->i;
446: for (i=0; i<mbs; i++) {
447: n = ii[1] - ii[0]; ii++;
448: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
449: for (j=0; j<n; j++) {
450: xb = x + 7*(*idx++);
451: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
452: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
453: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
454: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
455: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
456: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
457: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
458: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
459: v += 49;
460: }
461: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
462: z += 7;
463: }
465: VecRestoreArray(xx,&x);
466: VecRestoreArray(zz,&z);
467: PetscLogFlops(98*a->nz - A->m);
468: return(0);
469: }
471: /*
472: This will not work with MatScalar == float because it calls the BLAS
473: */
474: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
475: {
476: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
477: Scalar *x,*z,*xb,*work,*workt;
478: MatScalar *v;
479: int ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
480: int ncols,k;
483: VecGetArray(xx,&x);
484: VecGetArray(zz,&z);
486: idx = a->j;
487: v = a->a;
488: ii = a->i;
491: if (!a->mult_work) {
492: k = PetscMax(A->m,A->n);
493: PetscMalloc((k+1)*sizeof(Scalar),&a->mult_work);
494: }
495: work = a->mult_work;
496: for (i=0; i<mbs; i++) {
497: n = ii[1] - ii[0]; ii++;
498: ncols = n*bs;
499: workt = work;
500: for (j=0; j<n; j++) {
501: xb = x + bs*(*idx++);
502: for (k=0; k<bs; k++) workt[k] = xb[k];
503: workt += bs;
504: }
505: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
506: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
507: v += n*bs2;
508: z += bs;
509: }
510: VecRestoreArray(xx,&x);
511: VecRestoreArray(zz,&z);
512: PetscLogFlops(2*a->nz*bs2 - A->m);
513: return(0);
514: }
516: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
517: {
518: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
519: Scalar *x,*y,*z,sum;
520: MatScalar *v;
521: int ierr,mbs=a->mbs,i,*idx,*ii,n;
524: VecGetArray(xx,&x);
525: VecGetArray(yy,&y);
526: if (zz != yy) {
527: VecGetArray(zz,&z);
528: } else {
529: z = y;
530: }
532: idx = a->j;
533: v = a->a;
534: ii = a->i;
536: for (i=0; i<mbs; i++) {
537: n = ii[1] - ii[0]; ii++;
538: sum = y[i];
539: while (n--) sum += *v++ * x[*idx++];
540: z[i] = sum;
541: }
542: VecRestoreArray(xx,&x);
543: VecRestoreArray(yy,&y);
544: if (zz != yy) {
545: VecRestoreArray(zz,&z);
546: }
547: PetscLogFlops(2*a->nz);
548: return(0);
549: }
551: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
552: {
553: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
554: Scalar *x,*y,*z,*xb,sum1,sum2;
555: Scalar x1,x2;
556: MatScalar *v;
557: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
560: VecGetArray(xx,&x);
561: VecGetArray(yy,&y);
562: if (zz != yy) {
563: VecGetArray(zz,&z);
564: } else {
565: z = y;
566: }
568: idx = a->j;
569: v = a->a;
570: ii = a->i;
572: for (i=0; i<mbs; i++) {
573: n = ii[1] - ii[0]; ii++;
574: sum1 = y[0]; sum2 = y[1];
575: for (j=0; j<n; j++) {
576: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
577: sum1 += v[0]*x1 + v[2]*x2;
578: sum2 += v[1]*x1 + v[3]*x2;
579: v += 4;
580: }
581: z[0] = sum1; z[1] = sum2;
582: z += 2; y += 2;
583: }
584: VecRestoreArray(xx,&x);
585: VecRestoreArray(yy,&y);
586: if (zz != yy) {
587: VecRestoreArray(zz,&z);
588: }
589: PetscLogFlops(4*a->nz);
590: return(0);
591: }
593: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
594: {
595: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
596: Scalar *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
597: MatScalar *v;
598: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
601: VecGetArray(xx,&x);
602: VecGetArray(yy,&y);
603: if (zz != yy) {
604: VecGetArray(zz,&z);
605: } else {
606: z = y;
607: }
609: idx = a->j;
610: v = a->a;
611: ii = a->i;
613: for (i=0; i<mbs; i++) {
614: n = ii[1] - ii[0]; ii++;
615: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
616: for (j=0; j<n; j++) {
617: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
618: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
619: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
620: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
621: v += 9;
622: }
623: z[0] = sum1; z[1] = sum2; z[2] = sum3;
624: z += 3; y += 3;
625: }
626: VecRestoreArray(xx,&x);
627: VecRestoreArray(yy,&y);
628: if (zz != yy) {
629: VecRestoreArray(zz,&z);
630: }
631: PetscLogFlops(18*a->nz);
632: return(0);
633: }
635: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
636: {
637: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
638: Scalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
639: MatScalar *v;
640: int ierr,mbs=a->mbs,i,*idx,*ii;
641: int j,n;
644: VecGetArray(xx,&x);
645: VecGetArray(yy,&y);
646: if (zz != yy) {
647: VecGetArray(zz,&z);
648: } else {
649: z = y;
650: }
652: idx = a->j;
653: v = a->a;
654: ii = a->i;
656: for (i=0; i<mbs; i++) {
657: n = ii[1] - ii[0]; ii++;
658: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
659: for (j=0; j<n; j++) {
660: xb = x + 4*(*idx++);
661: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
662: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
663: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
664: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
665: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
666: v += 16;
667: }
668: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
669: z += 4; y += 4;
670: }
671: VecRestoreArray(xx,&x);
672: VecRestoreArray(yy,&y);
673: if (zz != yy) {
674: VecRestoreArray(zz,&z);
675: }
676: PetscLogFlops(32*a->nz);
677: return(0);
678: }
680: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
681: {
682: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
683: Scalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
684: MatScalar *v;
685: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
688: VecGetArray(xx,&x);
689: VecGetArray(yy,&y);
690: if (zz != yy) {
691: VecGetArray(zz,&z);
692: } else {
693: z = y;
694: }
696: idx = a->j;
697: v = a->a;
698: ii = a->i;
700: for (i=0; i<mbs; i++) {
701: n = ii[1] - ii[0]; ii++;
702: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
703: for (j=0; j<n; j++) {
704: xb = x + 5*(*idx++);
705: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
706: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
707: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
708: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
709: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
710: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
711: v += 25;
712: }
713: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
714: z += 5; y += 5;
715: }
716: VecRestoreArray(xx,&x);
717: VecRestoreArray(yy,&y);
718: if (zz != yy) {
719: VecRestoreArray(zz,&z);
720: }
721: PetscLogFlops(50*a->nz);
722: return(0);
723: }
724: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
725: {
726: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
727: Scalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
728: Scalar x1,x2,x3,x4,x5,x6;
729: MatScalar *v;
730: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
733: VecGetArray(xx,&x);
734: VecGetArray(yy,&y);
735: if (zz != yy) {
736: VecGetArray(zz,&z);
737: } else {
738: z = y;
739: }
741: idx = a->j;
742: v = a->a;
743: ii = a->i;
745: for (i=0; i<mbs; i++) {
746: n = ii[1] - ii[0]; ii++;
747: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
748: for (j=0; j<n; j++) {
749: xb = x + 6*(*idx++);
750: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
751: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
752: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
753: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
754: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
755: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
756: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
757: v += 36;
758: }
759: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
760: z += 6; y += 6;
761: }
762: VecRestoreArray(xx,&x);
763: VecRestoreArray(yy,&y);
764: if (zz != yy) {
765: VecRestoreArray(zz,&z);
766: }
767: PetscLogFlops(72*a->nz);
768: return(0);
769: }
771: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
772: {
773: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
774: Scalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
775: Scalar x1,x2,x3,x4,x5,x6,x7;
776: MatScalar *v;
777: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
780: VecGetArray(xx,&x);
781: VecGetArray(yy,&y);
782: if (zz != yy) {
783: VecGetArray(zz,&z);
784: } else {
785: z = y;
786: }
788: idx = a->j;
789: v = a->a;
790: ii = a->i;
792: for (i=0; i<mbs; i++) {
793: n = ii[1] - ii[0]; ii++;
794: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
795: for (j=0; j<n; j++) {
796: xb = x + 7*(*idx++);
797: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
798: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
799: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
800: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
801: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
802: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
803: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
804: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
805: v += 49;
806: }
807: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
808: z += 7; y += 7;
809: }
810: VecRestoreArray(xx,&x);
811: VecRestoreArray(yy,&y);
812: if (zz != yy) {
813: VecRestoreArray(zz,&z);
814: }
815: PetscLogFlops(98*a->nz);
816: return(0);
817: }
819: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
820: {
821: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
822: Scalar *x,*z,*xb,*work,*workt;
823: MatScalar *v;
824: int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
825: int ncols,k;
828: if (xx != yy) { VecCopy(yy,zz); }
830: VecGetArray(xx,&x);
831: VecGetArray(zz,&z);
832:
833: idx = a->j;
834: v = a->a;
835: ii = a->i;
838: if (!a->mult_work) {
839: k = PetscMax(A->m,A->n);
840: PetscMalloc((k+1)*sizeof(Scalar),&a->mult_work);
841: }
842: work = a->mult_work;
843: for (i=0; i<mbs; i++) {
844: n = ii[1] - ii[0]; ii++;
845: ncols = n*bs;
846: workt = work;
847: for (j=0; j<n; j++) {
848: xb = x + bs*(*idx++);
849: for (k=0; k<bs; k++) workt[k] = xb[k];
850: workt += bs;
851: }
852: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
853: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
854: v += n*bs2;
855: z += bs;
856: }
857: VecRestoreArray(xx,&x);
858: VecRestoreArray(zz,&z);
859: PetscLogFlops(2*a->nz*bs2);
860: return(0);
861: }
863: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
864: {
865: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
866: Scalar *xg,*zg,*zb,zero = 0.0;
867: Scalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
868: MatScalar *v;
869: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
870: int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
873: VecSet(&zero,zz);
874: VecGetArray(xx,&xg); x = xg;
875: VecGetArray(zz,&zg); z = zg;
877: idx = a->j;
878: v = a->a;
879: ii = a->i;
880: xb = x;
881: switch (bs) {
882: case 1:
883: for (i=0; i<mbs; i++) {
884: n = ii[1] - ii[0]; ii++;
885: x1 = xb[0];
886: ib = idx + ai[i];
887: for (j=0; j<n; j++) {
888: rval = ib[j];
889: z[rval] += *v * x1;
890: v++;
891: }
892: xb++;
893: }
894: break;
895: case 2:
896: for (i=0; i<mbs; i++) {
897: n = ii[1] - ii[0]; ii++;
898: x1 = xb[0]; x2 = xb[1];
899: ib = idx + ai[i];
900: for (j=0; j<n; j++) {
901: rval = ib[j]*2;
902: z[rval++] += v[0]*x1 + v[1]*x2;
903: z[rval] += v[2]*x1 + v[3]*x2;
904: v += 4;
905: }
906: xb += 2;
907: }
908: break;
909: case 3:
910: for (i=0; i<mbs; i++) {
911: n = ii[1] - ii[0]; ii++;
912: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
913: ib = idx + ai[i];
914: for (j=0; j<n; j++) {
915: rval = ib[j]*3;
916: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
917: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
918: z[rval] += v[6]*x1 + v[7]*x2 + v[8]*x3;
919: v += 9;
920: }
921: xb += 3;
922: }
923: break;
924: case 4:
925: for (i=0; i<mbs; i++) {
926: n = ii[1] - ii[0]; ii++;
927: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
928: ib = idx + ai[i];
929: for (j=0; j<n; j++) {
930: rval = ib[j]*4;
931: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
932: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
933: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
934: z[rval] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
935: v += 16;
936: }
937: xb += 4;
938: }
939: break;
940: case 5:
941: for (i=0; i<mbs; i++) {
942: n = ii[1] - ii[0]; ii++;
943: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
944: x4 = xb[3]; x5 = xb[4];
945: ib = idx + ai[i];
946: for (j=0; j<n; j++) {
947: rval = ib[j]*5;
948: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
949: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
950: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
951: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
952: z[rval] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
953: v += 25;
954: }
955: xb += 5;
956: }
957: break;
958: case 6:
959: for (i=0; i<mbs; i++) {
960: n = ii[1] - ii[0]; ii++;
961: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
962: x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
963: ib = idx + ai[i];
964: for (j=0; j<n; j++) {
965: rval = ib[j]*6;
966: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
967: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
968: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
969: z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
970: z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
971: z[rval] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
972: v += 36;
973: }
974: xb += 6;
975: }
976: break;
977: case 7:
978: for (i=0; i<mbs; i++) {
979: n = ii[1] - ii[0]; ii++;
980: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
981: x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
982: ib = idx + ai[i];
983: for (j=0; j<n; j++) {
984: rval = ib[j]*7;
985: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
986: z[rval++] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
987: z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
988: z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
989: z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
990: z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
991: z[rval] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
992: v += 49;
993: }
994: xb += 7;
995: }
996: break;
997: default: { /* block sizes larger then 7 by 7 are handled by BLAS */
998: int ncols,k;
999: Scalar *work,*workt;
1001: if (!a->mult_work) {
1002: k = PetscMax(A->m,A->n);
1003: PetscMalloc((k+1)*sizeof(Scalar),&a->mult_work);
1004: }
1005: work = a->mult_work;
1006: for (i=0; i<mbs; i++) {
1007: n = ii[1] - ii[0]; ii++;
1008: ncols = n*bs;
1009: ierr = PetscMemzero(work,ncols*sizeof(Scalar));
1010: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1011: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1012: v += n*bs2;
1013: x += bs;
1014: workt = work;
1015: for (j=0; j<n; j++) {
1016: zb = z + bs*(*idx++);
1017: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1018: workt += bs;
1019: }
1020: }
1021: }
1022: }
1023: VecRestoreArray(xx,&xg);
1024: VecRestoreArray(zz,&zg);
1025: PetscLogFlops(2*a->nz*a->bs2 - A->n);
1026: return(0);
1027: }
1029: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1031: {
1032: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1033: Scalar *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1034: MatScalar *v;
1035: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1038: VecGetArray(xx,&xg); x = xg;
1039: VecGetArray(zz,&zg); z = zg;
1041: if (yy != zz) { VecCopy(yy,zz); }
1043: idx = a->j;
1044: v = a->a;
1045: ii = a->i;
1046: xb = x;
1048: switch (bs) {
1049: case 1:
1050: for (i=0; i<mbs; i++) {
1051: n = ii[1] - ii[0]; ii++;
1052: x1 = xb[0];
1053: ib = idx + ai[i];
1054: for (j=0; j<n; j++) {
1055: rval = ib[j];
1056: z[rval] += *v * x1;
1057: v++;
1058: }
1059: xb++;
1060: }
1061: break;
1062: case 2:
1063: for (i=0; i<mbs; i++) {
1064: n = ii[1] - ii[0]; ii++;
1065: x1 = xb[0]; x2 = xb[1];
1066: ib = idx + ai[i];
1067: for (j=0; j<n; j++) {
1068: rval = ib[j]*2;
1069: z[rval++] += v[0]*x1 + v[1]*x2;
1070: z[rval++] += v[2]*x1 + v[3]*x2;
1071: v += 4;
1072: }
1073: xb += 2;
1074: }
1075: break;
1076: case 3:
1077: for (i=0; i<mbs; i++) {
1078: n = ii[1] - ii[0]; ii++;
1079: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1080: ib = idx + ai[i];
1081: for (j=0; j<n; j++) {
1082: rval = ib[j]*3;
1083: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1084: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1085: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1086: v += 9;
1087: }
1088: xb += 3;
1089: }
1090: break;
1091: case 4:
1092: for (i=0; i<mbs; i++) {
1093: n = ii[1] - ii[0]; ii++;
1094: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1095: ib = idx + ai[i];
1096: for (j=0; j<n; j++) {
1097: rval = ib[j]*4;
1098: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1099: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1100: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1101: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1102: v += 16;
1103: }
1104: xb += 4;
1105: }
1106: break;
1107: case 5:
1108: for (i=0; i<mbs; i++) {
1109: n = ii[1] - ii[0]; ii++;
1110: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1111: x4 = xb[3]; x5 = xb[4];
1112: ib = idx + ai[i];
1113: for (j=0; j<n; j++) {
1114: rval = ib[j]*5;
1115: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1116: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1117: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1118: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1119: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1120: v += 25;
1121: }
1122: xb += 5;
1123: }
1124: break;
1125: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1126: int ncols,k;
1127: Scalar *work,*workt;
1129: if (!a->mult_work) {
1130: k = PetscMax(A->m,A->n);
1131: PetscMalloc((k+1)*sizeof(Scalar),&a->mult_work);
1132: }
1133: work = a->mult_work;
1134: for (i=0; i<mbs; i++) {
1135: n = ii[1] - ii[0]; ii++;
1136: ncols = n*bs;
1137: ierr = PetscMemzero(work,ncols*sizeof(Scalar));
1138: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1139: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1140: v += n*bs2;
1141: x += bs;
1142: workt = work;
1143: for (j=0; j<n; j++) {
1144: zb = z + bs*(*idx++);
1145: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1146: workt += bs;
1147: }
1148: }
1149: }
1150: }
1151: VecRestoreArray(xx,&xg);
1152: VecRestoreArray(zz,&zg);
1153: PetscLogFlops(2*a->nz*a->bs2);
1154: return(0);
1155: }
1157: int MatScale_SeqBAIJ(Scalar *alpha,Mat inA)
1158: {
1159: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1160: int totalnz = a->bs2*a->nz;
1161: #if defined(PETSC_USE_MAT_SINGLE)
1162: int i;
1163: #else
1164: int one = 1;
1165: #endif
1168: #if defined(PETSC_USE_MAT_SINGLE)
1169: for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1170: #else
1171: BLscal_(&totalnz,alpha,a->a,&one);
1172: #endif
1173: PetscLogFlops(totalnz);
1174: return(0);
1175: }
1177: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1178: {
1179: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1180: MatScalar *v = a->a;
1181: PetscReal sum = 0.0;
1182: int i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;
1185: if (type == NORM_FROBENIUS) {
1186: for (i=0; i< bs2*nz; i++) {
1187: #if defined(PETSC_USE_COMPLEX)
1188: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1189: #else
1190: sum += (*v)*(*v); v++;
1191: #endif
1192: }
1193: *norm = sqrt(sum);
1194: } else if (type == NORM_INFINITY) { /* maximum row sum */
1195: *norm = 0.0;
1196: for (k=0; k<bs; k++) {
1197: for (j=0; j<a->mbs; j++) {
1198: v = a->a + bs2*a->i[j] + k;
1199: sum = 0.0;
1200: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1201: for (k1=0; k1<bs; k1++){
1202: sum += PetscAbsScalar(*v);
1203: v += bs;
1204: }
1205: }
1206: if (sum > *norm) *norm = sum;
1207: }
1208: }
1209: } else {
1210: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1211: }
1212: return(0);
1213: }
1216: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1217: {
1218: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1219: int ierr;
1220: PetscTruth flag;
1223: PetscTypeCompare((PetscObject)B,MATSEQBAIJ,&flag);
1224: if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1226: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1227: if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1228: *flg = PETSC_FALSE;
1229: return(0);
1230: }
1231:
1232: /* if the a->i are the same */
1233: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1234: if (*flg == PETSC_FALSE) {
1235: return(0);
1236: }
1237:
1238: /* if a->j are the same */
1239: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1240: if (*flg == PETSC_FALSE) {
1241: return(0);
1242: }
1243: /* if a->a are the same */
1244: PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);
1245: return(0);
1246:
1247: }
1249: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1250: {
1251: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1252: int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1253: Scalar *x,zero = 0.0;
1254: MatScalar *aa,*aa_j;
1257: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1258: bs = a->bs;
1259: aa = a->a;
1260: ai = a->i;
1261: aj = a->j;
1262: ambs = a->mbs;
1263: bs2 = a->bs2;
1265: VecSet(&zero,v);
1266: VecGetArray(v,&x);
1267: VecGetLocalSize(v,&n);
1268: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1269: for (i=0; i<ambs; i++) {
1270: for (j=ai[i]; j<ai[i+1]; j++) {
1271: if (aj[j] == i) {
1272: row = i*bs;
1273: aa_j = aa+j*bs2;
1274: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1275: break;
1276: }
1277: }
1278: }
1279: VecRestoreArray(v,&x);
1280: return(0);
1281: }
1283: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1284: {
1285: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1286: Scalar *l,*r,x,*li,*ri;
1287: MatScalar *aa,*v;
1288: int ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1291: ai = a->i;
1292: aj = a->j;
1293: aa = a->a;
1294: m = A->m;
1295: n = A->n;
1296: bs = a->bs;
1297: mbs = a->mbs;
1298: bs2 = a->bs2;
1299: if (ll) {
1300: VecGetArray(ll,&l);
1301: VecGetLocalSize(ll,&lm);
1302: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1303: for (i=0; i<mbs; i++) { /* for each block row */
1304: M = ai[i+1] - ai[i];
1305: li = l + i*bs;
1306: v = aa + bs2*ai[i];
1307: for (j=0; j<M; j++) { /* for each block */
1308: for (k=0; k<bs2; k++) {
1309: (*v++) *= li[k%bs];
1310: }
1311: }
1312: }
1313: VecRestoreArray(ll,&l);
1314: PetscLogFlops(a->nz);
1315: }
1316:
1317: if (rr) {
1318: VecGetArray(rr,&r);
1319: VecGetLocalSize(rr,&rn);
1320: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1321: for (i=0; i<mbs; i++) { /* for each block row */
1322: M = ai[i+1] - ai[i];
1323: v = aa + bs2*ai[i];
1324: for (j=0; j<M; j++) { /* for each block */
1325: ri = r + bs*aj[ai[i]+j];
1326: for (k=0; k<bs; k++) {
1327: x = ri[k];
1328: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1329: }
1330: }
1331: }
1332: VecRestoreArray(rr,&r);
1333: PetscLogFlops(a->nz);
1334: }
1335: return(0);
1336: }
1339: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1340: {
1341: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1344: info->rows_global = (double)A->m;
1345: info->columns_global = (double)A->n;
1346: info->rows_local = (double)A->m;
1347: info->columns_local = (double)A->n;
1348: info->block_size = a->bs2;
1349: info->nz_allocated = a->maxnz;
1350: info->nz_used = a->bs2*a->nz;
1351: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1352: info->assemblies = A->num_ass;
1353: info->mallocs = a->reallocs;
1354: info->memory = A->mem;
1355: if (A->factor) {
1356: info->fill_ratio_given = A->info.fill_ratio_given;
1357: info->fill_ratio_needed = A->info.fill_ratio_needed;
1358: info->factor_mallocs = A->info.factor_mallocs;
1359: } else {
1360: info->fill_ratio_given = 0;
1361: info->fill_ratio_needed = 0;
1362: info->factor_mallocs = 0;
1363: }
1364: return(0);
1365: }
1368: int MatZeroEntries_SeqBAIJ(Mat A)
1369: {
1370: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1371: int ierr;
1374: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1375: return(0);
1376: }