Actual source code: baij2.c
1: /*$Id: baij2.c,v 1.75 2001/09/07 20:09:49 bsmith Exp $*/
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/vec/vecimpl.h
5: #include src/inline/spops.h
6: #include src/inline/ilu.h
7: #include petscbt.h
9: #undef __FUNCT__
11: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
14: int row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
15: int start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = a->bs;
24: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,table);
27: PetscMalloc((m+1)*sizeof(int),&nidx);
28: PetscMalloc((A->m+1)*sizeof(int),&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
34:
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n ; ++j){
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(is[i]);
47:
48: k = 0;
49: for (j=0; j<ov; j++){ /* for each overlap*/
50: n = isz;
51: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end ; l++){
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++)
64: nidx2[j*bs+k] = nidx[j]*bs+k;
65: }
66: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
67: }
68: PetscBTDestroy(table);
69: PetscFree(nidx);
70: PetscFree(nidx2);
71: return(0);
72: }
74: #undef __FUNCT__
76: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: int *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
80: int row,mat_i,*mat_j,tcol,*mat_ilen;
81: int *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
82: int *aj = a->j,*ai = a->i;
83: MatScalar *mat_a;
84: Mat C;
85: PetscTruth flag;
88: ISSorted(iscol,(PetscTruth*)&i);
89: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
91: ISGetIndices(isrow,&irow);
92: ISGetIndices(iscol,&icol);
93: ISGetLocalSize(isrow,&nrows);
94: ISGetLocalSize(iscol,&ncols);
96: PetscMalloc((1+oldcols)*sizeof(int),&smap);
97: ssmap = smap;
98: PetscMalloc((1+nrows)*sizeof(int),&lens);
99: ierr = PetscMemzero(smap,oldcols*sizeof(int));
100: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
101: /* determine lens of each row */
102: for (i=0; i<nrows; i++) {
103: kstart = ai[irow[i]];
104: kend = kstart + a->ilen[irow[i]];
105: lens[i] = 0;
106: for (k=kstart; k<kend; k++) {
107: if (ssmap[aj[k]]) {
108: lens[i]++;
109: }
110: }
111: }
112: /* Create and fill new matrix */
113: if (scall == MAT_REUSE_MATRIX) {
114: c = (Mat_SeqBAIJ *)((*B)->data);
116: if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
117: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
118: if (flag == PETSC_FALSE) {
119: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120: }
121: PetscMemzero(c->ilen,c->mbs*sizeof(int));
122: C = *B;
123: } else {
124: MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);
125: }
126: c = (Mat_SeqBAIJ *)(C->data);
127: for (i=0; i<nrows; i++) {
128: row = irow[i];
129: kstart = ai[row];
130: kend = kstart + a->ilen[row];
131: mat_i = c->i[i];
132: mat_j = c->j + mat_i;
133: mat_a = c->a + mat_i*bs2;
134: mat_ilen = c->ilen + i;
135: for (k=kstart; k<kend; k++) {
136: if ((tcol=ssmap[a->j[k]])) {
137: *mat_j++ = tcol - 1;
138: ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
139: mat_a += bs2;
140: (*mat_ilen)++;
141: }
142: }
143: }
144:
145: /* Free work space */
146: ISRestoreIndices(iscol,&icol);
147: PetscFree(smap);
148: PetscFree(lens);
149: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
150: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
151:
152: ISRestoreIndices(isrow,&irow);
153: *B = C;
154: return(0);
155: }
157: #undef __FUNCT__
159: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
160: {
161: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
162: IS is1,is2;
163: int *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
166: ISGetIndices(isrow,&irow);
167: ISGetIndices(iscol,&icol);
168: ISGetLocalSize(isrow,&nrows);
169: ISGetLocalSize(iscol,&ncols);
170:
171: /* Verify if the indices corespond to each element in a block
172: and form the IS with compressed IS */
173: PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
174: iary = vary + a->mbs;
175: PetscMemzero(vary,(a->mbs)*sizeof(int));
176: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
177: count = 0;
178: for (i=0; i<a->mbs; i++) {
179: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
180: if (vary[i]==bs) iary[count++] = i;
181: }
182: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
183:
184: PetscMemzero(vary,(a->mbs)*sizeof(int));
185: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
186: count = 0;
187: for (i=0; i<a->mbs; i++) {
188: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
189: if (vary[i]==bs) iary[count++] = i;
190: }
191: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
192: ISRestoreIndices(isrow,&irow);
193: ISRestoreIndices(iscol,&icol);
194: PetscFree(vary);
196: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
197: ISDestroy(is1);
198: ISDestroy(is2);
199: return(0);
200: }
202: #undef __FUNCT__
204: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
205: {
206: int ierr,i;
209: if (scall == MAT_INITIAL_MATRIX) {
210: PetscMalloc((n+1)*sizeof(Mat),B);
211: }
213: for (i=0; i<n; i++) {
214: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
215: }
216: return(0);
217: }
220: /* -------------------------------------------------------*/
221: /* Should check that shapes of vectors and matrices match */
222: /* -------------------------------------------------------*/
223: #include petscblaslapack.h
225: #undef __FUNCT__
227: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
228: {
229: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
230: PetscScalar *x,*z,sum;
231: MatScalar *v;
232: int mbs=a->mbs,i,*idx,*ii,n,ierr;
235: VecGetArray(xx,&x);
236: VecGetArray(zz,&z);
238: idx = a->j;
239: v = a->a;
240: ii = a->i;
242: for (i=0; i<mbs; i++) {
243: n = ii[1] - ii[0]; ii++;
244: sum = 0.0;
245: while (n--) sum += *v++ * x[*idx++];
246: z[i] = sum;
247: }
248: VecRestoreArray(xx,&x);
249: VecRestoreArray(zz,&z);
250: PetscLogFlops(2*a->nz - A->m);
251: return(0);
252: }
254: #undef __FUNCT__
256: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
257: {
258: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
259: PetscScalar *x,*z,*xb,sum1,sum2;
260: PetscScalar x1,x2;
261: MatScalar *v;
262: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
265: VecGetArray(xx,&x);
266: VecGetArray(zz,&z);
268: idx = a->j;
269: v = a->a;
270: ii = a->i;
272: for (i=0; i<mbs; i++) {
273: n = ii[1] - ii[0]; ii++;
274: sum1 = 0.0; sum2 = 0.0;
275: for (j=0; j<n; j++) {
276: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
277: sum1 += v[0]*x1 + v[2]*x2;
278: sum2 += v[1]*x1 + v[3]*x2;
279: v += 4;
280: }
281: z[0] = sum1; z[1] = sum2;
282: z += 2;
283: }
284: VecRestoreArray(xx,&x);
285: VecRestoreArray(zz,&z);
286: PetscLogFlops(8*a->nz - A->m);
287: return(0);
288: }
290: #undef __FUNCT__
292: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
293: {
294: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
295: PetscScalar *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
296: MatScalar *v;
297: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
299: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
300: #pragma disjoint(*v,*z,*xb)
301: #endif
304: VecGetArray(xx,&x);
305: VecGetArray(zz,&z);
307: idx = a->j;
308: v = a->a;
309: ii = a->i;
311: for (i=0; i<mbs; i++) {
312: n = ii[1] - ii[0]; ii++;
313: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
314: for (j=0; j<n; j++) {
315: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
316: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
317: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
318: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
319: v += 9;
320: }
321: z[0] = sum1; z[1] = sum2; z[2] = sum3;
322: z += 3;
323: }
324: VecRestoreArray(xx,&x);
325: VecRestoreArray(zz,&z);
326: PetscLogFlops(18*a->nz - A->m);
327: return(0);
328: }
330: #undef __FUNCT__
332: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
333: {
334: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
335: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
336: MatScalar *v;
337: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
340: VecGetArray(xx,&x);
341: VecGetArray(zz,&z);
343: idx = a->j;
344: v = a->a;
345: ii = a->i;
347: for (i=0; i<mbs; i++) {
348: n = ii[1] - ii[0]; ii++;
349: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
350: for (j=0; j<n; j++) {
351: xb = x + 4*(*idx++);
352: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
353: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
354: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
355: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
356: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
357: v += 16;
358: }
359: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
360: z += 4;
361: }
362: VecRestoreArray(xx,&x);
363: VecRestoreArray(zz,&z);
364: PetscLogFlops(32*a->nz - A->m);
365: return(0);
366: }
368: #undef __FUNCT__
370: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
371: {
372: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
373: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
374: MatScalar *v;
375: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
378: VecGetArray(xx,&x);
379: VecGetArray(zz,&z);
381: idx = a->j;
382: v = a->a;
383: ii = a->i;
385: for (i=0; i<mbs; i++) {
386: n = ii[1] - ii[0]; ii++;
387: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
388: for (j=0; j<n; j++) {
389: xb = x + 5*(*idx++);
390: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
391: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
392: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
393: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
394: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
395: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
396: v += 25;
397: }
398: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
399: z += 5;
400: }
401: VecRestoreArray(xx,&x);
402: VecRestoreArray(zz,&z);
403: PetscLogFlops(50*a->nz - A->m);
404: return(0);
405: }
408: #undef __FUNCT__
410: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
411: {
412: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
413: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
414: PetscScalar x1,x2,x3,x4,x5,x6;
415: MatScalar *v;
416: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
419: VecGetArray(xx,&x);
420: VecGetArray(zz,&z);
422: idx = a->j;
423: v = a->a;
424: ii = a->i;
426: for (i=0; i<mbs; i++) {
427: n = ii[1] - ii[0]; ii++;
428: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
429: for (j=0; j<n; j++) {
430: xb = x + 6*(*idx++);
431: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
432: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
433: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
434: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
435: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
436: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
437: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
438: v += 36;
439: }
440: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
441: z += 6;
442: }
444: VecRestoreArray(xx,&x);
445: VecRestoreArray(zz,&z);
446: PetscLogFlops(72*a->nz - A->m);
447: return(0);
448: }
449: #undef __FUNCT__
451: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
452: {
453: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
454: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
455: PetscScalar x1,x2,x3,x4,x5,x6,x7;
456: MatScalar *v;
457: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
460: VecGetArray(xx,&x);
461: VecGetArray(zz,&z);
463: idx = a->j;
464: v = a->a;
465: ii = a->i;
467: for (i=0; i<mbs; i++) {
468: n = ii[1] - ii[0]; ii++;
469: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
470: for (j=0; j<n; j++) {
471: xb = x + 7*(*idx++);
472: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
473: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
474: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
475: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
476: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
477: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
478: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
479: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
480: v += 49;
481: }
482: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
483: z += 7;
484: }
486: VecRestoreArray(xx,&x);
487: VecRestoreArray(zz,&z);
488: PetscLogFlops(98*a->nz - A->m);
489: return(0);
490: }
492: /*
493: This will not work with MatScalar == float because it calls the BLAS
494: */
495: #undef __FUNCT__
497: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
498: {
499: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
500: PetscScalar *x,*z,*xb,*work,*workt;
501: MatScalar *v;
502: int ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
503: int ncols,k;
506: VecGetArray(xx,&x);
507: VecGetArray(zz,&z);
509: idx = a->j;
510: v = a->a;
511: ii = a->i;
514: if (!a->mult_work) {
515: k = PetscMax(A->m,A->n);
516: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
517: }
518: work = a->mult_work;
519: for (i=0; i<mbs; i++) {
520: n = ii[1] - ii[0]; ii++;
521: ncols = n*bs;
522: workt = work;
523: for (j=0; j<n; j++) {
524: xb = x + bs*(*idx++);
525: for (k=0; k<bs; k++) workt[k] = xb[k];
526: workt += bs;
527: }
528: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
529: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
530: v += n*bs2;
531: z += bs;
532: }
533: VecRestoreArray(xx,&x);
534: VecRestoreArray(zz,&z);
535: PetscLogFlops(2*a->nz*bs2 - A->m);
536: return(0);
537: }
539: #undef __FUNCT__
541: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
542: {
543: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
544: PetscScalar *x,*y,*z,sum;
545: MatScalar *v;
546: int ierr,mbs=a->mbs,i,*idx,*ii,n;
549: VecGetArray(xx,&x);
550: VecGetArray(yy,&y);
551: if (zz != yy) {
552: VecGetArray(zz,&z);
553: } else {
554: z = y;
555: }
557: idx = a->j;
558: v = a->a;
559: ii = a->i;
561: for (i=0; i<mbs; i++) {
562: n = ii[1] - ii[0]; ii++;
563: sum = y[i];
564: while (n--) sum += *v++ * x[*idx++];
565: z[i] = sum;
566: }
567: VecRestoreArray(xx,&x);
568: VecRestoreArray(yy,&y);
569: if (zz != yy) {
570: VecRestoreArray(zz,&z);
571: }
572: PetscLogFlops(2*a->nz);
573: return(0);
574: }
576: #undef __FUNCT__
578: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
579: {
580: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
581: PetscScalar *x,*y,*z,*xb,sum1,sum2;
582: PetscScalar x1,x2;
583: MatScalar *v;
584: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
587: VecGetArray(xx,&x);
588: VecGetArray(yy,&y);
589: if (zz != yy) {
590: VecGetArray(zz,&z);
591: } else {
592: z = y;
593: }
595: idx = a->j;
596: v = a->a;
597: ii = a->i;
599: for (i=0; i<mbs; i++) {
600: n = ii[1] - ii[0]; ii++;
601: sum1 = y[0]; sum2 = y[1];
602: for (j=0; j<n; j++) {
603: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
604: sum1 += v[0]*x1 + v[2]*x2;
605: sum2 += v[1]*x1 + v[3]*x2;
606: v += 4;
607: }
608: z[0] = sum1; z[1] = sum2;
609: z += 2; y += 2;
610: }
611: VecRestoreArray(xx,&x);
612: VecRestoreArray(yy,&y);
613: if (zz != yy) {
614: VecRestoreArray(zz,&z);
615: }
616: PetscLogFlops(4*a->nz);
617: return(0);
618: }
620: #undef __FUNCT__
622: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
623: {
624: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
625: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
626: MatScalar *v;
627: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
630: VecGetArray(xx,&x);
631: VecGetArray(yy,&y);
632: if (zz != yy) {
633: VecGetArray(zz,&z);
634: } else {
635: z = y;
636: }
638: idx = a->j;
639: v = a->a;
640: ii = a->i;
642: for (i=0; i<mbs; i++) {
643: n = ii[1] - ii[0]; ii++;
644: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
645: for (j=0; j<n; j++) {
646: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
647: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
648: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
649: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
650: v += 9;
651: }
652: z[0] = sum1; z[1] = sum2; z[2] = sum3;
653: z += 3; y += 3;
654: }
655: VecRestoreArray(xx,&x);
656: VecRestoreArray(yy,&y);
657: if (zz != yy) {
658: VecRestoreArray(zz,&z);
659: }
660: PetscLogFlops(18*a->nz);
661: return(0);
662: }
664: #undef __FUNCT__
666: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
667: {
668: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
669: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
670: MatScalar *v;
671: int ierr,mbs=a->mbs,i,*idx,*ii;
672: int j,n;
675: VecGetArray(xx,&x);
676: VecGetArray(yy,&y);
677: if (zz != yy) {
678: VecGetArray(zz,&z);
679: } else {
680: z = y;
681: }
683: idx = a->j;
684: v = a->a;
685: ii = a->i;
687: for (i=0; i<mbs; i++) {
688: n = ii[1] - ii[0]; ii++;
689: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
690: for (j=0; j<n; j++) {
691: xb = x + 4*(*idx++);
692: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
693: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
694: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
695: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
696: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
697: v += 16;
698: }
699: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
700: z += 4; y += 4;
701: }
702: VecRestoreArray(xx,&x);
703: VecRestoreArray(yy,&y);
704: if (zz != yy) {
705: VecRestoreArray(zz,&z);
706: }
707: PetscLogFlops(32*a->nz);
708: return(0);
709: }
711: #undef __FUNCT__
713: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
714: {
715: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
716: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
717: MatScalar *v;
718: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
721: VecGetArray(xx,&x);
722: VecGetArray(yy,&y);
723: if (zz != yy) {
724: VecGetArray(zz,&z);
725: } else {
726: z = y;
727: }
729: idx = a->j;
730: v = a->a;
731: ii = a->i;
733: for (i=0; i<mbs; i++) {
734: n = ii[1] - ii[0]; ii++;
735: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
736: for (j=0; j<n; j++) {
737: xb = x + 5*(*idx++);
738: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
739: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
740: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
741: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
742: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
743: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
744: v += 25;
745: }
746: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
747: z += 5; y += 5;
748: }
749: VecRestoreArray(xx,&x);
750: VecRestoreArray(yy,&y);
751: if (zz != yy) {
752: VecRestoreArray(zz,&z);
753: }
754: PetscLogFlops(50*a->nz);
755: return(0);
756: }
757: #undef __FUNCT__
759: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
760: {
761: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
762: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
763: PetscScalar x1,x2,x3,x4,x5,x6;
764: MatScalar *v;
765: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
768: VecGetArray(xx,&x);
769: VecGetArray(yy,&y);
770: if (zz != yy) {
771: VecGetArray(zz,&z);
772: } else {
773: z = y;
774: }
776: idx = a->j;
777: v = a->a;
778: ii = a->i;
780: for (i=0; i<mbs; i++) {
781: n = ii[1] - ii[0]; ii++;
782: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
783: for (j=0; j<n; j++) {
784: xb = x + 6*(*idx++);
785: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
786: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
787: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
788: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
789: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
790: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
791: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
792: v += 36;
793: }
794: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
795: z += 6; y += 6;
796: }
797: VecRestoreArray(xx,&x);
798: VecRestoreArray(yy,&y);
799: if (zz != yy) {
800: VecRestoreArray(zz,&z);
801: }
802: PetscLogFlops(72*a->nz);
803: return(0);
804: }
806: #undef __FUNCT__
808: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
809: {
810: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
811: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
812: PetscScalar x1,x2,x3,x4,x5,x6,x7;
813: MatScalar *v;
814: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
817: VecGetArray(xx,&x);
818: VecGetArray(yy,&y);
819: if (zz != yy) {
820: VecGetArray(zz,&z);
821: } else {
822: z = y;
823: }
825: idx = a->j;
826: v = a->a;
827: ii = a->i;
829: for (i=0; i<mbs; i++) {
830: n = ii[1] - ii[0]; ii++;
831: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
832: for (j=0; j<n; j++) {
833: xb = x + 7*(*idx++);
834: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
835: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
836: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
837: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
838: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
839: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
840: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
841: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
842: v += 49;
843: }
844: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
845: z += 7; y += 7;
846: }
847: VecRestoreArray(xx,&x);
848: VecRestoreArray(yy,&y);
849: if (zz != yy) {
850: VecRestoreArray(zz,&z);
851: }
852: PetscLogFlops(98*a->nz);
853: return(0);
854: }
856: #undef __FUNCT__
858: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
859: {
860: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
861: PetscScalar *x,*z,*xb,*work,*workt,*y;
862: MatScalar *v;
863: int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
864: int ncols,k;
867: VecGetArray(xx,&x);
868: VecGetArray(zz,&z);
869: if (zz != yy) {
870: VecGetArrayFast(yy,&y);
871: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
872: VecRestoreArrayFast(yy,&y);
873: }
875: idx = a->j;
876: v = a->a;
877: ii = a->i;
880: if (!a->mult_work) {
881: k = PetscMax(A->m,A->n);
882: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
883: }
884: work = a->mult_work;
885: for (i=0; i<mbs; i++) {
886: n = ii[1] - ii[0]; ii++;
887: ncols = n*bs;
888: workt = work;
889: for (j=0; j<n; j++) {
890: xb = x + bs*(*idx++);
891: for (k=0; k<bs; k++) workt[k] = xb[k];
892: workt += bs;
893: }
894: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
895: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
896: v += n*bs2;
897: z += bs;
898: }
899: VecRestoreArray(xx,&x);
900: VecRestoreArray(zz,&z);
901: PetscLogFlops(2*a->nz*bs2);
902: return(0);
903: }
905: #undef __FUNCT__
907: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
908: {
909: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
910: PetscScalar *xg,*zg,*zb,zero = 0.0;
911: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
912: MatScalar *v;
913: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
914: int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
917: VecSet(&zero,zz);
918: VecGetArray(xx,&xg); x = xg;
919: VecGetArray(zz,&zg); z = zg;
921: idx = a->j;
922: v = a->a;
923: ii = a->i;
924: xb = x;
925: switch (bs) {
926: case 1:
927: for (i=0; i<mbs; i++) {
928: n = ii[1] - ii[0]; ii++;
929: x1 = xb[0];
930: ib = idx + ai[i];
931: for (j=0; j<n; j++) {
932: rval = ib[j];
933: z[rval] += *v * x1;
934: v++;
935: }
936: xb++;
937: }
938: break;
939: case 2:
940: for (i=0; i<mbs; i++) {
941: n = ii[1] - ii[0]; ii++;
942: x1 = xb[0]; x2 = xb[1];
943: ib = idx + ai[i];
944: for (j=0; j<n; j++) {
945: rval = ib[j]*2;
946: z[rval++] += v[0]*x1 + v[1]*x2;
947: z[rval] += v[2]*x1 + v[3]*x2;
948: v += 4;
949: }
950: xb += 2;
951: }
952: break;
953: case 3:
954: for (i=0; i<mbs; i++) {
955: n = ii[1] - ii[0]; ii++;
956: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
957: ib = idx + ai[i];
958: for (j=0; j<n; j++) {
959: rval = ib[j]*3;
960: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
961: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
962: z[rval] += v[6]*x1 + v[7]*x2 + v[8]*x3;
963: v += 9;
964: }
965: xb += 3;
966: }
967: break;
968: case 4:
969: for (i=0; i<mbs; i++) {
970: n = ii[1] - ii[0]; ii++;
971: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
972: ib = idx + ai[i];
973: for (j=0; j<n; j++) {
974: rval = ib[j]*4;
975: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
976: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
977: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
978: z[rval] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
979: v += 16;
980: }
981: xb += 4;
982: }
983: break;
984: case 5:
985: for (i=0; i<mbs; i++) {
986: n = ii[1] - ii[0]; ii++;
987: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
988: x4 = xb[3]; x5 = xb[4];
989: ib = idx + ai[i];
990: for (j=0; j<n; j++) {
991: rval = ib[j]*5;
992: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
993: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
994: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
995: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
996: z[rval] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
997: v += 25;
998: }
999: xb += 5;
1000: }
1001: break;
1002: case 6:
1003: for (i=0; i<mbs; i++) {
1004: n = ii[1] - ii[0]; ii++;
1005: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1006: x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1007: ib = idx + ai[i];
1008: for (j=0; j<n; j++) {
1009: rval = ib[j]*6;
1010: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1011: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1012: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1013: z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1014: z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1015: z[rval] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1016: v += 36;
1017: }
1018: xb += 6;
1019: }
1020: break;
1021: case 7:
1022: for (i=0; i<mbs; i++) {
1023: n = ii[1] - ii[0]; ii++;
1024: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1025: x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1026: ib = idx + ai[i];
1027: for (j=0; j<n; j++) {
1028: rval = ib[j]*7;
1029: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1030: z[rval++] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1031: z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1032: z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1033: z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1034: z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1035: z[rval] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1036: v += 49;
1037: }
1038: xb += 7;
1039: }
1040: break;
1041: default: { /* block sizes larger then 7 by 7 are handled by BLAS */
1042: int ncols,k;
1043: PetscScalar *work,*workt;
1045: if (!a->mult_work) {
1046: k = PetscMax(A->m,A->n);
1047: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1048: }
1049: work = a->mult_work;
1050: for (i=0; i<mbs; i++) {
1051: n = ii[1] - ii[0]; ii++;
1052: ncols = n*bs;
1053: ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));
1054: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1055: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1056: v += n*bs2;
1057: x += bs;
1058: workt = work;
1059: for (j=0; j<n; j++) {
1060: zb = z + bs*(*idx++);
1061: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1062: workt += bs;
1063: }
1064: }
1065: }
1066: }
1067: VecRestoreArray(xx,&xg);
1068: VecRestoreArray(zz,&zg);
1069: PetscLogFlops(2*a->nz*a->bs2 - A->n);
1070: return(0);
1071: }
1073: #undef __FUNCT__
1075: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1077: {
1078: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1079: PetscScalar *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1080: MatScalar *v;
1081: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1084: if (yy != zz) { VecCopy(yy,zz); }
1085: VecGetArray(xx,&xg); x = xg;
1086: VecGetArray(zz,&zg); z = zg;
1089: idx = a->j;
1090: v = a->a;
1091: ii = a->i;
1092: xb = x;
1094: switch (bs) {
1095: case 1:
1096: for (i=0; i<mbs; i++) {
1097: n = ii[1] - ii[0]; ii++;
1098: x1 = xb[0];
1099: ib = idx + ai[i];
1100: for (j=0; j<n; j++) {
1101: rval = ib[j];
1102: z[rval] += *v * x1;
1103: v++;
1104: }
1105: xb++;
1106: }
1107: break;
1108: case 2:
1109: for (i=0; i<mbs; i++) {
1110: n = ii[1] - ii[0]; ii++;
1111: x1 = xb[0]; x2 = xb[1];
1112: ib = idx + ai[i];
1113: for (j=0; j<n; j++) {
1114: rval = ib[j]*2;
1115: z[rval++] += v[0]*x1 + v[1]*x2;
1116: z[rval++] += v[2]*x1 + v[3]*x2;
1117: v += 4;
1118: }
1119: xb += 2;
1120: }
1121: break;
1122: case 3:
1123: for (i=0; i<mbs; i++) {
1124: n = ii[1] - ii[0]; ii++;
1125: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1126: ib = idx + ai[i];
1127: for (j=0; j<n; j++) {
1128: rval = ib[j]*3;
1129: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1130: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1131: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1132: v += 9;
1133: }
1134: xb += 3;
1135: }
1136: break;
1137: case 4:
1138: for (i=0; i<mbs; i++) {
1139: n = ii[1] - ii[0]; ii++;
1140: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1141: ib = idx + ai[i];
1142: for (j=0; j<n; j++) {
1143: rval = ib[j]*4;
1144: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1145: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1146: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1147: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1148: v += 16;
1149: }
1150: xb += 4;
1151: }
1152: break;
1153: case 5:
1154: for (i=0; i<mbs; i++) {
1155: n = ii[1] - ii[0]; ii++;
1156: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1157: x4 = xb[3]; x5 = xb[4];
1158: ib = idx + ai[i];
1159: for (j=0; j<n; j++) {
1160: rval = ib[j]*5;
1161: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1162: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1163: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1164: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1165: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1166: v += 25;
1167: }
1168: xb += 5;
1169: }
1170: break;
1171: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1172: int ncols,k;
1173: PetscScalar *work,*workt;
1175: if (!a->mult_work) {
1176: k = PetscMax(A->m,A->n);
1177: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1178: }
1179: work = a->mult_work;
1180: for (i=0; i<mbs; i++) {
1181: n = ii[1] - ii[0]; ii++;
1182: ncols = n*bs;
1183: ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));
1184: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1185: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1186: v += n*bs2;
1187: x += bs;
1188: workt = work;
1189: for (j=0; j<n; j++) {
1190: zb = z + bs*(*idx++);
1191: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1192: workt += bs;
1193: }
1194: }
1195: }
1196: }
1197: VecRestoreArray(xx,&xg);
1198: VecRestoreArray(zz,&zg);
1199: PetscLogFlops(2*a->nz*a->bs2);
1200: return(0);
1201: }
1203: #undef __FUNCT__
1205: int MatScale_SeqBAIJ(PetscScalar *alpha,Mat inA)
1206: {
1207: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1208: int totalnz = a->bs2*a->nz;
1209: #if defined(PETSC_USE_MAT_SINGLE)
1210: int i;
1211: #else
1212: int one = 1;
1213: #endif
1216: #if defined(PETSC_USE_MAT_SINGLE)
1217: for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1218: #else
1219: BLscal_(&totalnz,alpha,a->a,&one);
1220: #endif
1221: PetscLogFlops(totalnz);
1222: return(0);
1223: }
1225: #undef __FUNCT__
1227: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1228: {
1229: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1230: MatScalar *v = a->a;
1231: PetscReal sum = 0.0;
1232: int i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;
1235: if (type == NORM_FROBENIUS) {
1236: for (i=0; i< bs2*nz; i++) {
1237: #if defined(PETSC_USE_COMPLEX)
1238: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1239: #else
1240: sum += (*v)*(*v); v++;
1241: #endif
1242: }
1243: *norm = sqrt(sum);
1244: } else if (type == NORM_INFINITY) { /* maximum row sum */
1245: *norm = 0.0;
1246: for (k=0; k<bs; k++) {
1247: for (j=0; j<a->mbs; j++) {
1248: v = a->a + bs2*a->i[j] + k;
1249: sum = 0.0;
1250: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1251: for (k1=0; k1<bs; k1++){
1252: sum += PetscAbsScalar(*v);
1253: v += bs;
1254: }
1255: }
1256: if (sum > *norm) *norm = sum;
1257: }
1258: }
1259: } else {
1260: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1261: }
1262: return(0);
1263: }
1266: #undef __FUNCT__
1268: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1269: {
1270: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1271: int ierr;
1272: PetscTruth flag;
1275: PetscTypeCompare((PetscObject)B,MATSEQBAIJ,&flag);
1276: if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1278: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1279: if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1280: *flg = PETSC_FALSE;
1281: return(0);
1282: }
1283:
1284: /* if the a->i are the same */
1285: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1286: if (*flg == PETSC_FALSE) {
1287: return(0);
1288: }
1289:
1290: /* if a->j are the same */
1291: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1292: if (*flg == PETSC_FALSE) {
1293: return(0);
1294: }
1295: /* if a->a are the same */
1296: PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1297: return(0);
1298:
1299: }
1301: #undef __FUNCT__
1303: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1304: {
1305: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1306: int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1307: PetscScalar *x,zero = 0.0;
1308: MatScalar *aa,*aa_j;
1311: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1312: bs = a->bs;
1313: aa = a->a;
1314: ai = a->i;
1315: aj = a->j;
1316: ambs = a->mbs;
1317: bs2 = a->bs2;
1319: VecSet(&zero,v);
1320: VecGetArray(v,&x);
1321: VecGetLocalSize(v,&n);
1322: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1323: for (i=0; i<ambs; i++) {
1324: for (j=ai[i]; j<ai[i+1]; j++) {
1325: if (aj[j] == i) {
1326: row = i*bs;
1327: aa_j = aa+j*bs2;
1328: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1329: break;
1330: }
1331: }
1332: }
1333: VecRestoreArray(v,&x);
1334: return(0);
1335: }
1337: #undef __FUNCT__
1339: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1340: {
1341: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1342: PetscScalar *l,*r,x,*li,*ri;
1343: MatScalar *aa,*v;
1344: int ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1347: ai = a->i;
1348: aj = a->j;
1349: aa = a->a;
1350: m = A->m;
1351: n = A->n;
1352: bs = a->bs;
1353: mbs = a->mbs;
1354: bs2 = a->bs2;
1355: if (ll) {
1356: VecGetArray(ll,&l);
1357: VecGetLocalSize(ll,&lm);
1358: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1359: for (i=0; i<mbs; i++) { /* for each block row */
1360: M = ai[i+1] - ai[i];
1361: li = l + i*bs;
1362: v = aa + bs2*ai[i];
1363: for (j=0; j<M; j++) { /* for each block */
1364: for (k=0; k<bs2; k++) {
1365: (*v++) *= li[k%bs];
1366: }
1367: }
1368: }
1369: VecRestoreArray(ll,&l);
1370: PetscLogFlops(a->nz);
1371: }
1372:
1373: if (rr) {
1374: VecGetArray(rr,&r);
1375: VecGetLocalSize(rr,&rn);
1376: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1377: for (i=0; i<mbs; i++) { /* for each block row */
1378: M = ai[i+1] - ai[i];
1379: v = aa + bs2*ai[i];
1380: for (j=0; j<M; j++) { /* for each block */
1381: ri = r + bs*aj[ai[i]+j];
1382: for (k=0; k<bs; k++) {
1383: x = ri[k];
1384: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1385: }
1386: }
1387: }
1388: VecRestoreArray(rr,&r);
1389: PetscLogFlops(a->nz);
1390: }
1391: return(0);
1392: }
1395: #undef __FUNCT__
1397: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1398: {
1399: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1402: info->rows_global = (double)A->m;
1403: info->columns_global = (double)A->n;
1404: info->rows_local = (double)A->m;
1405: info->columns_local = (double)A->n;
1406: info->block_size = a->bs2;
1407: info->nz_allocated = a->maxnz;
1408: info->nz_used = a->bs2*a->nz;
1409: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1410: info->assemblies = A->num_ass;
1411: info->mallocs = a->reallocs;
1412: info->memory = A->mem;
1413: if (A->factor) {
1414: info->fill_ratio_given = A->info.fill_ratio_given;
1415: info->fill_ratio_needed = A->info.fill_ratio_needed;
1416: info->factor_mallocs = A->info.factor_mallocs;
1417: } else {
1418: info->fill_ratio_given = 0;
1419: info->fill_ratio_needed = 0;
1420: info->factor_mallocs = 0;
1421: }
1422: return(0);
1423: }
1426: #undef __FUNCT__
1428: int MatZeroEntries_SeqBAIJ(Mat A)
1429: {
1430: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1431: int ierr;
1434: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1435: return(0);
1436: }