Actual source code: baij2.c
2: #include src/mat/impls/baij/seq/baij.h
3: #include src/inline/spops.h
4: #include src/inline/ilu.h
5: #include petscbt.h
9: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
13: PetscInt row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
14: PetscInt start,end,*ai,*aj,bs,*nidx2;
15: PetscBT table;
18: m = a->mbs;
19: ai = a->i;
20: aj = a->j;
21: bs = A->bs;
23: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25: PetscBTCreate(m,table);
26: PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
27: PetscMalloc((A->m+1)*sizeof(PetscInt),&nidx2);
29: for (i=0; i<is_max; i++) {
30: /* Initialise the two local arrays */
31: isz = 0;
32: PetscBTMemzero(m,table);
33:
34: /* Extract the indices, assume there can be duplicate entries */
35: ISGetIndices(is[i],&idx);
36: ISGetLocalSize(is[i],&n);
38: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
39: for (j=0; j<n ; ++j){
40: ival = idx[j]/bs; /* convert the indices into block indices */
41: if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
43: }
44: ISRestoreIndices(is[i],&idx);
45: ISDestroy(is[i]);
46:
47: k = 0;
48: for (j=0; j<ov; j++){ /* for each overlap*/
49: n = isz;
50: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
51: row = nidx[k];
52: start = ai[row];
53: end = ai[row+1];
54: for (l = start; l<end ; l++){
55: val = aj[l];
56: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
57: }
58: }
59: }
60: /* expand the Index Set */
61: for (j=0; j<isz; j++) {
62: for (k=0; k<bs; k++)
63: nidx2[j*bs+k] = nidx[j]*bs+k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
66: }
67: PetscBTDestroy(table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return(0);
71: }
75: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
81: PetscInt *irow,*icol,nrows,ncols,*ssmap,bs=A->bs,bs2=a->bs2;
82: PetscInt *aj = a->j,*ai = a->i;
83: MatScalar *mat_a;
84: Mat C;
85: PetscTruth flag;
88: ISSorted(iscol,(PetscTruth*)&i);
89: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
91: ISGetIndices(isrow,&irow);
92: ISGetIndices(iscol,&icol);
93: ISGetLocalSize(isrow,&nrows);
94: ISGetLocalSize(iscol,&ncols);
96: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
97: ssmap = smap;
98: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
99: PetscMemzero(smap,oldcols*sizeof(PetscInt));
100: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
101: /* determine lens of each row */
102: for (i=0; i<nrows; i++) {
103: kstart = ai[irow[i]];
104: kend = kstart + a->ilen[irow[i]];
105: lens[i] = 0;
106: for (k=kstart; k<kend; k++) {
107: if (ssmap[aj[k]]) {
108: lens[i]++;
109: }
110: }
111: }
112: /* Create and fill new matrix */
113: if (scall == MAT_REUSE_MATRIX) {
114: c = (Mat_SeqBAIJ *)((*B)->data);
116: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
117: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
118: if (flag == PETSC_FALSE) {
119: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120: }
121: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
122: C = *B;
123: } else {
124: MatCreate(A->comm,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE,&C);
125: MatSetType(C,A->type_name);
126: MatSeqBAIJSetPreallocation(C,bs,0,lens);
127: }
128: c = (Mat_SeqBAIJ *)(C->data);
129: for (i=0; i<nrows; i++) {
130: row = irow[i];
131: kstart = ai[row];
132: kend = kstart + a->ilen[row];
133: mat_i = c->i[i];
134: mat_j = c->j + mat_i;
135: mat_a = c->a + mat_i*bs2;
136: mat_ilen = c->ilen + i;
137: for (k=kstart; k<kend; k++) {
138: if ((tcol=ssmap[a->j[k]])) {
139: *mat_j++ = tcol - 1;
140: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
141: mat_a += bs2;
142: (*mat_ilen)++;
143: }
144: }
145: }
146:
147: /* Free work space */
148: ISRestoreIndices(iscol,&icol);
149: PetscFree(smap);
150: PetscFree(lens);
151: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
152: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
153:
154: ISRestoreIndices(isrow,&irow);
155: *B = C;
156: return(0);
157: }
161: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
162: {
163: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
164: IS is1,is2;
166: PetscInt *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->bs,count;
169: ISGetIndices(isrow,&irow);
170: ISGetIndices(iscol,&icol);
171: ISGetLocalSize(isrow,&nrows);
172: ISGetLocalSize(iscol,&ncols);
173:
174: /* Verify if the indices corespond to each element in a block
175: and form the IS with compressed IS */
176: PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
177: iary = vary + a->mbs;
178: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
179: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
180: count = 0;
181: for (i=0; i<a->mbs; i++) {
182: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
183: if (vary[i]==bs) iary[count++] = i;
184: }
185: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
186:
187: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
188: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
189: count = 0;
190: for (i=0; i<a->mbs; i++) {
191: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
192: if (vary[i]==bs) iary[count++] = i;
193: }
194: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
195: ISRestoreIndices(isrow,&irow);
196: ISRestoreIndices(iscol,&icol);
197: PetscFree(vary);
199: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
200: ISDestroy(is1);
201: ISDestroy(is2);
202: return(0);
203: }
207: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
208: {
210: PetscInt i;
213: if (scall == MAT_INITIAL_MATRIX) {
214: PetscMalloc((n+1)*sizeof(Mat),B);
215: }
217: for (i=0; i<n; i++) {
218: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
219: }
220: return(0);
221: }
224: /* -------------------------------------------------------*/
225: /* Should check that shapes of vectors and matrices match */
226: /* -------------------------------------------------------*/
227: #include petscblaslapack.h
231: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
232: {
233: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
234: PetscScalar *x,*z,sum;
235: MatScalar *v;
237: PetscInt mbs=a->mbs,i,*idx,*ii,n;
240: VecGetArray(xx,&x);
241: VecGetArray(zz,&z);
243: idx = a->j;
244: v = a->a;
245: ii = a->i;
247: for (i=0; i<mbs; i++) {
248: n = ii[1] - ii[0]; ii++;
249: sum = 0.0;
250: while (n--) sum += *v++ * x[*idx++];
251: z[i] = sum;
252: }
253: VecRestoreArray(xx,&x);
254: VecRestoreArray(zz,&z);
255: PetscLogFlops(2*a->nz - A->m);
256: return(0);
257: }
261: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
262: {
263: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
264: PetscScalar *x,*z,*xb,sum1,sum2;
265: PetscScalar x1,x2;
266: MatScalar *v;
268: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
271: VecGetArray(xx,&x);
272: VecGetArray(zz,&z);
274: idx = a->j;
275: v = a->a;
276: ii = a->i;
278: for (i=0; i<mbs; i++) {
279: n = ii[1] - ii[0]; ii++;
280: sum1 = 0.0; sum2 = 0.0;
281: for (j=0; j<n; j++) {
282: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
283: sum1 += v[0]*x1 + v[2]*x2;
284: sum2 += v[1]*x1 + v[3]*x2;
285: v += 4;
286: }
287: z[0] = sum1; z[1] = sum2;
288: z += 2;
289: }
290: VecRestoreArray(xx,&x);
291: VecRestoreArray(zz,&z);
292: PetscLogFlops(8*a->nz - A->m);
293: return(0);
294: }
298: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
299: {
300: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
301: PetscScalar *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
302: MatScalar *v;
304: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
306: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
307: #pragma disjoint(*v,*z,*xb)
308: #endif
311: VecGetArray(xx,&x);
312: VecGetArray(zz,&z);
314: idx = a->j;
315: v = a->a;
316: ii = a->i;
318: for (i=0; i<mbs; i++) {
319: n = ii[1] - ii[0]; ii++;
320: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
321: for (j=0; j<n; j++) {
322: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
323: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
324: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
325: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
326: v += 9;
327: }
328: z[0] = sum1; z[1] = sum2; z[2] = sum3;
329: z += 3;
330: }
331: VecRestoreArray(xx,&x);
332: VecRestoreArray(zz,&z);
333: PetscLogFlops(18*a->nz - A->m);
334: return(0);
335: }
339: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
340: {
341: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
342: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
343: MatScalar *v;
345: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
348: VecGetArray(xx,&x);
349: VecGetArray(zz,&z);
351: idx = a->j;
352: v = a->a;
353: ii = a->i;
355: for (i=0; i<mbs; i++) {
356: n = ii[1] - ii[0]; ii++;
357: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
358: for (j=0; j<n; j++) {
359: xb = x + 4*(*idx++);
360: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
361: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
362: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
363: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
364: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
365: v += 16;
366: }
367: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
368: z += 4;
369: }
370: VecRestoreArray(xx,&x);
371: VecRestoreArray(zz,&z);
372: PetscLogFlops(32*a->nz - A->m);
373: return(0);
374: }
378: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
379: {
380: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
381: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
382: MatScalar *v;
384: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
387: VecGetArray(xx,&x);
388: VecGetArray(zz,&z);
390: idx = a->j;
391: v = a->a;
392: ii = a->i;
394: for (i=0; i<mbs; i++) {
395: n = ii[1] - ii[0]; ii++;
396: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
397: for (j=0; j<n; j++) {
398: xb = x + 5*(*idx++);
399: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
400: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
401: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
402: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
403: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
404: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
405: v += 25;
406: }
407: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
408: z += 5;
409: }
410: VecRestoreArray(xx,&x);
411: VecRestoreArray(zz,&z);
412: PetscLogFlops(50*a->nz - A->m);
413: return(0);
414: }
419: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
420: {
421: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
422: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
423: PetscScalar x1,x2,x3,x4,x5,x6;
424: MatScalar *v;
426: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
429: VecGetArray(xx,&x);
430: VecGetArray(zz,&z);
432: idx = a->j;
433: v = a->a;
434: ii = a->i;
436: for (i=0; i<mbs; i++) {
437: n = ii[1] - ii[0]; ii++;
438: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
439: for (j=0; j<n; j++) {
440: xb = x + 6*(*idx++);
441: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
442: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
443: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
444: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
445: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
446: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
447: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
448: v += 36;
449: }
450: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
451: z += 6;
452: }
454: VecRestoreArray(xx,&x);
455: VecRestoreArray(zz,&z);
456: PetscLogFlops(72*a->nz - A->m);
457: return(0);
458: }
461: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
462: {
463: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
464: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
465: PetscScalar x1,x2,x3,x4,x5,x6,x7;
466: MatScalar *v;
468: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
471: VecGetArray(xx,&x);
472: VecGetArray(zz,&z);
474: idx = a->j;
475: v = a->a;
476: ii = a->i;
478: for (i=0; i<mbs; i++) {
479: n = ii[1] - ii[0]; ii++;
480: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
481: for (j=0; j<n; j++) {
482: xb = x + 7*(*idx++);
483: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
484: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
485: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
486: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
487: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
488: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
489: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
490: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
491: v += 49;
492: }
493: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
494: z += 7;
495: }
497: VecRestoreArray(xx,&x);
498: VecRestoreArray(zz,&z);
499: PetscLogFlops(98*a->nz - A->m);
500: return(0);
501: }
503: /*
504: This will not work with MatScalar == float because it calls the BLAS
505: */
508: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
509: {
510: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
511: PetscScalar *x,*z,*xb,*work,*workt;
512: MatScalar *v;
514: PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->bs,j,n,bs2=a->bs2;
515: PetscInt ncols,k;
518: VecGetArray(xx,&x);
519: VecGetArray(zz,&z);
521: idx = a->j;
522: v = a->a;
523: ii = a->i;
526: if (!a->mult_work) {
527: k = PetscMax(A->m,A->n);
528: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
529: }
530: work = a->mult_work;
531: for (i=0; i<mbs; i++) {
532: n = ii[1] - ii[0]; ii++;
533: ncols = n*bs;
534: workt = work;
535: for (j=0; j<n; j++) {
536: xb = x + bs*(*idx++);
537: for (k=0; k<bs; k++) workt[k] = xb[k];
538: workt += bs;
539: }
540: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
541: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
542: v += n*bs2;
543: z += bs;
544: }
545: VecRestoreArray(xx,&x);
546: VecRestoreArray(zz,&z);
547: PetscLogFlops(2*a->nz*bs2 - A->m);
548: return(0);
549: }
553: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
554: {
555: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
556: PetscScalar *x,*y,*z,sum;
557: MatScalar *v;
559: PetscInt mbs=a->mbs,i,*idx,*ii,n;
562: VecGetArray(xx,&x);
563: VecGetArray(yy,&y);
564: if (zz != yy) {
565: VecGetArray(zz,&z);
566: } else {
567: z = y;
568: }
570: idx = a->j;
571: v = a->a;
572: ii = a->i;
574: for (i=0; i<mbs; i++) {
575: n = ii[1] - ii[0]; ii++;
576: sum = y[i];
577: while (n--) sum += *v++ * x[*idx++];
578: z[i] = sum;
579: }
580: VecRestoreArray(xx,&x);
581: VecRestoreArray(yy,&y);
582: if (zz != yy) {
583: VecRestoreArray(zz,&z);
584: }
585: PetscLogFlops(2*a->nz);
586: return(0);
587: }
591: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
592: {
593: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
594: PetscScalar *x,*y,*z,*xb,sum1,sum2;
595: PetscScalar x1,x2;
596: MatScalar *v;
598: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
601: VecGetArray(xx,&x);
602: VecGetArray(yy,&y);
603: if (zz != yy) {
604: VecGetArray(zz,&z);
605: } else {
606: z = y;
607: }
609: idx = a->j;
610: v = a->a;
611: ii = a->i;
613: for (i=0; i<mbs; i++) {
614: n = ii[1] - ii[0]; ii++;
615: sum1 = y[0]; sum2 = y[1];
616: for (j=0; j<n; j++) {
617: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
618: sum1 += v[0]*x1 + v[2]*x2;
619: sum2 += v[1]*x1 + v[3]*x2;
620: v += 4;
621: }
622: z[0] = sum1; z[1] = sum2;
623: z += 2; y += 2;
624: }
625: VecRestoreArray(xx,&x);
626: VecRestoreArray(yy,&y);
627: if (zz != yy) {
628: VecRestoreArray(zz,&z);
629: }
630: PetscLogFlops(4*a->nz);
631: return(0);
632: }
636: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
637: {
638: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
639: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
640: MatScalar *v;
642: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
645: VecGetArray(xx,&x);
646: VecGetArray(yy,&y);
647: if (zz != yy) {
648: VecGetArray(zz,&z);
649: } else {
650: z = y;
651: }
653: idx = a->j;
654: v = a->a;
655: ii = a->i;
657: for (i=0; i<mbs; i++) {
658: n = ii[1] - ii[0]; ii++;
659: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
660: for (j=0; j<n; j++) {
661: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
662: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
663: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
664: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
665: v += 9;
666: }
667: z[0] = sum1; z[1] = sum2; z[2] = sum3;
668: z += 3; y += 3;
669: }
670: VecRestoreArray(xx,&x);
671: VecRestoreArray(yy,&y);
672: if (zz != yy) {
673: VecRestoreArray(zz,&z);
674: }
675: PetscLogFlops(18*a->nz);
676: return(0);
677: }
681: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
682: {
683: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
684: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
685: MatScalar *v;
687: PetscInt mbs=a->mbs,i,*idx,*ii;
688: PetscInt j,n;
691: VecGetArray(xx,&x);
692: VecGetArray(yy,&y);
693: if (zz != yy) {
694: VecGetArray(zz,&z);
695: } else {
696: z = y;
697: }
699: idx = a->j;
700: v = a->a;
701: ii = a->i;
703: for (i=0; i<mbs; i++) {
704: n = ii[1] - ii[0]; ii++;
705: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
706: for (j=0; j<n; j++) {
707: xb = x + 4*(*idx++);
708: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
709: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
710: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
711: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
712: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
713: v += 16;
714: }
715: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
716: z += 4; y += 4;
717: }
718: VecRestoreArray(xx,&x);
719: VecRestoreArray(yy,&y);
720: if (zz != yy) {
721: VecRestoreArray(zz,&z);
722: }
723: PetscLogFlops(32*a->nz);
724: return(0);
725: }
729: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
730: {
731: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
732: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
733: MatScalar *v;
735: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
738: VecGetArray(xx,&x);
739: VecGetArray(yy,&y);
740: if (zz != yy) {
741: VecGetArray(zz,&z);
742: } else {
743: z = y;
744: }
746: idx = a->j;
747: v = a->a;
748: ii = a->i;
750: for (i=0; i<mbs; i++) {
751: n = ii[1] - ii[0]; ii++;
752: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
753: for (j=0; j<n; j++) {
754: xb = x + 5*(*idx++);
755: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
756: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
757: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
758: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
759: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
760: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
761: v += 25;
762: }
763: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
764: z += 5; y += 5;
765: }
766: VecRestoreArray(xx,&x);
767: VecRestoreArray(yy,&y);
768: if (zz != yy) {
769: VecRestoreArray(zz,&z);
770: }
771: PetscLogFlops(50*a->nz);
772: return(0);
773: }
776: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
777: {
778: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
779: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
780: PetscScalar x1,x2,x3,x4,x5,x6;
781: MatScalar *v;
783: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
786: VecGetArray(xx,&x);
787: VecGetArray(yy,&y);
788: if (zz != yy) {
789: VecGetArray(zz,&z);
790: } else {
791: z = y;
792: }
794: idx = a->j;
795: v = a->a;
796: ii = a->i;
798: for (i=0; i<mbs; i++) {
799: n = ii[1] - ii[0]; ii++;
800: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
801: for (j=0; j<n; j++) {
802: xb = x + 6*(*idx++);
803: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
804: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
805: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
806: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
807: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
808: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
809: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
810: v += 36;
811: }
812: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
813: z += 6; y += 6;
814: }
815: VecRestoreArray(xx,&x);
816: VecRestoreArray(yy,&y);
817: if (zz != yy) {
818: VecRestoreArray(zz,&z);
819: }
820: PetscLogFlops(72*a->nz);
821: return(0);
822: }
826: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
827: {
828: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
829: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
830: PetscScalar x1,x2,x3,x4,x5,x6,x7;
831: MatScalar *v;
833: PetscInt mbs=a->mbs,i,*idx,*ii,j,n;
836: VecGetArray(xx,&x);
837: VecGetArray(yy,&y);
838: if (zz != yy) {
839: VecGetArray(zz,&z);
840: } else {
841: z = y;
842: }
844: idx = a->j;
845: v = a->a;
846: ii = a->i;
848: for (i=0; i<mbs; i++) {
849: n = ii[1] - ii[0]; ii++;
850: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
851: for (j=0; j<n; j++) {
852: xb = x + 7*(*idx++);
853: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
854: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
855: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
856: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
857: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
858: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
859: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
860: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
861: v += 49;
862: }
863: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
864: z += 7; y += 7;
865: }
866: VecRestoreArray(xx,&x);
867: VecRestoreArray(yy,&y);
868: if (zz != yy) {
869: VecRestoreArray(zz,&z);
870: }
871: PetscLogFlops(98*a->nz);
872: return(0);
873: }
877: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
878: {
879: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
880: PetscScalar *x,*z,*xb,*work,*workt,*y;
881: MatScalar *v;
883: PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->bs,j,n,bs2=a->bs2;
884: PetscInt ncols,k;
887: VecGetArray(xx,&x);
888: VecGetArray(zz,&z);
889: if (zz != yy) {
890: VecGetArray(yy,&y);
891: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
892: VecRestoreArray(yy,&y);
893: }
895: idx = a->j;
896: v = a->a;
897: ii = a->i;
900: if (!a->mult_work) {
901: k = PetscMax(A->m,A->n);
902: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
903: }
904: work = a->mult_work;
905: for (i=0; i<mbs; i++) {
906: n = ii[1] - ii[0]; ii++;
907: ncols = n*bs;
908: workt = work;
909: for (j=0; j<n; j++) {
910: xb = x + bs*(*idx++);
911: for (k=0; k<bs; k++) workt[k] = xb[k];
912: workt += bs;
913: }
914: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
915: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
916: v += n*bs2;
917: z += bs;
918: }
919: VecRestoreArray(xx,&x);
920: VecRestoreArray(zz,&z);
921: PetscLogFlops(2*a->nz*bs2);
922: return(0);
923: }
927: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
928: {
929: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
930: PetscScalar *xg,*zg,*zb,zero = 0.0;
931: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
932: MatScalar *v;
933: PetscInt mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
935: PetscInt bs=A->bs,j,n,bs2=a->bs2,*ib;
938: VecSet(&zero,zz);
939: VecGetArray(xx,&xg); x = xg;
940: VecGetArray(zz,&zg); z = zg;
942: idx = a->j;
943: v = a->a;
944: ii = a->i;
945: xb = x;
946: switch (bs) {
947: case 1:
948: for (i=0; i<mbs; i++) {
949: n = ii[1] - ii[0]; ii++;
950: x1 = xb[0];
951: ib = idx + ai[i];
952: for (j=0; j<n; j++) {
953: rval = ib[j];
954: z[rval] += *v * x1;
955: v++;
956: }
957: xb++;
958: }
959: break;
960: case 2:
961: for (i=0; i<mbs; i++) {
962: n = ii[1] - ii[0]; ii++;
963: x1 = xb[0]; x2 = xb[1];
964: ib = idx + ai[i];
965: for (j=0; j<n; j++) {
966: rval = ib[j]*2;
967: z[rval++] += v[0]*x1 + v[1]*x2;
968: z[rval] += v[2]*x1 + v[3]*x2;
969: v += 4;
970: }
971: xb += 2;
972: }
973: break;
974: case 3:
975: for (i=0; i<mbs; i++) {
976: n = ii[1] - ii[0]; ii++;
977: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
978: ib = idx + ai[i];
979: for (j=0; j<n; j++) {
980: rval = ib[j]*3;
981: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
982: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
983: z[rval] += v[6]*x1 + v[7]*x2 + v[8]*x3;
984: v += 9;
985: }
986: xb += 3;
987: }
988: break;
989: case 4:
990: for (i=0; i<mbs; i++) {
991: n = ii[1] - ii[0]; ii++;
992: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
993: ib = idx + ai[i];
994: for (j=0; j<n; j++) {
995: rval = ib[j]*4;
996: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
997: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
998: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
999: z[rval] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1000: v += 16;
1001: }
1002: xb += 4;
1003: }
1004: break;
1005: case 5:
1006: for (i=0; i<mbs; i++) {
1007: n = ii[1] - ii[0]; ii++;
1008: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1009: x4 = xb[3]; x5 = xb[4];
1010: ib = idx + ai[i];
1011: for (j=0; j<n; j++) {
1012: rval = ib[j]*5;
1013: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1014: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1015: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1016: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1017: z[rval] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1018: v += 25;
1019: }
1020: xb += 5;
1021: }
1022: break;
1023: case 6:
1024: for (i=0; i<mbs; i++) {
1025: n = ii[1] - ii[0]; ii++;
1026: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1027: x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1028: ib = idx + ai[i];
1029: for (j=0; j<n; j++) {
1030: rval = ib[j]*6;
1031: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1032: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1033: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1034: z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1035: z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1036: z[rval] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1037: v += 36;
1038: }
1039: xb += 6;
1040: }
1041: break;
1042: case 7:
1043: for (i=0; i<mbs; i++) {
1044: n = ii[1] - ii[0]; ii++;
1045: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1046: x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1047: ib = idx + ai[i];
1048: for (j=0; j<n; j++) {
1049: rval = ib[j]*7;
1050: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1051: z[rval++] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1052: z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1053: z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1054: z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1055: z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1056: z[rval] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1057: v += 49;
1058: }
1059: xb += 7;
1060: }
1061: break;
1062: default: { /* block sizes larger then 7 by 7 are handled by BLAS */
1063: PetscInt ncols,k;
1064: PetscScalar *work,*workt;
1066: if (!a->mult_work) {
1067: k = PetscMax(A->m,A->n);
1068: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1069: }
1070: work = a->mult_work;
1071: for (i=0; i<mbs; i++) {
1072: n = ii[1] - ii[0]; ii++;
1073: ncols = n*bs;
1074: PetscMemzero(work,ncols*sizeof(PetscScalar));
1075: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1076: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1077: v += n*bs2;
1078: x += bs;
1079: workt = work;
1080: for (j=0; j<n; j++) {
1081: zb = z + bs*(*idx++);
1082: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1083: workt += bs;
1084: }
1085: }
1086: }
1087: }
1088: VecRestoreArray(xx,&xg);
1089: VecRestoreArray(zz,&zg);
1090: PetscLogFlops(2*a->nz*a->bs2 - A->n);
1091: return(0);
1092: }
1096: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1098: {
1099: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1100: PetscScalar *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1101: MatScalar *v;
1103: PetscInt mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=A->bs,j,n,bs2=a->bs2,*ib;
1106: if (yy != zz) { VecCopy(yy,zz); }
1107: VecGetArray(xx,&xg); x = xg;
1108: VecGetArray(zz,&zg); z = zg;
1111: idx = a->j;
1112: v = a->a;
1113: ii = a->i;
1114: xb = x;
1116: switch (bs) {
1117: case 1:
1118: for (i=0; i<mbs; i++) {
1119: n = ii[1] - ii[0]; ii++;
1120: x1 = xb[0];
1121: ib = idx + ai[i];
1122: for (j=0; j<n; j++) {
1123: rval = ib[j];
1124: z[rval] += *v * x1;
1125: v++;
1126: }
1127: xb++;
1128: }
1129: break;
1130: case 2:
1131: for (i=0; i<mbs; i++) {
1132: n = ii[1] - ii[0]; ii++;
1133: x1 = xb[0]; x2 = xb[1];
1134: ib = idx + ai[i];
1135: for (j=0; j<n; j++) {
1136: rval = ib[j]*2;
1137: z[rval++] += v[0]*x1 + v[1]*x2;
1138: z[rval++] += v[2]*x1 + v[3]*x2;
1139: v += 4;
1140: }
1141: xb += 2;
1142: }
1143: break;
1144: case 3:
1145: for (i=0; i<mbs; i++) {
1146: n = ii[1] - ii[0]; ii++;
1147: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1148: ib = idx + ai[i];
1149: for (j=0; j<n; j++) {
1150: rval = ib[j]*3;
1151: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1152: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1153: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1154: v += 9;
1155: }
1156: xb += 3;
1157: }
1158: break;
1159: case 4:
1160: for (i=0; i<mbs; i++) {
1161: n = ii[1] - ii[0]; ii++;
1162: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1163: ib = idx + ai[i];
1164: for (j=0; j<n; j++) {
1165: rval = ib[j]*4;
1166: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1167: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1168: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1169: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1170: v += 16;
1171: }
1172: xb += 4;
1173: }
1174: break;
1175: case 5:
1176: for (i=0; i<mbs; i++) {
1177: n = ii[1] - ii[0]; ii++;
1178: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1179: x4 = xb[3]; x5 = xb[4];
1180: ib = idx + ai[i];
1181: for (j=0; j<n; j++) {
1182: rval = ib[j]*5;
1183: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1184: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1185: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1186: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1187: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1188: v += 25;
1189: }
1190: xb += 5;
1191: }
1192: break;
1193: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1194: PetscInt ncols,k;
1195: PetscScalar *work,*workt;
1197: if (!a->mult_work) {
1198: k = PetscMax(A->m,A->n);
1199: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1200: }
1201: work = a->mult_work;
1202: for (i=0; i<mbs; i++) {
1203: n = ii[1] - ii[0]; ii++;
1204: ncols = n*bs;
1205: PetscMemzero(work,ncols*sizeof(PetscScalar));
1206: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1207: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1208: v += n*bs2;
1209: x += bs;
1210: workt = work;
1211: for (j=0; j<n; j++) {
1212: zb = z + bs*(*idx++);
1213: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1214: workt += bs;
1215: }
1216: }
1217: }
1218: }
1219: VecRestoreArray(xx,&xg);
1220: VecRestoreArray(zz,&zg);
1221: PetscLogFlops(2*a->nz*a->bs2);
1222: return(0);
1223: }
1227: PetscErrorCode MatScale_SeqBAIJ(const PetscScalar *alpha,Mat inA)
1228: {
1229: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1230: PetscInt totalnz = a->bs2*a->nz;
1231: #if defined(PETSC_USE_MAT_SINGLE)
1232: PetscInt i;
1233: #else
1234: PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1235: #endif
1238: #if defined(PETSC_USE_MAT_SINGLE)
1239: for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1240: #else
1241: BLscal_(&tnz,(PetscScalar*)alpha,a->a,&one);
1242: #endif
1243: PetscLogFlops(totalnz);
1244: return(0);
1245: }
1249: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1250: {
1251: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1252: MatScalar *v = a->a;
1253: PetscReal sum = 0.0;
1254: PetscInt i,j,k,bs = A->bs,nz=a->nz,bs2=a->bs2,k1;
1257: if (type == NORM_FROBENIUS) {
1258: for (i=0; i< bs2*nz; i++) {
1259: #if defined(PETSC_USE_COMPLEX)
1260: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1261: #else
1262: sum += (*v)*(*v); v++;
1263: #endif
1264: }
1265: *norm = sqrt(sum);
1266: } else if (type == NORM_INFINITY) { /* maximum row sum */
1267: *norm = 0.0;
1268: for (k=0; k<bs; k++) {
1269: for (j=0; j<a->mbs; j++) {
1270: v = a->a + bs2*a->i[j] + k;
1271: sum = 0.0;
1272: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1273: for (k1=0; k1<bs; k1++){
1274: sum += PetscAbsScalar(*v);
1275: v += bs;
1276: }
1277: }
1278: if (sum > *norm) *norm = sum;
1279: }
1280: }
1281: } else {
1282: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1283: }
1284: return(0);
1285: }
1290: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1291: {
1292: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1296: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1297: if ((A->m != B->m) || (A->n != B->n) || (A->bs != B->bs)|| (a->nz != b->nz)) {
1298: *flg = PETSC_FALSE;
1299: return(0);
1300: }
1301:
1302: /* if the a->i are the same */
1303: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1304: if (*flg == PETSC_FALSE) {
1305: return(0);
1306: }
1307:
1308: /* if a->j are the same */
1309: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1310: if (*flg == PETSC_FALSE) {
1311: return(0);
1312: }
1313: /* if a->a are the same */
1314: PetscMemcmp(a->a,b->a,(a->nz)*(A->bs)*(B->bs)*sizeof(PetscScalar),flg);
1315: return(0);
1316:
1317: }
1321: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1322: {
1323: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1325: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1326: PetscScalar *x,zero = 0.0;
1327: MatScalar *aa,*aa_j;
1330: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1331: bs = A->bs;
1332: aa = a->a;
1333: ai = a->i;
1334: aj = a->j;
1335: ambs = a->mbs;
1336: bs2 = a->bs2;
1338: VecSet(&zero,v);
1339: VecGetArray(v,&x);
1340: VecGetLocalSize(v,&n);
1341: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1342: for (i=0; i<ambs; i++) {
1343: for (j=ai[i]; j<ai[i+1]; j++) {
1344: if (aj[j] == i) {
1345: row = i*bs;
1346: aa_j = aa+j*bs2;
1347: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1348: break;
1349: }
1350: }
1351: }
1352: VecRestoreArray(v,&x);
1353: return(0);
1354: }
1358: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1359: {
1360: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1361: PetscScalar *l,*r,x,*li,*ri;
1362: MatScalar *aa,*v;
1364: PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1367: ai = a->i;
1368: aj = a->j;
1369: aa = a->a;
1370: m = A->m;
1371: n = A->n;
1372: bs = A->bs;
1373: mbs = a->mbs;
1374: bs2 = a->bs2;
1375: if (ll) {
1376: VecGetArray(ll,&l);
1377: VecGetLocalSize(ll,&lm);
1378: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1379: for (i=0; i<mbs; i++) { /* for each block row */
1380: M = ai[i+1] - ai[i];
1381: li = l + i*bs;
1382: v = aa + bs2*ai[i];
1383: for (j=0; j<M; j++) { /* for each block */
1384: for (k=0; k<bs2; k++) {
1385: (*v++) *= li[k%bs];
1386: }
1387: }
1388: }
1389: VecRestoreArray(ll,&l);
1390: PetscLogFlops(a->nz);
1391: }
1392:
1393: if (rr) {
1394: VecGetArray(rr,&r);
1395: VecGetLocalSize(rr,&rn);
1396: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1397: for (i=0; i<mbs; i++) { /* for each block row */
1398: M = ai[i+1] - ai[i];
1399: v = aa + bs2*ai[i];
1400: for (j=0; j<M; j++) { /* for each block */
1401: ri = r + bs*aj[ai[i]+j];
1402: for (k=0; k<bs; k++) {
1403: x = ri[k];
1404: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1405: }
1406: }
1407: }
1408: VecRestoreArray(rr,&r);
1409: PetscLogFlops(a->nz);
1410: }
1411: return(0);
1412: }
1417: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1418: {
1419: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1422: info->rows_global = (double)A->m;
1423: info->columns_global = (double)A->n;
1424: info->rows_local = (double)A->m;
1425: info->columns_local = (double)A->n;
1426: info->block_size = a->bs2;
1427: info->nz_allocated = a->maxnz;
1428: info->nz_used = a->bs2*a->nz;
1429: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1430: info->assemblies = A->num_ass;
1431: info->mallocs = a->reallocs;
1432: info->memory = A->mem;
1433: if (A->factor) {
1434: info->fill_ratio_given = A->info.fill_ratio_given;
1435: info->fill_ratio_needed = A->info.fill_ratio_needed;
1436: info->factor_mallocs = A->info.factor_mallocs;
1437: } else {
1438: info->fill_ratio_given = 0;
1439: info->fill_ratio_needed = 0;
1440: info->factor_mallocs = 0;
1441: }
1442: return(0);
1443: }
1448: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1449: {
1450: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1454: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1455: return(0);
1456: }