Actual source code: baij2.c
1: #define PETSCMAT_DLL
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/inline/spops.h
5: #include src/inline/ilu.h
6: #include petscbt.h
10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
14: PetscInt row,i,j,k,l,m,n,*idx,*nidx,isz,val,ival;
15: PetscInt start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap.bs;
24: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,table);
27: PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
28: PetscMalloc((A->rmap.N+1)*sizeof(PetscInt),&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
34:
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n ; ++j){
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(is[i]);
47:
48: k = 0;
49: for (j=0; j<ov; j++){ /* for each overlap*/
50: n = isz;
51: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end ; l++){
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++)
64: nidx2[j*bs+k] = nidx[j]*bs+k;
65: }
66: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
67: }
68: PetscBTDestroy(table);
69: PetscFree(nidx);
70: PetscFree(nidx2);
71: return(0);
72: }
76: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
80: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82: PetscInt *irow,*icol,nrows,ncols,*ssmap,bs=A->rmap.bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscTruth flag;
89: ISSorted(iscol,(PetscTruth*)&i);
90: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
92: ISGetIndices(isrow,&irow);
93: ISGetIndices(iscol,&icol);
94: ISGetLocalSize(isrow,&nrows);
95: ISGetLocalSize(iscol,&ncols);
97: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
98: ssmap = smap;
99: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
100: PetscMemzero(smap,oldcols*sizeof(PetscInt));
101: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
102: /* determine lens of each row */
103: for (i=0; i<nrows; i++) {
104: kstart = ai[irow[i]];
105: kend = kstart + a->ilen[irow[i]];
106: lens[i] = 0;
107: for (k=kstart; k<kend; k++) {
108: if (ssmap[aj[k]]) {
109: lens[i]++;
110: }
111: }
112: }
113: /* Create and fill new matrix */
114: if (scall == MAT_REUSE_MATRIX) {
115: c = (Mat_SeqBAIJ *)((*B)->data);
117: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap.bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119: if (!flag) {
120: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
121: }
122: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
123: C = *B;
124: } else {
125: MatCreate(((PetscObject)A)->comm,&C);
126: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
127: MatSetType(C,((PetscObject)A)->type_name);
128: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
129: }
130: c = (Mat_SeqBAIJ *)(C->data);
131: for (i=0; i<nrows; i++) {
132: row = irow[i];
133: kstart = ai[row];
134: kend = kstart + a->ilen[row];
135: mat_i = c->i[i];
136: mat_j = c->j + mat_i;
137: mat_a = c->a + mat_i*bs2;
138: mat_ilen = c->ilen + i;
139: for (k=kstart; k<kend; k++) {
140: if ((tcol=ssmap[a->j[k]])) {
141: *mat_j++ = tcol - 1;
142: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
143: mat_a += bs2;
144: (*mat_ilen)++;
145: }
146: }
147: }
148:
149: /* Free work space */
150: ISRestoreIndices(iscol,&icol);
151: PetscFree(smap);
152: PetscFree(lens);
153: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
154: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
155:
156: ISRestoreIndices(isrow,&irow);
157: *B = C;
158: return(0);
159: }
163: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,PetscInt cs,MatReuse scall,Mat *B)
164: {
165: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
166: IS is1,is2;
168: PetscInt *vary,*iary,*irow,*icol,nrows,ncols,i,bs=A->rmap.bs,count;
171: ISGetIndices(isrow,&irow);
172: ISGetIndices(iscol,&icol);
173: ISGetLocalSize(isrow,&nrows);
174: ISGetLocalSize(iscol,&ncols);
175:
176: /* Verify if the indices corespond to each element in a block
177: and form the IS with compressed IS */
178: PetscMalloc(2*(a->mbs+1)*sizeof(PetscInt),&vary);
179: iary = vary + a->mbs;
180: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
181: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
182: count = 0;
183: for (i=0; i<a->mbs; i++) {
184: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
185: if (vary[i]==bs) iary[count++] = i;
186: }
187: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
188:
189: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
190: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
191: count = 0;
192: for (i=0; i<a->mbs; i++) {
193: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
194: if (vary[i]==bs) iary[count++] = i;
195: }
196: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
197: ISRestoreIndices(isrow,&irow);
198: ISRestoreIndices(iscol,&icol);
199: PetscFree(vary);
201: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
202: ISDestroy(is1);
203: ISDestroy(is2);
204: return(0);
205: }
209: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
210: {
212: PetscInt i;
215: if (scall == MAT_INITIAL_MATRIX) {
216: PetscMalloc((n+1)*sizeof(Mat),B);
217: }
219: for (i=0; i<n; i++) {
220: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
221: }
222: return(0);
223: }
226: /* -------------------------------------------------------*/
227: /* Should check that shapes of vectors and matrices match */
228: /* -------------------------------------------------------*/
229: #include petscblaslapack.h
233: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
234: {
235: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
236: PetscScalar *z,sum;
237: const PetscScalar *x;
238: const MatScalar *v;
239: PetscErrorCode ierr;
240: PetscInt mbs,i,*idx,*ii,n,*ridx=PETSC_NULL,nonzerorow=0;
241: PetscTruth usecprow=a->compressedrow.use;
244: VecGetArray(xx,(PetscScalar**)&x);
245: VecGetArray(zz,&z);
247: idx = a->j;
248: v = a->a;
249: if (usecprow){
250: mbs = a->compressedrow.nrows;
251: ii = a->compressedrow.i;
252: ridx = a->compressedrow.rindex;
253: } else {
254: mbs = a->mbs;
255: ii = a->i;
256: }
258: for (i=0; i<mbs; i++) {
259: n = ii[1] - ii[0]; ii++;
260: sum = 0.0;
261: nonzerorow += (n>0);
262: while (n--) sum += *v++ * x[*idx++];
263: if (usecprow){
264: z[ridx[i]] = sum;
265: } else {
266: z[i] = sum;
267: }
268: }
269: VecRestoreArray(xx,(PetscScalar**)&x);
270: VecRestoreArray(zz,&z);
271: PetscLogFlops(2*a->nz - nonzerorow);
272: return(0);
273: }
277: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
278: {
279: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
280: PetscScalar *z = 0,sum1,sum2,*zarray;
281: const PetscScalar *x,*xb;
282: PetscScalar x1,x2;
283: const MatScalar *v;
284: PetscErrorCode ierr;
285: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
286: PetscTruth usecprow=a->compressedrow.use;
289: VecGetArray(xx,(PetscScalar**)&x);
290: VecGetArray(zz,&zarray);
292: idx = a->j;
293: v = a->a;
294: if (usecprow){
295: mbs = a->compressedrow.nrows;
296: ii = a->compressedrow.i;
297: ridx = a->compressedrow.rindex;
298: } else {
299: mbs = a->mbs;
300: ii = a->i;
301: z = zarray;
302: }
304: for (i=0; i<mbs; i++) {
305: n = ii[1] - ii[0]; ii++;
306: sum1 = 0.0; sum2 = 0.0;
307: nonzerorow += (n>0);
308: for (j=0; j<n; j++) {
309: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
310: sum1 += v[0]*x1 + v[2]*x2;
311: sum2 += v[1]*x1 + v[3]*x2;
312: v += 4;
313: }
314: if (usecprow) z = zarray + 2*ridx[i];
315: z[0] = sum1; z[1] = sum2;
316: if (!usecprow) z += 2;
317: }
318: VecRestoreArray(xx,(PetscScalar**)&x);
319: VecRestoreArray(zz,&zarray);
320: PetscLogFlops(8*a->nz - 2*nonzerorow);
321: return(0);
322: }
326: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
327: {
328: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
329: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
330: const PetscScalar *x,*xb;
331: const MatScalar *v;
332: PetscErrorCode ierr;
333: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
334: PetscTruth usecprow=a->compressedrow.use;
335:
337: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
338: #pragma disjoint(*v,*z,*xb)
339: #endif
342: VecGetArray(xx,(PetscScalar**)&x);
343: VecGetArray(zz,&zarray);
345: idx = a->j;
346: v = a->a;
347: if (usecprow){
348: mbs = a->compressedrow.nrows;
349: ii = a->compressedrow.i;
350: ridx = a->compressedrow.rindex;
351: } else {
352: mbs = a->mbs;
353: ii = a->i;
354: z = zarray;
355: }
357: for (i=0; i<mbs; i++) {
358: n = ii[1] - ii[0]; ii++;
359: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
360: nonzerorow += (n>0);
361: for (j=0; j<n; j++) {
362: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
363: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
364: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
365: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
366: v += 9;
367: }
368: if (usecprow) z = zarray + 3*ridx[i];
369: z[0] = sum1; z[1] = sum2; z[2] = sum3;
370: if (!usecprow) z += 3;
371: }
372: VecRestoreArray(xx,(PetscScalar**)&x);
373: VecRestoreArray(zz,&zarray);
374: PetscLogFlops(18*a->nz - 3*nonzerorow);
375: return(0);
376: }
380: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
381: {
382: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
383: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
384: const PetscScalar *x,*xb;
385: const MatScalar *v;
386: PetscErrorCode ierr;
387: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
388: PetscTruth usecprow=a->compressedrow.use;
391: VecGetArray(xx,(PetscScalar**)&x);
392: VecGetArray(zz,&zarray);
394: idx = a->j;
395: v = a->a;
396: if (usecprow){
397: mbs = a->compressedrow.nrows;
398: ii = a->compressedrow.i;
399: ridx = a->compressedrow.rindex;
400: } else {
401: mbs = a->mbs;
402: ii = a->i;
403: z = zarray;
404: }
406: for (i=0; i<mbs; i++) {
407: n = ii[1] - ii[0]; ii++;
408: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
409: nonzerorow += (n>0);
410: for (j=0; j<n; j++) {
411: xb = x + 4*(*idx++);
412: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
413: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
414: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
415: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
416: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
417: v += 16;
418: }
419: if (usecprow) z = zarray + 4*ridx[i];
420: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
421: if (!usecprow) z += 4;
422: }
423: VecRestoreArray(xx,(PetscScalar**)&x);
424: VecRestoreArray(zz,&zarray);
425: PetscLogFlops(32*a->nz - 4*nonzerorow);
426: return(0);
427: }
431: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
432: {
433: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
434: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
435: const PetscScalar *xb,*x;
436: const MatScalar *v;
437: PetscErrorCode ierr;
438: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
439: PetscTruth usecprow=a->compressedrow.use;
442: VecGetArray(xx,(PetscScalar**)&x);
443: VecGetArray(zz,&zarray);
445: idx = a->j;
446: v = a->a;
447: if (usecprow){
448: mbs = a->compressedrow.nrows;
449: ii = a->compressedrow.i;
450: ridx = a->compressedrow.rindex;
451: } else {
452: mbs = a->mbs;
453: ii = a->i;
454: z = zarray;
455: }
457: for (i=0; i<mbs; i++) {
458: n = ii[1] - ii[0]; ii++;
459: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
460: nonzerorow += (n>0);
461: for (j=0; j<n; j++) {
462: xb = x + 5*(*idx++);
463: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
464: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
465: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
466: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
467: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
468: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
469: v += 25;
470: }
471: if (usecprow) z = zarray + 5*ridx[i];
472: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
473: if (!usecprow) z += 5;
474: }
475: VecRestoreArray(xx,(PetscScalar**)&x);
476: VecRestoreArray(zz,&zarray);
477: PetscLogFlops(50*a->nz - 5*nonzerorow);
478: return(0);
479: }
484: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
485: {
486: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
487: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
488: const PetscScalar *x,*xb;
489: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
490: const MatScalar *v;
491: PetscErrorCode ierr;
492: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
493: PetscTruth usecprow=a->compressedrow.use;
496: VecGetArray(xx,(PetscScalar**)&x);
497: VecGetArray(zz,&zarray);
499: idx = a->j;
500: v = a->a;
501: if (usecprow){
502: mbs = a->compressedrow.nrows;
503: ii = a->compressedrow.i;
504: ridx = a->compressedrow.rindex;
505: } else {
506: mbs = a->mbs;
507: ii = a->i;
508: z = zarray;
509: }
511: for (i=0; i<mbs; i++) {
512: n = ii[1] - ii[0]; ii++;
513: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
514: nonzerorow += (n>0);
515: for (j=0; j<n; j++) {
516: xb = x + 6*(*idx++);
517: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
518: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
519: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
520: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
521: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
522: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
523: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
524: v += 36;
525: }
526: if (usecprow) z = zarray + 6*ridx[i];
527: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
528: if (!usecprow) z += 6;
529: }
531: VecRestoreArray(xx,(PetscScalar**)&x);
532: VecRestoreArray(zz,&zarray);
533: PetscLogFlops(72*a->nz - 6*nonzerorow);
534: return(0);
535: }
538: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
539: {
540: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
541: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
542: const PetscScalar *x,*xb;
543: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
544: const MatScalar *v;
545: PetscErrorCode ierr;
546: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
547: PetscTruth usecprow=a->compressedrow.use;
550: VecGetArray(xx,(PetscScalar**)&x);
551: VecGetArray(zz,&zarray);
553: idx = a->j;
554: v = a->a;
555: if (usecprow){
556: mbs = a->compressedrow.nrows;
557: ii = a->compressedrow.i;
558: ridx = a->compressedrow.rindex;
559: } else {
560: mbs = a->mbs;
561: ii = a->i;
562: z = zarray;
563: }
565: for (i=0; i<mbs; i++) {
566: n = ii[1] - ii[0]; ii++;
567: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
568: nonzerorow += (n>0);
569: for (j=0; j<n; j++) {
570: xb = x + 7*(*idx++);
571: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
572: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
573: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
574: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
575: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
576: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
577: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
578: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
579: v += 49;
580: }
581: if (usecprow) z = zarray + 7*ridx[i];
582: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
583: if (!usecprow) z += 7;
584: }
586: VecRestoreArray(xx,(PetscScalar**)&x);
587: VecRestoreArray(zz,&zarray);
588: PetscLogFlops(98*a->nz - 7*nonzerorow);
589: return(0);
590: }
592: /*
593: This will not work with MatScalar == float because it calls the BLAS
594: */
597: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
598: {
599: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
600: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
601: MatScalar *v;
603: PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
604: PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
605: PetscTruth usecprow=a->compressedrow.use;
608: VecGetArray(xx,&x);
609: VecGetArray(zz,&zarray);
611: idx = a->j;
612: v = a->a;
613: if (usecprow){
614: mbs = a->compressedrow.nrows;
615: ii = a->compressedrow.i;
616: ridx = a->compressedrow.rindex;
617: } else {
618: mbs = a->mbs;
619: ii = a->i;
620: z = zarray;
621: }
623: if (!a->mult_work) {
624: k = PetscMax(A->rmap.n,A->cmap.n);
625: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
626: }
627: work = a->mult_work;
628: for (i=0; i<mbs; i++) {
629: n = ii[1] - ii[0]; ii++;
630: ncols = n*bs;
631: workt = work;
632: nonzerorow += (n>0);
633: for (j=0; j<n; j++) {
634: xb = x + bs*(*idx++);
635: for (k=0; k<bs; k++) workt[k] = xb[k];
636: workt += bs;
637: }
638: if (usecprow) z = zarray + bs*ridx[i];
639: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
640: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
641: v += n*bs2;
642: if (!usecprow) z += bs;
643: }
644: VecRestoreArray(xx,&x);
645: VecRestoreArray(zz,&zarray);
646: PetscLogFlops(2*a->nz*bs2 - bs*nonzerorow);
647: return(0);
648: }
653: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
654: {
655: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
656: PetscScalar *x,*y = 0,*z = 0,sum,*yarray,*zarray;
657: MatScalar *v;
659: PetscInt mbs=a->mbs,i,*idx,*ii,n,*ridx=PETSC_NULL;
660: PetscTruth usecprow=a->compressedrow.use;
663: VecGetArray(xx,&x);
664: VecGetArray(yy,&yarray);
665: if (zz != yy) {
666: VecGetArray(zz,&zarray);
667: } else {
668: zarray = yarray;
669: }
671: idx = a->j;
672: v = a->a;
673: if (usecprow){
674: if (zz != yy){
675: PetscMemcpy(zarray,yarray,mbs*sizeof(PetscScalar));
676: }
677: mbs = a->compressedrow.nrows;
678: ii = a->compressedrow.i;
679: ridx = a->compressedrow.rindex;
680: } else {
681: ii = a->i;
682: y = yarray;
683: z = zarray;
684: }
686: for (i=0; i<mbs; i++) {
687: n = ii[1] - ii[0]; ii++;
688: if (usecprow){
689: z = zarray + ridx[i];
690: y = yarray + ridx[i];
691: }
692: sum = y[0];
693: while (n--) sum += *v++ * x[*idx++];
694: z[0] = sum;
695: if (!usecprow){
696: z++; y++;
697: }
698: }
699: VecRestoreArray(xx,&x);
700: VecRestoreArray(yy,&yarray);
701: if (zz != yy) {
702: VecRestoreArray(zz,&zarray);
703: }
704: PetscLogFlops(2*a->nz);
705: return(0);
706: }
710: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
711: {
712: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
713: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2;
714: PetscScalar x1,x2,*yarray,*zarray;
715: MatScalar *v;
717: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
718: PetscTruth usecprow=a->compressedrow.use;
721: VecGetArray(xx,&x);
722: VecGetArray(yy,&yarray);
723: if (zz != yy) {
724: VecGetArray(zz,&zarray);
725: } else {
726: zarray = yarray;
727: }
729: idx = a->j;
730: v = a->a;
731: if (usecprow){
732: if (zz != yy){
733: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
734: }
735: mbs = a->compressedrow.nrows;
736: ii = a->compressedrow.i;
737: ridx = a->compressedrow.rindex;
738: if (zz != yy){
739: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
740: }
741: } else {
742: ii = a->i;
743: y = yarray;
744: z = zarray;
745: }
747: for (i=0; i<mbs; i++) {
748: n = ii[1] - ii[0]; ii++;
749: if (usecprow){
750: z = zarray + 2*ridx[i];
751: y = yarray + 2*ridx[i];
752: }
753: sum1 = y[0]; sum2 = y[1];
754: for (j=0; j<n; j++) {
755: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
756: sum1 += v[0]*x1 + v[2]*x2;
757: sum2 += v[1]*x1 + v[3]*x2;
758: v += 4;
759: }
760: z[0] = sum1; z[1] = sum2;
761: if (!usecprow){
762: z += 2; y += 2;
763: }
764: }
765: VecRestoreArray(xx,&x);
766: VecRestoreArray(yy,&yarray);
767: if (zz != yy) {
768: VecRestoreArray(zz,&zarray);
769: }
770: PetscLogFlops(4*a->nz);
771: return(0);
772: }
776: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
777: {
778: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
779: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
780: MatScalar *v;
782: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
783: PetscTruth usecprow=a->compressedrow.use;
786: VecGetArray(xx,&x);
787: VecGetArray(yy,&yarray);
788: if (zz != yy) {
789: VecGetArray(zz,&zarray);
790: } else {
791: zarray = yarray;
792: }
794: idx = a->j;
795: v = a->a;
796: if (usecprow){
797: if (zz != yy){
798: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
799: }
800: mbs = a->compressedrow.nrows;
801: ii = a->compressedrow.i;
802: ridx = a->compressedrow.rindex;
803: } else {
804: ii = a->i;
805: y = yarray;
806: z = zarray;
807: }
809: for (i=0; i<mbs; i++) {
810: n = ii[1] - ii[0]; ii++;
811: if (usecprow){
812: z = zarray + 3*ridx[i];
813: y = yarray + 3*ridx[i];
814: }
815: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
816: for (j=0; j<n; j++) {
817: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
818: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
819: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
820: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
821: v += 9;
822: }
823: z[0] = sum1; z[1] = sum2; z[2] = sum3;
824: if (!usecprow){
825: z += 3; y += 3;
826: }
827: }
828: VecRestoreArray(xx,&x);
829: VecRestoreArray(yy,&yarray);
830: if (zz != yy) {
831: VecRestoreArray(zz,&zarray);
832: }
833: PetscLogFlops(18*a->nz);
834: return(0);
835: }
839: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
840: {
841: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
842: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
843: MatScalar *v;
845: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
846: PetscTruth usecprow=a->compressedrow.use;
849: VecGetArray(xx,&x);
850: VecGetArray(yy,&yarray);
851: if (zz != yy) {
852: VecGetArray(zz,&zarray);
853: } else {
854: zarray = yarray;
855: }
857: idx = a->j;
858: v = a->a;
859: if (usecprow){
860: if (zz != yy){
861: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
862: }
863: mbs = a->compressedrow.nrows;
864: ii = a->compressedrow.i;
865: ridx = a->compressedrow.rindex;
866: } else {
867: ii = a->i;
868: y = yarray;
869: z = zarray;
870: }
872: for (i=0; i<mbs; i++) {
873: n = ii[1] - ii[0]; ii++;
874: if (usecprow){
875: z = zarray + 4*ridx[i];
876: y = yarray + 4*ridx[i];
877: }
878: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
879: for (j=0; j<n; j++) {
880: xb = x + 4*(*idx++);
881: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
882: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
883: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
884: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
885: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
886: v += 16;
887: }
888: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
889: if (!usecprow){
890: z += 4; y += 4;
891: }
892: }
893: VecRestoreArray(xx,&x);
894: VecRestoreArray(yy,&yarray);
895: if (zz != yy) {
896: VecRestoreArray(zz,&zarray);
897: }
898: PetscLogFlops(32*a->nz);
899: return(0);
900: }
904: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
905: {
906: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
907: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
908: PetscScalar *yarray,*zarray;
909: MatScalar *v;
911: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
912: PetscTruth usecprow=a->compressedrow.use;
915: VecGetArray(xx,&x);
916: VecGetArray(yy,&yarray);
917: if (zz != yy) {
918: VecGetArray(zz,&zarray);
919: } else {
920: zarray = yarray;
921: }
923: idx = a->j;
924: v = a->a;
925: if (usecprow){
926: if (zz != yy){
927: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
928: }
929: mbs = a->compressedrow.nrows;
930: ii = a->compressedrow.i;
931: ridx = a->compressedrow.rindex;
932: } else {
933: ii = a->i;
934: y = yarray;
935: z = zarray;
936: }
938: for (i=0; i<mbs; i++) {
939: n = ii[1] - ii[0]; ii++;
940: if (usecprow){
941: z = zarray + 5*ridx[i];
942: y = yarray + 5*ridx[i];
943: }
944: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
945: for (j=0; j<n; j++) {
946: xb = x + 5*(*idx++);
947: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
948: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
949: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
950: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
951: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
952: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
953: v += 25;
954: }
955: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
956: if (!usecprow){
957: z += 5; y += 5;
958: }
959: }
960: VecRestoreArray(xx,&x);
961: VecRestoreArray(yy,&yarray);
962: if (zz != yy) {
963: VecRestoreArray(zz,&zarray);
964: }
965: PetscLogFlops(50*a->nz);
966: return(0);
967: }
970: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
971: {
972: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
973: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
974: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
975: MatScalar *v;
977: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
978: PetscTruth usecprow=a->compressedrow.use;
981: VecGetArray(xx,&x);
982: VecGetArray(yy,&yarray);
983: if (zz != yy) {
984: VecGetArray(zz,&zarray);
985: } else {
986: zarray = yarray;
987: }
989: idx = a->j;
990: v = a->a;
991: if (usecprow){
992: if (zz != yy){
993: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
994: }
995: mbs = a->compressedrow.nrows;
996: ii = a->compressedrow.i;
997: ridx = a->compressedrow.rindex;
998: } else {
999: ii = a->i;
1000: y = yarray;
1001: z = zarray;
1002: }
1004: for (i=0; i<mbs; i++) {
1005: n = ii[1] - ii[0]; ii++;
1006: if (usecprow){
1007: z = zarray + 6*ridx[i];
1008: y = yarray + 6*ridx[i];
1009: }
1010: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1011: for (j=0; j<n; j++) {
1012: xb = x + 6*(*idx++);
1013: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1014: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1015: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1016: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1017: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1018: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1019: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1020: v += 36;
1021: }
1022: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1023: if (!usecprow){
1024: z += 6; y += 6;
1025: }
1026: }
1027: VecRestoreArray(xx,&x);
1028: VecRestoreArray(yy,&yarray);
1029: if (zz != yy) {
1030: VecRestoreArray(zz,&zarray);
1031: }
1032: PetscLogFlops(72*a->nz);
1033: return(0);
1034: }
1038: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1039: {
1040: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1041: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1042: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1043: MatScalar *v;
1045: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1046: PetscTruth usecprow=a->compressedrow.use;
1049: VecGetArray(xx,&x);
1050: VecGetArray(yy,&yarray);
1051: if (zz != yy) {
1052: VecGetArray(zz,&zarray);
1053: } else {
1054: zarray = yarray;
1055: }
1057: idx = a->j;
1058: v = a->a;
1059: if (usecprow){
1060: if (zz != yy){
1061: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1062: }
1063: mbs = a->compressedrow.nrows;
1064: ii = a->compressedrow.i;
1065: ridx = a->compressedrow.rindex;
1066: } else {
1067: ii = a->i;
1068: y = yarray;
1069: z = zarray;
1070: }
1072: for (i=0; i<mbs; i++) {
1073: n = ii[1] - ii[0]; ii++;
1074: if (usecprow){
1075: z = zarray + 7*ridx[i];
1076: y = yarray + 7*ridx[i];
1077: }
1078: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1079: for (j=0; j<n; j++) {
1080: xb = x + 7*(*idx++);
1081: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1082: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1083: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1084: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1085: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1086: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1087: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1088: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1089: v += 49;
1090: }
1091: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1092: if (!usecprow){
1093: z += 7; y += 7;
1094: }
1095: }
1096: VecRestoreArray(xx,&x);
1097: VecRestoreArray(yy,&yarray);
1098: if (zz != yy) {
1099: VecRestoreArray(zz,&zarray);
1100: }
1101: PetscLogFlops(98*a->nz);
1102: return(0);
1103: }
1107: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1108: {
1109: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1110: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
1111: MatScalar *v;
1113: PetscInt mbs,i,*idx,*ii,bs=A->rmap.bs,j,n,bs2=a->bs2;
1114: PetscInt ncols,k,*ridx=PETSC_NULL;
1115: PetscTruth usecprow=a->compressedrow.use;
1118: VecCopy_Seq(yy,zz);
1119: VecGetArray(xx,&x);
1120: VecGetArray(zz,&zarray);
1122: idx = a->j;
1123: v = a->a;
1124: if (usecprow){
1125: mbs = a->compressedrow.nrows;
1126: ii = a->compressedrow.i;
1127: ridx = a->compressedrow.rindex;
1128: } else {
1129: mbs = a->mbs;
1130: ii = a->i;
1131: z = zarray;
1132: }
1134: if (!a->mult_work) {
1135: k = PetscMax(A->rmap.n,A->cmap.n);
1136: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1137: }
1138: work = a->mult_work;
1139: for (i=0; i<mbs; i++) {
1140: n = ii[1] - ii[0]; ii++;
1141: ncols = n*bs;
1142: workt = work;
1143: for (j=0; j<n; j++) {
1144: xb = x + bs*(*idx++);
1145: for (k=0; k<bs; k++) workt[k] = xb[k];
1146: workt += bs;
1147: }
1148: if (usecprow) z = zarray + bs*ridx[i];
1149: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1150: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1151: v += n*bs2;
1152: if (!usecprow){
1153: z += bs;
1154: }
1155: }
1156: VecRestoreArray(xx,&x);
1157: VecRestoreArray(zz,&zarray);
1158: PetscLogFlops(2*a->nz*bs2);
1159: return(0);
1160: }
1164: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1165: {
1166: PetscScalar zero = 0.0;
1170: VecSet(zz,zero);
1171: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1172: return(0);
1173: }
1177: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1179: {
1180: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1181: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1182: MatScalar *v;
1183: PetscErrorCode ierr;
1184: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap.bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1185: Mat_CompressedRow cprow = a->compressedrow;
1186: PetscTruth usecprow=cprow.use;
1189: if (yy != zz) { VecCopy_Seq(yy,zz); }
1190: VecGetArray(xx,&x);
1191: VecGetArray(zz,&z);
1193: idx = a->j;
1194: v = a->a;
1195: if (usecprow){
1196: mbs = cprow.nrows;
1197: ii = cprow.i;
1198: ridx = cprow.rindex;
1199: } else {
1200: mbs=a->mbs;
1201: ii = a->i;
1202: xb = x;
1203: }
1205: switch (bs) {
1206: case 1:
1207: for (i=0; i<mbs; i++) {
1208: if (usecprow) xb = x + ridx[i];
1209: x1 = xb[0];
1210: ib = idx + ii[0];
1211: n = ii[1] - ii[0]; ii++;
1212: for (j=0; j<n; j++) {
1213: rval = ib[j];
1214: z[rval] += *v * x1;
1215: v++;
1216: }
1217: if (!usecprow) xb++;
1218: }
1219: break;
1220: case 2:
1221: for (i=0; i<mbs; i++) {
1222: if (usecprow) xb = x + 2*ridx[i];
1223: x1 = xb[0]; x2 = xb[1];
1224: ib = idx + ii[0];
1225: n = ii[1] - ii[0]; ii++;
1226: for (j=0; j<n; j++) {
1227: rval = ib[j]*2;
1228: z[rval++] += v[0]*x1 + v[1]*x2;
1229: z[rval++] += v[2]*x1 + v[3]*x2;
1230: v += 4;
1231: }
1232: if (!usecprow) xb += 2;
1233: }
1234: break;
1235: case 3:
1236: for (i=0; i<mbs; i++) {
1237: if (usecprow) xb = x + 3*ridx[i];
1238: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1239: ib = idx + ii[0];
1240: n = ii[1] - ii[0]; ii++;
1241: for (j=0; j<n; j++) {
1242: rval = ib[j]*3;
1243: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1244: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1245: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1246: v += 9;
1247: }
1248: if (!usecprow) xb += 3;
1249: }
1250: break;
1251: case 4:
1252: for (i=0; i<mbs; i++) {
1253: if (usecprow) xb = x + 4*ridx[i];
1254: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1255: ib = idx + ii[0];
1256: n = ii[1] - ii[0]; ii++;
1257: for (j=0; j<n; j++) {
1258: rval = ib[j]*4;
1259: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1260: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1261: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1262: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1263: v += 16;
1264: }
1265: if (!usecprow) xb += 4;
1266: }
1267: break;
1268: case 5:
1269: for (i=0; i<mbs; i++) {
1270: if (usecprow) xb = x + 5*ridx[i];
1271: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1272: x4 = xb[3]; x5 = xb[4];
1273: ib = idx + ii[0];
1274: n = ii[1] - ii[0]; ii++;
1275: for (j=0; j<n; j++) {
1276: rval = ib[j]*5;
1277: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1278: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1279: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1280: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1281: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1282: v += 25;
1283: }
1284: if (!usecprow) xb += 5;
1285: }
1286: break;
1287: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1288: PetscInt ncols,k;
1289: PetscScalar *work,*workt,*xtmp;
1291: if (!a->mult_work) {
1292: k = PetscMax(A->rmap.n,A->cmap.n);
1293: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1294: }
1295: work = a->mult_work;
1296: xtmp = x;
1297: for (i=0; i<mbs; i++) {
1298: n = ii[1] - ii[0]; ii++;
1299: ncols = n*bs;
1300: PetscMemzero(work,ncols*sizeof(PetscScalar));
1301: if (usecprow) {
1302: xtmp = x + bs*ridx[i];
1303: }
1304: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1305: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1306: v += n*bs2;
1307: if (!usecprow) xtmp += bs;
1308: workt = work;
1309: for (j=0; j<n; j++) {
1310: zb = z + bs*(*idx++);
1311: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1312: workt += bs;
1313: }
1314: }
1315: }
1316: }
1317: VecRestoreArray(xx,&x);
1318: VecRestoreArray(zz,&z);
1319: PetscLogFlops(2*a->nz*a->bs2);
1320: return(0);
1321: }
1325: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1326: {
1327: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1328: PetscInt totalnz = a->bs2*a->nz;
1329: PetscScalar oalpha = alpha;
1331: #if defined(PETSC_USE_MAT_SINGLE)
1332: PetscInt i;
1333: #else
1334: PetscBLASInt tnz = (PetscBLASInt) totalnz,one = 1;
1335: #endif
1338: #if defined(PETSC_USE_MAT_SINGLE)
1339: for (i=0; i<totalnz; i++) a->a[i] *= oalpha;
1340: #else
1341: BLASscal_(&tnz,&oalpha,a->a,&one);
1342: #endif
1343: PetscLogFlops(totalnz);
1344: return(0);
1345: }
1349: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1350: {
1352: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1353: MatScalar *v = a->a;
1354: PetscReal sum = 0.0;
1355: PetscInt i,j,k,bs=A->rmap.bs,nz=a->nz,bs2=a->bs2,k1;
1358: if (type == NORM_FROBENIUS) {
1359: for (i=0; i< bs2*nz; i++) {
1360: #if defined(PETSC_USE_COMPLEX)
1361: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1362: #else
1363: sum += (*v)*(*v); v++;
1364: #endif
1365: }
1366: *norm = sqrt(sum);
1367: } else if (type == NORM_1) { /* maximum column sum */
1368: PetscReal *tmp;
1369: PetscInt *bcol = a->j;
1370: PetscMalloc((A->cmap.n+1)*sizeof(PetscReal),&tmp);
1371: PetscMemzero(tmp,A->cmap.n*sizeof(PetscReal));
1372: for (i=0; i<nz; i++){
1373: for (j=0; j<bs; j++){
1374: k1 = bs*(*bcol) + j; /* column index */
1375: for (k=0; k<bs; k++){
1376: tmp[k1] += PetscAbsScalar(*v); v++;
1377: }
1378: }
1379: bcol++;
1380: }
1381: *norm = 0.0;
1382: for (j=0; j<A->cmap.n; j++) {
1383: if (tmp[j] > *norm) *norm = tmp[j];
1384: }
1385: PetscFree(tmp);
1386: } else if (type == NORM_INFINITY) { /* maximum row sum */
1387: *norm = 0.0;
1388: for (k=0; k<bs; k++) {
1389: for (j=0; j<a->mbs; j++) {
1390: v = a->a + bs2*a->i[j] + k;
1391: sum = 0.0;
1392: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1393: for (k1=0; k1<bs; k1++){
1394: sum += PetscAbsScalar(*v);
1395: v += bs;
1396: }
1397: }
1398: if (sum > *norm) *norm = sum;
1399: }
1400: }
1401: } else {
1402: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1403: }
1404: return(0);
1405: }
1410: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1411: {
1412: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1416: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1417: if ((A->rmap.N != B->rmap.N) || (A->cmap.n != B->cmap.n) || (A->rmap.bs != B->rmap.bs)|| (a->nz != b->nz)) {
1418: *flg = PETSC_FALSE;
1419: return(0);
1420: }
1421:
1422: /* if the a->i are the same */
1423: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1424: if (!*flg) {
1425: return(0);
1426: }
1427:
1428: /* if a->j are the same */
1429: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1430: if (!*flg) {
1431: return(0);
1432: }
1433: /* if a->a are the same */
1434: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap.bs)*(B->rmap.bs)*sizeof(PetscScalar),flg);
1435: return(0);
1436:
1437: }
1441: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1442: {
1443: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1445: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1446: PetscScalar *x,zero = 0.0;
1447: MatScalar *aa,*aa_j;
1450: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1451: bs = A->rmap.bs;
1452: aa = a->a;
1453: ai = a->i;
1454: aj = a->j;
1455: ambs = a->mbs;
1456: bs2 = a->bs2;
1458: VecSet(v,zero);
1459: VecGetArray(v,&x);
1460: VecGetLocalSize(v,&n);
1461: if (n != A->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1462: for (i=0; i<ambs; i++) {
1463: for (j=ai[i]; j<ai[i+1]; j++) {
1464: if (aj[j] == i) {
1465: row = i*bs;
1466: aa_j = aa+j*bs2;
1467: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1468: break;
1469: }
1470: }
1471: }
1472: VecRestoreArray(v,&x);
1473: return(0);
1474: }
1478: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1479: {
1480: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1481: PetscScalar *l,*r,x,*li,*ri;
1482: MatScalar *aa,*v;
1484: PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1487: ai = a->i;
1488: aj = a->j;
1489: aa = a->a;
1490: m = A->rmap.n;
1491: n = A->cmap.n;
1492: bs = A->rmap.bs;
1493: mbs = a->mbs;
1494: bs2 = a->bs2;
1495: if (ll) {
1496: VecGetArray(ll,&l);
1497: VecGetLocalSize(ll,&lm);
1498: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1499: for (i=0; i<mbs; i++) { /* for each block row */
1500: M = ai[i+1] - ai[i];
1501: li = l + i*bs;
1502: v = aa + bs2*ai[i];
1503: for (j=0; j<M; j++) { /* for each block */
1504: for (k=0; k<bs2; k++) {
1505: (*v++) *= li[k%bs];
1506: }
1507: }
1508: }
1509: VecRestoreArray(ll,&l);
1510: PetscLogFlops(a->nz);
1511: }
1512:
1513: if (rr) {
1514: VecGetArray(rr,&r);
1515: VecGetLocalSize(rr,&rn);
1516: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1517: for (i=0; i<mbs; i++) { /* for each block row */
1518: M = ai[i+1] - ai[i];
1519: v = aa + bs2*ai[i];
1520: for (j=0; j<M; j++) { /* for each block */
1521: ri = r + bs*aj[ai[i]+j];
1522: for (k=0; k<bs; k++) {
1523: x = ri[k];
1524: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1525: }
1526: }
1527: }
1528: VecRestoreArray(rr,&r);
1529: PetscLogFlops(a->nz);
1530: }
1531: return(0);
1532: }
1537: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1538: {
1539: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1542: info->rows_global = (double)A->rmap.N;
1543: info->columns_global = (double)A->cmap.n;
1544: info->rows_local = (double)A->rmap.n;
1545: info->columns_local = (double)A->cmap.n;
1546: info->block_size = a->bs2;
1547: info->nz_allocated = a->maxnz;
1548: info->nz_used = a->bs2*a->nz;
1549: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1550: info->assemblies = A->num_ass;
1551: info->mallocs = a->reallocs;
1552: info->memory = ((PetscObject)A)->mem;
1553: if (A->factor) {
1554: info->fill_ratio_given = A->info.fill_ratio_given;
1555: info->fill_ratio_needed = A->info.fill_ratio_needed;
1556: info->factor_mallocs = A->info.factor_mallocs;
1557: } else {
1558: info->fill_ratio_given = 0;
1559: info->fill_ratio_needed = 0;
1560: info->factor_mallocs = 0;
1561: }
1562: return(0);
1563: }
1568: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
1569: {
1570: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1574: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1575: return(0);
1576: }