Actual source code: baij2.c
1: /*$Id: baij2.c,v 1.75 2001/09/07 20:09:49 bsmith Exp $*/
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/vec/vecimpl.h
5: #include src/inline/spops.h
6: #include src/inline/ilu.h
7: #include petscbt.h
9: int MatIncreaseOverlap_SeqBAIJ(Mat A,int is_max,IS *is,int ov)
10: {
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
12: int row,i,j,k,l,m,n,*idx,ierr,*nidx,isz,val,ival;
13: int start,end,*ai,*aj,bs,*nidx2;
14: PetscBT table;
17: m = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = a->bs;
22: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
24: PetscBTCreate(m,table);
25: PetscMalloc((m+1)*sizeof(int),&nidx);
26: PetscMalloc((A->m+1)*sizeof(int),&nidx2);
28: for (i=0; i<is_max; i++) {
29: /* Initialise the two local arrays */
30: isz = 0;
31: PetscBTMemzero(m,table);
32:
33: /* Extract the indices, assume there can be duplicate entries */
34: ISGetIndices(is[i],&idx);
35: ISGetLocalSize(is[i],&n);
37: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
38: for (j=0; j<n ; ++j){
39: ival = idx[j]/bs; /* convert the indices into block indices */
40: if (ival>m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
42: }
43: ISRestoreIndices(is[i],&idx);
44: ISDestroy(is[i]);
45:
46: k = 0;
47: for (j=0; j<ov; j++){ /* for each overlap*/
48: n = isz;
49: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
50: row = nidx[k];
51: start = ai[row];
52: end = ai[row+1];
53: for (l = start; l<end ; l++){
54: val = aj[l];
55: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
56: }
57: }
58: }
59: /* expand the Index Set */
60: for (j=0; j<isz; j++) {
61: for (k=0; k<bs; k++)
62: nidx2[j*bs+k] = nidx[j]*bs+k;
63: }
64: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
65: }
66: PetscBTDestroy(table);
67: PetscFree(nidx);
68: PetscFree(nidx2);
69: return(0);
70: }
72: int MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
73: {
74: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
75: int *smap,i,k,kstart,kend,ierr,oldcols = a->nbs,*lens;
76: int row,mat_i,*mat_j,tcol,*mat_ilen;
77: int *irow,*icol,nrows,ncols,*ssmap,bs=a->bs,bs2=a->bs2;
78: int *aj = a->j,*ai = a->i;
79: MatScalar *mat_a;
80: Mat C;
81: PetscTruth flag;
84: ISSorted(iscol,(PetscTruth*)&i);
85: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
87: ISGetIndices(isrow,&irow);
88: ISGetIndices(iscol,&icol);
89: ISGetLocalSize(isrow,&nrows);
90: ISGetLocalSize(iscol,&ncols);
92: PetscMalloc((1+oldcols)*sizeof(int),&smap);
93: ssmap = smap;
94: PetscMalloc((1+nrows)*sizeof(int),&lens);
95: ierr = PetscMemzero(smap,oldcols*sizeof(int));
96: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
97: /* determine lens of each row */
98: for (i=0; i<nrows; i++) {
99: kstart = ai[irow[i]];
100: kend = kstart + a->ilen[irow[i]];
101: lens[i] = 0;
102: for (k=kstart; k<kend; k++) {
103: if (ssmap[aj[k]]) {
104: lens[i]++;
105: }
106: }
107: }
108: /* Create and fill new matrix */
109: if (scall == MAT_REUSE_MATRIX) {
110: c = (Mat_SeqBAIJ *)((*B)->data);
112: if (c->mbs!=nrows || c->nbs!=ncols || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
114: if (flag == PETSC_FALSE) {
115: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
116: }
117: PetscMemzero(c->ilen,c->mbs*sizeof(int));
118: C = *B;
119: } else {
120: MatCreateSeqBAIJ(A->comm,bs,nrows*bs,ncols*bs,0,lens,&C);
121: }
122: c = (Mat_SeqBAIJ *)(C->data);
123: for (i=0; i<nrows; i++) {
124: row = irow[i];
125: kstart = ai[row];
126: kend = kstart + a->ilen[row];
127: mat_i = c->i[i];
128: mat_j = c->j + mat_i;
129: mat_a = c->a + mat_i*bs2;
130: mat_ilen = c->ilen + i;
131: for (k=kstart; k<kend; k++) {
132: if ((tcol=ssmap[a->j[k]])) {
133: *mat_j++ = tcol - 1;
134: ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
135: mat_a += bs2;
136: (*mat_ilen)++;
137: }
138: }
139: }
140:
141: /* Free work space */
142: ISRestoreIndices(iscol,&icol);
143: PetscFree(smap);
144: PetscFree(lens);
145: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
146: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
147:
148: ISRestoreIndices(isrow,&irow);
149: *B = C;
150: return(0);
151: }
153: int MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
154: {
155: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
156: IS is1,is2;
157: int *vary,*iary,*irow,*icol,nrows,ncols,i,ierr,bs=a->bs,count;
160: ISGetIndices(isrow,&irow);
161: ISGetIndices(iscol,&icol);
162: ISGetLocalSize(isrow,&nrows);
163: ISGetLocalSize(iscol,&ncols);
164:
165: /* Verify if the indices corespond to each element in a block
166: and form the IS with compressed IS */
167: PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
168: iary = vary + a->mbs;
169: PetscMemzero(vary,(a->mbs)*sizeof(int));
170: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
171: count = 0;
172: for (i=0; i<a->mbs; i++) {
173: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
174: if (vary[i]==bs) iary[count++] = i;
175: }
176: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
177:
178: PetscMemzero(vary,(a->mbs)*sizeof(int));
179: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
180: count = 0;
181: for (i=0; i<a->mbs; i++) {
182: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Internal error in PETSc");
183: if (vary[i]==bs) iary[count++] = i;
184: }
185: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
186: ISRestoreIndices(isrow,&irow);
187: ISRestoreIndices(iscol,&icol);
188: PetscFree(vary);
190: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,cs,scall,B);
191: ISDestroy(is1);
192: ISDestroy(is2);
193: return(0);
194: }
196: int MatGetSubMatrices_SeqBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
197: {
198: int ierr,i;
201: if (scall == MAT_INITIAL_MATRIX) {
202: PetscMalloc((n+1)*sizeof(Mat),B);
203: }
205: for (i=0; i<n; i++) {
206: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
207: }
208: return(0);
209: }
212: /* -------------------------------------------------------*/
213: /* Should check that shapes of vectors and matrices match */
214: /* -------------------------------------------------------*/
215: #include petscblaslapack.h
217: int MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
218: {
219: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
220: PetscScalar *x,*z,sum;
221: MatScalar *v;
222: int mbs=a->mbs,i,*idx,*ii,n,ierr;
225: VecGetArray(xx,&x);
226: VecGetArray(zz,&z);
228: idx = a->j;
229: v = a->a;
230: ii = a->i;
232: for (i=0; i<mbs; i++) {
233: n = ii[1] - ii[0]; ii++;
234: sum = 0.0;
235: while (n--) sum += *v++ * x[*idx++];
236: z[i] = sum;
237: }
238: VecRestoreArray(xx,&x);
239: VecRestoreArray(zz,&z);
240: PetscLogFlops(2*a->nz - A->m);
241: return(0);
242: }
244: int MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
245: {
246: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
247: PetscScalar *x,*z,*xb,sum1,sum2;
248: PetscScalar x1,x2;
249: MatScalar *v;
250: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
253: VecGetArray(xx,&x);
254: VecGetArray(zz,&z);
256: idx = a->j;
257: v = a->a;
258: ii = a->i;
260: for (i=0; i<mbs; i++) {
261: n = ii[1] - ii[0]; ii++;
262: sum1 = 0.0; sum2 = 0.0;
263: for (j=0; j<n; j++) {
264: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
265: sum1 += v[0]*x1 + v[2]*x2;
266: sum2 += v[1]*x1 + v[3]*x2;
267: v += 4;
268: }
269: z[0] = sum1; z[1] = sum2;
270: z += 2;
271: }
272: VecRestoreArray(xx,&x);
273: VecRestoreArray(zz,&z);
274: PetscLogFlops(8*a->nz - A->m);
275: return(0);
276: }
278: int MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
279: {
280: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
281: PetscScalar *x,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
282: MatScalar *v;
283: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
285: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
286: #pragma disjoint(*v,*z,*xb)
287: #endif
290: VecGetArray(xx,&x);
291: VecGetArray(zz,&z);
293: idx = a->j;
294: v = a->a;
295: ii = a->i;
297: for (i=0; i<mbs; i++) {
298: n = ii[1] - ii[0]; ii++;
299: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
300: for (j=0; j<n; j++) {
301: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
302: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
303: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
304: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
305: v += 9;
306: }
307: z[0] = sum1; z[1] = sum2; z[2] = sum3;
308: z += 3;
309: }
310: VecRestoreArray(xx,&x);
311: VecRestoreArray(zz,&z);
312: PetscLogFlops(18*a->nz - A->m);
313: return(0);
314: }
316: int MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
317: {
318: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
319: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
320: MatScalar *v;
321: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
324: VecGetArray(xx,&x);
325: VecGetArray(zz,&z);
327: idx = a->j;
328: v = a->a;
329: ii = a->i;
331: for (i=0; i<mbs; i++) {
332: n = ii[1] - ii[0]; ii++;
333: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
334: for (j=0; j<n; j++) {
335: xb = x + 4*(*idx++);
336: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
337: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
338: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
339: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
340: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
341: v += 16;
342: }
343: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
344: z += 4;
345: }
346: VecRestoreArray(xx,&x);
347: VecRestoreArray(zz,&z);
348: PetscLogFlops(32*a->nz - A->m);
349: return(0);
350: }
352: int MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
353: {
354: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
355: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*xb,*z,*x;
356: MatScalar *v;
357: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
360: VecGetArray(xx,&x);
361: VecGetArray(zz,&z);
363: idx = a->j;
364: v = a->a;
365: ii = a->i;
367: for (i=0; i<mbs; i++) {
368: n = ii[1] - ii[0]; ii++;
369: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
370: for (j=0; j<n; j++) {
371: xb = x + 5*(*idx++);
372: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
373: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
374: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
375: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
376: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
377: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
378: v += 25;
379: }
380: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
381: z += 5;
382: }
383: VecRestoreArray(xx,&x);
384: VecRestoreArray(zz,&z);
385: PetscLogFlops(50*a->nz - A->m);
386: return(0);
387: }
390: int MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
391: {
392: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
393: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
394: PetscScalar x1,x2,x3,x4,x5,x6;
395: MatScalar *v;
396: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
399: VecGetArray(xx,&x);
400: VecGetArray(zz,&z);
402: idx = a->j;
403: v = a->a;
404: ii = a->i;
406: for (i=0; i<mbs; i++) {
407: n = ii[1] - ii[0]; ii++;
408: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
409: for (j=0; j<n; j++) {
410: xb = x + 6*(*idx++);
411: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
412: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
413: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
414: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
415: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
416: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
417: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
418: v += 36;
419: }
420: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
421: z += 6;
422: }
424: VecRestoreArray(xx,&x);
425: VecRestoreArray(zz,&z);
426: PetscLogFlops(72*a->nz - A->m);
427: return(0);
428: }
429: int MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
430: {
431: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
432: PetscScalar *x,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
433: PetscScalar x1,x2,x3,x4,x5,x6,x7;
434: MatScalar *v;
435: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
438: VecGetArray(xx,&x);
439: VecGetArray(zz,&z);
441: idx = a->j;
442: v = a->a;
443: ii = a->i;
445: for (i=0; i<mbs; i++) {
446: n = ii[1] - ii[0]; ii++;
447: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
448: for (j=0; j<n; j++) {
449: xb = x + 7*(*idx++);
450: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
451: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
452: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
453: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
454: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
455: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
456: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
457: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
458: v += 49;
459: }
460: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
461: z += 7;
462: }
464: VecRestoreArray(xx,&x);
465: VecRestoreArray(zz,&z);
466: PetscLogFlops(98*a->nz - A->m);
467: return(0);
468: }
470: /*
471: This will not work with MatScalar == float because it calls the BLAS
472: */
473: int MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
474: {
475: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
476: PetscScalar *x,*z,*xb,*work,*workt;
477: MatScalar *v;
478: int ierr,mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2;
479: int ncols,k;
482: VecGetArray(xx,&x);
483: VecGetArray(zz,&z);
485: idx = a->j;
486: v = a->a;
487: ii = a->i;
490: if (!a->mult_work) {
491: k = PetscMax(A->m,A->n);
492: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
493: }
494: work = a->mult_work;
495: for (i=0; i<mbs; i++) {
496: n = ii[1] - ii[0]; ii++;
497: ncols = n*bs;
498: workt = work;
499: for (j=0; j<n; j++) {
500: xb = x + bs*(*idx++);
501: for (k=0; k<bs; k++) workt[k] = xb[k];
502: workt += bs;
503: }
504: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
505: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
506: v += n*bs2;
507: z += bs;
508: }
509: VecRestoreArray(xx,&x);
510: VecRestoreArray(zz,&z);
511: PetscLogFlops(2*a->nz*bs2 - A->m);
512: return(0);
513: }
515: int MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
516: {
517: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
518: PetscScalar *x,*y,*z,sum;
519: MatScalar *v;
520: int ierr,mbs=a->mbs,i,*idx,*ii,n;
523: VecGetArray(xx,&x);
524: VecGetArray(yy,&y);
525: if (zz != yy) {
526: VecGetArray(zz,&z);
527: } else {
528: z = y;
529: }
531: idx = a->j;
532: v = a->a;
533: ii = a->i;
535: for (i=0; i<mbs; i++) {
536: n = ii[1] - ii[0]; ii++;
537: sum = y[i];
538: while (n--) sum += *v++ * x[*idx++];
539: z[i] = sum;
540: }
541: VecRestoreArray(xx,&x);
542: VecRestoreArray(yy,&y);
543: if (zz != yy) {
544: VecRestoreArray(zz,&z);
545: }
546: PetscLogFlops(2*a->nz);
547: return(0);
548: }
550: int MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
551: {
552: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
553: PetscScalar *x,*y,*z,*xb,sum1,sum2;
554: PetscScalar x1,x2;
555: MatScalar *v;
556: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
559: VecGetArray(xx,&x);
560: VecGetArray(yy,&y);
561: if (zz != yy) {
562: VecGetArray(zz,&z);
563: } else {
564: z = y;
565: }
567: idx = a->j;
568: v = a->a;
569: ii = a->i;
571: for (i=0; i<mbs; i++) {
572: n = ii[1] - ii[0]; ii++;
573: sum1 = y[0]; sum2 = y[1];
574: for (j=0; j<n; j++) {
575: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
576: sum1 += v[0]*x1 + v[2]*x2;
577: sum2 += v[1]*x1 + v[3]*x2;
578: v += 4;
579: }
580: z[0] = sum1; z[1] = sum2;
581: z += 2; y += 2;
582: }
583: VecRestoreArray(xx,&x);
584: VecRestoreArray(yy,&y);
585: if (zz != yy) {
586: VecRestoreArray(zz,&z);
587: }
588: PetscLogFlops(4*a->nz);
589: return(0);
590: }
592: int MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
593: {
594: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
595: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,x1,x2,x3;
596: MatScalar *v;
597: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
600: VecGetArray(xx,&x);
601: VecGetArray(yy,&y);
602: if (zz != yy) {
603: VecGetArray(zz,&z);
604: } else {
605: z = y;
606: }
608: idx = a->j;
609: v = a->a;
610: ii = a->i;
612: for (i=0; i<mbs; i++) {
613: n = ii[1] - ii[0]; ii++;
614: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
615: for (j=0; j<n; j++) {
616: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
617: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
618: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
619: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
620: v += 9;
621: }
622: z[0] = sum1; z[1] = sum2; z[2] = sum3;
623: z += 3; y += 3;
624: }
625: VecRestoreArray(xx,&x);
626: VecRestoreArray(yy,&y);
627: if (zz != yy) {
628: VecRestoreArray(zz,&z);
629: }
630: PetscLogFlops(18*a->nz);
631: return(0);
632: }
634: int MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
635: {
636: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
637: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
638: MatScalar *v;
639: int ierr,mbs=a->mbs,i,*idx,*ii;
640: int j,n;
643: VecGetArray(xx,&x);
644: VecGetArray(yy,&y);
645: if (zz != yy) {
646: VecGetArray(zz,&z);
647: } else {
648: z = y;
649: }
651: idx = a->j;
652: v = a->a;
653: ii = a->i;
655: for (i=0; i<mbs; i++) {
656: n = ii[1] - ii[0]; ii++;
657: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
658: for (j=0; j<n; j++) {
659: xb = x + 4*(*idx++);
660: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
661: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
662: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
663: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
664: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
665: v += 16;
666: }
667: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
668: z += 4; y += 4;
669: }
670: VecRestoreArray(xx,&x);
671: VecRestoreArray(yy,&y);
672: if (zz != yy) {
673: VecRestoreArray(zz,&z);
674: }
675: PetscLogFlops(32*a->nz);
676: return(0);
677: }
679: int MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
680: {
681: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
682: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
683: MatScalar *v;
684: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
687: VecGetArray(xx,&x);
688: VecGetArray(yy,&y);
689: if (zz != yy) {
690: VecGetArray(zz,&z);
691: } else {
692: z = y;
693: }
695: idx = a->j;
696: v = a->a;
697: ii = a->i;
699: for (i=0; i<mbs; i++) {
700: n = ii[1] - ii[0]; ii++;
701: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
702: for (j=0; j<n; j++) {
703: xb = x + 5*(*idx++);
704: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
705: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
706: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
707: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
708: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
709: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
710: v += 25;
711: }
712: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
713: z += 5; y += 5;
714: }
715: VecRestoreArray(xx,&x);
716: VecRestoreArray(yy,&y);
717: if (zz != yy) {
718: VecRestoreArray(zz,&z);
719: }
720: PetscLogFlops(50*a->nz);
721: return(0);
722: }
723: int MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
724: {
725: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
726: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
727: PetscScalar x1,x2,x3,x4,x5,x6;
728: MatScalar *v;
729: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
732: VecGetArray(xx,&x);
733: VecGetArray(yy,&y);
734: if (zz != yy) {
735: VecGetArray(zz,&z);
736: } else {
737: z = y;
738: }
740: idx = a->j;
741: v = a->a;
742: ii = a->i;
744: for (i=0; i<mbs; i++) {
745: n = ii[1] - ii[0]; ii++;
746: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
747: for (j=0; j<n; j++) {
748: xb = x + 6*(*idx++);
749: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
750: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
751: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
752: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
753: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
754: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
755: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
756: v += 36;
757: }
758: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
759: z += 6; y += 6;
760: }
761: VecRestoreArray(xx,&x);
762: VecRestoreArray(yy,&y);
763: if (zz != yy) {
764: VecRestoreArray(zz,&z);
765: }
766: PetscLogFlops(72*a->nz);
767: return(0);
768: }
770: int MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
771: {
772: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
773: PetscScalar *x,*y,*z,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
774: PetscScalar x1,x2,x3,x4,x5,x6,x7;
775: MatScalar *v;
776: int ierr,mbs=a->mbs,i,*idx,*ii,j,n;
779: VecGetArray(xx,&x);
780: VecGetArray(yy,&y);
781: if (zz != yy) {
782: VecGetArray(zz,&z);
783: } else {
784: z = y;
785: }
787: idx = a->j;
788: v = a->a;
789: ii = a->i;
791: for (i=0; i<mbs; i++) {
792: n = ii[1] - ii[0]; ii++;
793: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
794: for (j=0; j<n; j++) {
795: xb = x + 7*(*idx++);
796: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
797: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
798: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
799: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
800: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
801: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
802: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
803: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
804: v += 49;
805: }
806: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
807: z += 7; y += 7;
808: }
809: VecRestoreArray(xx,&x);
810: VecRestoreArray(yy,&y);
811: if (zz != yy) {
812: VecRestoreArray(zz,&z);
813: }
814: PetscLogFlops(98*a->nz);
815: return(0);
816: }
818: int MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
819: {
820: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
821: PetscScalar *x,*z,*xb,*work,*workt;
822: MatScalar *v;
823: int mbs=a->mbs,i,*idx,*ii,bs=a->bs,j,n,bs2=a->bs2,ierr;
824: int ncols,k;
827: if (xx != yy) { VecCopy(yy,zz); }
829: VecGetArray(xx,&x);
830: VecGetArray(zz,&z);
831:
832: idx = a->j;
833: v = a->a;
834: ii = a->i;
837: if (!a->mult_work) {
838: k = PetscMax(A->m,A->n);
839: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
840: }
841: work = a->mult_work;
842: for (i=0; i<mbs; i++) {
843: n = ii[1] - ii[0]; ii++;
844: ncols = n*bs;
845: workt = work;
846: for (j=0; j<n; j++) {
847: xb = x + bs*(*idx++);
848: for (k=0; k<bs; k++) workt[k] = xb[k];
849: workt += bs;
850: }
851: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
852: /* LAgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
853: v += n*bs2;
854: z += bs;
855: }
856: VecRestoreArray(xx,&x);
857: VecRestoreArray(zz,&z);
858: PetscLogFlops(2*a->nz*bs2);
859: return(0);
860: }
862: int MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
863: {
864: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
865: PetscScalar *xg,*zg,*zb,zero = 0.0;
866: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
867: MatScalar *v;
868: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval;
869: int bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
872: VecSet(&zero,zz);
873: VecGetArray(xx,&xg); x = xg;
874: VecGetArray(zz,&zg); z = zg;
876: idx = a->j;
877: v = a->a;
878: ii = a->i;
879: xb = x;
880: switch (bs) {
881: case 1:
882: for (i=0; i<mbs; i++) {
883: n = ii[1] - ii[0]; ii++;
884: x1 = xb[0];
885: ib = idx + ai[i];
886: for (j=0; j<n; j++) {
887: rval = ib[j];
888: z[rval] += *v * x1;
889: v++;
890: }
891: xb++;
892: }
893: break;
894: case 2:
895: for (i=0; i<mbs; i++) {
896: n = ii[1] - ii[0]; ii++;
897: x1 = xb[0]; x2 = xb[1];
898: ib = idx + ai[i];
899: for (j=0; j<n; j++) {
900: rval = ib[j]*2;
901: z[rval++] += v[0]*x1 + v[1]*x2;
902: z[rval] += v[2]*x1 + v[3]*x2;
903: v += 4;
904: }
905: xb += 2;
906: }
907: break;
908: case 3:
909: for (i=0; i<mbs; i++) {
910: n = ii[1] - ii[0]; ii++;
911: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
912: ib = idx + ai[i];
913: for (j=0; j<n; j++) {
914: rval = ib[j]*3;
915: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
916: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
917: z[rval] += v[6]*x1 + v[7]*x2 + v[8]*x3;
918: v += 9;
919: }
920: xb += 3;
921: }
922: break;
923: case 4:
924: for (i=0; i<mbs; i++) {
925: n = ii[1] - ii[0]; ii++;
926: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
927: ib = idx + ai[i];
928: for (j=0; j<n; j++) {
929: rval = ib[j]*4;
930: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
931: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
932: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
933: z[rval] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
934: v += 16;
935: }
936: xb += 4;
937: }
938: break;
939: case 5:
940: for (i=0; i<mbs; i++) {
941: n = ii[1] - ii[0]; ii++;
942: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
943: x4 = xb[3]; x5 = xb[4];
944: ib = idx + ai[i];
945: for (j=0; j<n; j++) {
946: rval = ib[j]*5;
947: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
948: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
949: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
950: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
951: z[rval] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
952: v += 25;
953: }
954: xb += 5;
955: }
956: break;
957: case 6:
958: for (i=0; i<mbs; i++) {
959: n = ii[1] - ii[0]; ii++;
960: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
961: x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
962: ib = idx + ai[i];
963: for (j=0; j<n; j++) {
964: rval = ib[j]*6;
965: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
966: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
967: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
968: z[rval++] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
969: z[rval++] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
970: z[rval] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
971: v += 36;
972: }
973: xb += 6;
974: }
975: break;
976: case 7:
977: for (i=0; i<mbs; i++) {
978: n = ii[1] - ii[0]; ii++;
979: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
980: x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
981: ib = idx + ai[i];
982: for (j=0; j<n; j++) {
983: rval = ib[j]*7;
984: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
985: z[rval++] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
986: z[rval++] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
987: z[rval++] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
988: z[rval++] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
989: z[rval++] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
990: z[rval] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
991: v += 49;
992: }
993: xb += 7;
994: }
995: break;
996: default: { /* block sizes larger then 7 by 7 are handled by BLAS */
997: int ncols,k;
998: PetscScalar *work,*workt;
1000: if (!a->mult_work) {
1001: k = PetscMax(A->m,A->n);
1002: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1003: }
1004: work = a->mult_work;
1005: for (i=0; i<mbs; i++) {
1006: n = ii[1] - ii[0]; ii++;
1007: ncols = n*bs;
1008: ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));
1009: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1010: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1011: v += n*bs2;
1012: x += bs;
1013: workt = work;
1014: for (j=0; j<n; j++) {
1015: zb = z + bs*(*idx++);
1016: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1017: workt += bs;
1018: }
1019: }
1020: }
1021: }
1022: VecRestoreArray(xx,&xg);
1023: VecRestoreArray(zz,&zg);
1024: PetscLogFlops(2*a->nz*a->bs2 - A->n);
1025: return(0);
1026: }
1028: int MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1030: {
1031: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1032: PetscScalar *xg,*zg,*zb,*x,*z,*xb,x1,x2,x3,x4,x5;
1033: MatScalar *v;
1034: int mbs=a->mbs,i,*idx,*ii,*ai=a->i,rval,bs=a->bs,j,n,bs2=a->bs2,*ib,ierr;
1037: if (yy != zz) { VecCopy(yy,zz); }
1038: VecGetArray(xx,&xg); x = xg;
1039: VecGetArray(zz,&zg); z = zg;
1042: idx = a->j;
1043: v = a->a;
1044: ii = a->i;
1045: xb = x;
1047: switch (bs) {
1048: case 1:
1049: for (i=0; i<mbs; i++) {
1050: n = ii[1] - ii[0]; ii++;
1051: x1 = xb[0];
1052: ib = idx + ai[i];
1053: for (j=0; j<n; j++) {
1054: rval = ib[j];
1055: z[rval] += *v * x1;
1056: v++;
1057: }
1058: xb++;
1059: }
1060: break;
1061: case 2:
1062: for (i=0; i<mbs; i++) {
1063: n = ii[1] - ii[0]; ii++;
1064: x1 = xb[0]; x2 = xb[1];
1065: ib = idx + ai[i];
1066: for (j=0; j<n; j++) {
1067: rval = ib[j]*2;
1068: z[rval++] += v[0]*x1 + v[1]*x2;
1069: z[rval++] += v[2]*x1 + v[3]*x2;
1070: v += 4;
1071: }
1072: xb += 2;
1073: }
1074: break;
1075: case 3:
1076: for (i=0; i<mbs; i++) {
1077: n = ii[1] - ii[0]; ii++;
1078: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1079: ib = idx + ai[i];
1080: for (j=0; j<n; j++) {
1081: rval = ib[j]*3;
1082: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1083: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1084: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1085: v += 9;
1086: }
1087: xb += 3;
1088: }
1089: break;
1090: case 4:
1091: for (i=0; i<mbs; i++) {
1092: n = ii[1] - ii[0]; ii++;
1093: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1094: ib = idx + ai[i];
1095: for (j=0; j<n; j++) {
1096: rval = ib[j]*4;
1097: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1098: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1099: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1100: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1101: v += 16;
1102: }
1103: xb += 4;
1104: }
1105: break;
1106: case 5:
1107: for (i=0; i<mbs; i++) {
1108: n = ii[1] - ii[0]; ii++;
1109: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1110: x4 = xb[3]; x5 = xb[4];
1111: ib = idx + ai[i];
1112: for (j=0; j<n; j++) {
1113: rval = ib[j]*5;
1114: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1115: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1116: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1117: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1118: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1119: v += 25;
1120: }
1121: xb += 5;
1122: }
1123: break;
1124: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1125: int ncols,k;
1126: PetscScalar *work,*workt;
1128: if (!a->mult_work) {
1129: k = PetscMax(A->m,A->n);
1130: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1131: }
1132: work = a->mult_work;
1133: for (i=0; i<mbs; i++) {
1134: n = ii[1] - ii[0]; ii++;
1135: ncols = n*bs;
1136: ierr = PetscMemzero(work,ncols*sizeof(PetscScalar));
1137: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,work);
1138: /* LAgemv_("T",&bs,&ncols,&_DOne,v,&bs,x,&_One,&_DOne,work,&_One); */
1139: v += n*bs2;
1140: x += bs;
1141: workt = work;
1142: for (j=0; j<n; j++) {
1143: zb = z + bs*(*idx++);
1144: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1145: workt += bs;
1146: }
1147: }
1148: }
1149: }
1150: VecRestoreArray(xx,&xg);
1151: VecRestoreArray(zz,&zg);
1152: PetscLogFlops(2*a->nz*a->bs2);
1153: return(0);
1154: }
1156: int MatScale_SeqBAIJ(PetscScalar *alpha,Mat inA)
1157: {
1158: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1159: int totalnz = a->bs2*a->nz;
1160: #if defined(PETSC_USE_MAT_SINGLE)
1161: int i;
1162: #else
1163: int one = 1;
1164: #endif
1167: #if defined(PETSC_USE_MAT_SINGLE)
1168: for (i=0; i<totalnz; i++) a->a[i] *= *alpha;
1169: #else
1170: BLscal_(&totalnz,alpha,a->a,&one);
1171: #endif
1172: PetscLogFlops(totalnz);
1173: return(0);
1174: }
1176: int MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1177: {
1178: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1179: MatScalar *v = a->a;
1180: PetscReal sum = 0.0;
1181: int i,j,k,bs = a->bs,nz=a->nz,bs2=a->bs2,k1;
1184: if (type == NORM_FROBENIUS) {
1185: for (i=0; i< bs2*nz; i++) {
1186: #if defined(PETSC_USE_COMPLEX)
1187: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1188: #else
1189: sum += (*v)*(*v); v++;
1190: #endif
1191: }
1192: *norm = sqrt(sum);
1193: } else if (type == NORM_INFINITY) { /* maximum row sum */
1194: *norm = 0.0;
1195: for (k=0; k<bs; k++) {
1196: for (j=0; j<a->mbs; j++) {
1197: v = a->a + bs2*a->i[j] + k;
1198: sum = 0.0;
1199: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1200: for (k1=0; k1<bs; k1++){
1201: sum += PetscAbsScalar(*v);
1202: v += bs;
1203: }
1204: }
1205: if (sum > *norm) *norm = sum;
1206: }
1207: }
1208: } else {
1209: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1210: }
1211: return(0);
1212: }
1215: int MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1216: {
1217: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1218: int ierr;
1219: PetscTruth flag;
1222: PetscTypeCompare((PetscObject)B,MATSEQBAIJ,&flag);
1223: if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1225: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1226: if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->nz != b->nz)) {
1227: *flg = PETSC_FALSE;
1228: return(0);
1229: }
1230:
1231: /* if the a->i are the same */
1232: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1233: if (*flg == PETSC_FALSE) {
1234: return(0);
1235: }
1236:
1237: /* if a->j are the same */
1238: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(int),flg);
1239: if (*flg == PETSC_FALSE) {
1240: return(0);
1241: }
1242: /* if a->a are the same */
1243: PetscMemcmp(a->a,b->a,(a->nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1244: return(0);
1245:
1246: }
1248: int MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1249: {
1250: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1251: int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1252: PetscScalar *x,zero = 0.0;
1253: MatScalar *aa,*aa_j;
1256: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1257: bs = a->bs;
1258: aa = a->a;
1259: ai = a->i;
1260: aj = a->j;
1261: ambs = a->mbs;
1262: bs2 = a->bs2;
1264: VecSet(&zero,v);
1265: VecGetArray(v,&x);
1266: VecGetLocalSize(v,&n);
1267: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1268: for (i=0; i<ambs; i++) {
1269: for (j=ai[i]; j<ai[i+1]; j++) {
1270: if (aj[j] == i) {
1271: row = i*bs;
1272: aa_j = aa+j*bs2;
1273: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1274: break;
1275: }
1276: }
1277: }
1278: VecRestoreArray(v,&x);
1279: return(0);
1280: }
1282: int MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
1283: {
1284: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1285: PetscScalar *l,*r,x,*li,*ri;
1286: MatScalar *aa,*v;
1287: int ierr,i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
1290: ai = a->i;
1291: aj = a->j;
1292: aa = a->a;
1293: m = A->m;
1294: n = A->n;
1295: bs = a->bs;
1296: mbs = a->mbs;
1297: bs2 = a->bs2;
1298: if (ll) {
1299: VecGetArray(ll,&l);
1300: VecGetLocalSize(ll,&lm);
1301: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1302: for (i=0; i<mbs; i++) { /* for each block row */
1303: M = ai[i+1] - ai[i];
1304: li = l + i*bs;
1305: v = aa + bs2*ai[i];
1306: for (j=0; j<M; j++) { /* for each block */
1307: for (k=0; k<bs2; k++) {
1308: (*v++) *= li[k%bs];
1309: }
1310: }
1311: }
1312: VecRestoreArray(ll,&l);
1313: PetscLogFlops(a->nz);
1314: }
1315:
1316: if (rr) {
1317: VecGetArray(rr,&r);
1318: VecGetLocalSize(rr,&rn);
1319: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1320: for (i=0; i<mbs; i++) { /* for each block row */
1321: M = ai[i+1] - ai[i];
1322: v = aa + bs2*ai[i];
1323: for (j=0; j<M; j++) { /* for each block */
1324: ri = r + bs*aj[ai[i]+j];
1325: for (k=0; k<bs; k++) {
1326: x = ri[k];
1327: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1328: }
1329: }
1330: }
1331: VecRestoreArray(rr,&r);
1332: PetscLogFlops(a->nz);
1333: }
1334: return(0);
1335: }
1338: int MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1339: {
1340: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1343: info->rows_global = (double)A->m;
1344: info->columns_global = (double)A->n;
1345: info->rows_local = (double)A->m;
1346: info->columns_local = (double)A->n;
1347: info->block_size = a->bs2;
1348: info->nz_allocated = a->maxnz;
1349: info->nz_used = a->bs2*a->nz;
1350: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1351: info->assemblies = A->num_ass;
1352: info->mallocs = a->reallocs;
1353: info->memory = A->mem;
1354: if (A->factor) {
1355: info->fill_ratio_given = A->info.fill_ratio_given;
1356: info->fill_ratio_needed = A->info.fill_ratio_needed;
1357: info->factor_mallocs = A->info.factor_mallocs;
1358: } else {
1359: info->fill_ratio_given = 0;
1360: info->fill_ratio_needed = 0;
1361: info->factor_mallocs = 0;
1362: }
1363: return(0);
1364: }
1367: int MatZeroEntries_SeqBAIJ(Mat A)
1368: {
1369: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1370: int ierr;
1373: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1374: return(0);
1375: }