Actual source code: sbaij2.c
1: /*$Id: sbaij2.c,v 1.29 2001/03/23 23:22:21 balay Exp $*/
3: #include petscsys.h
4: #include src/mat/impls/baij/seq/baij.h
5: #include src/vec/vecimpl.h
6: #include src/inline/spops.h
7: #include src/inline/ilu.h
8: #include petscbt.h
9: #include sbaij.h
11: int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS *is,int ov)
12: {
14: SETERRQ(1,"Function not yet written for SBAIJ format");
15: /* return(0); */
16: }
18: int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
19: {
20: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
21: int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
22: int row,mat_i,*mat_j,tcol,*mat_ilen;
23: int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
24: int *aj = a->j,*ai = a->i;
25: MatScalar *mat_a;
26: Mat C;
27: PetscTruth flag;
30:
31: if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
32: ISSorted(iscol,(PetscTruth*)&i);
33: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
35: ISGetIndices(isrow,&irow);
36: ISGetSize(isrow,&nrows);
37:
38: ierr = PetscMalloc((1+oldcols)*sizeof(int),&smap);
39: ssmap = smap;
40: ierr = PetscMalloc((1+nrows)*sizeof(int),&lens);
41: ierr = PetscMemzero(smap,oldcols*sizeof(int));
42: for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
43: /* determine lens of each row */
44: for (i=0; i<nrows; i++) {
45: kstart = ai[irow[i]];
46: kend = kstart + a->ilen[irow[i]];
47: lens[i] = 0;
48: for (k=kstart; k<kend; k++) {
49: if (ssmap[aj[k]]) {
50: lens[i]++;
51: }
52: }
53: }
54: /* Create and fill new matrix */
55: if (scall == MAT_REUSE_MATRIX) {
56: c = (Mat_SeqSBAIJ *)((*B)->data);
58: if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
59: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
60: if (flag == PETSC_FALSE) {
61: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
62: }
63: PetscMemzero(c->ilen,c->mbs*sizeof(int));
64: C = *B;
65: } else {
66: MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);
67: }
68: c = (Mat_SeqSBAIJ *)(C->data);
69: for (i=0; i<nrows; i++) {
70: row = irow[i];
71: kstart = ai[row];
72: kend = kstart + a->ilen[row];
73: mat_i = c->i[i];
74: mat_j = c->j + mat_i;
75: mat_a = c->a + mat_i*bs2;
76: mat_ilen = c->ilen + i;
77: for (k=kstart; k<kend; k++) {
78: if ((tcol=ssmap[a->j[k]])) {
79: *mat_j++ = tcol - 1;
80: ierr = PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
81: mat_a += bs2;
82: (*mat_ilen)++;
83: }
84: }
85: }
86:
87: /* Free work space */
88: PetscFree(smap);
89: PetscFree(lens);
90: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
91: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
92:
93: ISRestoreIndices(isrow,&irow);
94: *B = C;
95: return(0);
96: }
98: int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
99: {
100: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
101: IS is1;
102: int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
105: if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
106:
107: ISGetIndices(isrow,&irow);
108: ISGetSize(isrow,&nrows);
109:
110: /* Verify if the indices corespond to each element in a block
111: and form the IS with compressed IS */
112: PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
113: iary = vary + a->mbs;
114: PetscMemzero(vary,(a->mbs)*sizeof(int));
115: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
116:
117: count = 0;
118: for (i=0; i<a->mbs; i++) {
119: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
120: if (vary[i]==bs) iary[count++] = i;
121: }
122: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
123:
124: ISRestoreIndices(isrow,&irow);
125: PetscFree(vary);
127: MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);
128: ISDestroy(is1);
129: return(0);
130: }
132: int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,IS *irow,IS *icol,MatReuse scall,Mat **B)
133: {
134: int ierr,i;
137: if (scall == MAT_INITIAL_MATRIX) {
138: PetscMalloc((n+1)*sizeof(Mat),B);
139: }
141: for (i=0; i<n; i++) {
142: MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
143: }
144: return(0);
145: }
147: /* -------------------------------------------------------*/
148: /* Should check that shapes of vectors and matrices match */
149: /* -------------------------------------------------------*/
150: #include petscblaslapack.h
152: int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
153: {
154: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
155: Scalar *x,*z,*xb,x1,zero=0.0;
156: MatScalar *v;
157: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
160: VecSet(&zero,zz);
161: VecGetArray(xx,&x);
162: VecGetArray(zz,&z);
164: v = a->a;
165: xb = x;
166:
167: for (i=0; i<mbs; i++) {
168: n = ai[1] - ai[0]; /* length of i_th row of A */
169: x1 = xb[0];
170: ib = aj + *ai;
171: jmin = 0;
172: if (*ib == i) { /* (diag of A)*x */
173: z[i] += *v++ * x[*ib++];
174: jmin++;
175: }
176: for (j=jmin; j<n; j++) {
177: cval = *ib;
178: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
179: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
180: }
181: xb++; ai++;
182: }
184: VecRestoreArray(xx,&x);
185: VecRestoreArray(zz,&z);
186: PetscLogFlops(2*(a->s_nz*2 - A->m) - A->m); /* s_nz = (nz+m)/2 */
187: return(0);
188: }
190: int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
191: {
192: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
193: Scalar *x,*z,*xb,x1,x2,zero=0.0;
194: MatScalar *v;
195: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
199: VecSet(&zero,zz);
200: VecGetArray(xx,&x);
201: VecGetArray(zz,&z);
202:
203: v = a->a;
204: xb = x;
206: for (i=0; i<mbs; i++) {
207: n = ai[1] - ai[0]; /* length of i_th block row of A */
208: x1 = xb[0]; x2 = xb[1];
209: ib = aj + *ai;
210: jmin = 0;
211: if (*ib == i){ /* (diag of A)*x */
212: z[2*i] += v[0]*x1 + v[2]*x2;
213: z[2*i+1] += v[2]*x1 + v[3]*x2;
214: v += 4; jmin++;
215: }
216: for (j=jmin; j<n; j++) {
217: /* (strict lower triangular part of A)*x */
218: cval = ib[j]*2;
219: z[cval] += v[0]*x1 + v[1]*x2;
220: z[cval+1] += v[2]*x1 + v[3]*x2;
221: /* (strict upper triangular part of A)*x */
222: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
223: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
224: v += 4;
225: }
226: xb +=2; ai++;
227: }
229: VecRestoreArray(xx,&x);
230: VecRestoreArray(zz,&z);
231: PetscLogFlops(8*(a->s_nz*2 - A->m) - A->m);
232: return(0);
233: }
235: int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
236: {
237: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
238: Scalar *x,*z,*xb,x1,x2,x3,zero=0.0;
239: MatScalar *v;
240: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
244: VecSet(&zero,zz);
245: VecGetArray(xx,&x);
246: VecGetArray(zz,&z);
247:
248: v = a->a;
249: xb = x;
251: for (i=0; i<mbs; i++) {
252: n = ai[1] - ai[0]; /* length of i_th block row of A */
253: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
254: ib = aj + *ai;
255: jmin = 0;
256: if (*ib == i){ /* (diag of A)*x */
257: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
258: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
259: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
260: v += 9; jmin++;
261: }
262: for (j=jmin; j<n; j++) {
263: /* (strict lower triangular part of A)*x */
264: cval = ib[j]*3;
265: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
266: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
267: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
268: /* (strict upper triangular part of A)*x */
269: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
270: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
271: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
272: v += 9;
273: }
274: xb +=3; ai++;
275: }
277: VecRestoreArray(xx,&x);
278: VecRestoreArray(zz,&z);
279: PetscLogFlops(18*(a->s_nz*2 - A->m) - A->m);
280: return(0);
281: }
283: int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
284: {
285: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
286: Scalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
287: MatScalar *v;
288: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
291: VecSet(&zero,zz);
292: VecGetArray(xx,&x);
293: VecGetArray(zz,&z);
294:
295: v = a->a;
296: xb = x;
298: for (i=0; i<mbs; i++) {
299: n = ai[1] - ai[0]; /* length of i_th block row of A */
300: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
301: ib = aj + *ai;
302: jmin = 0;
303: if (*ib == i){ /* (diag of A)*x */
304: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
305: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
306: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
307: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
308: v += 16; jmin++;
309: }
310: for (j=jmin; j<n; j++) {
311: /* (strict lower triangular part of A)*x */
312: cval = ib[j]*4;
313: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
314: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
315: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
316: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
317: /* (strict upper triangular part of A)*x */
318: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
319: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
320: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
321: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
322: v += 16;
323: }
324: xb +=4; ai++;
325: }
327: VecRestoreArray(xx,&x);
328: VecRestoreArray(zz,&z);
329: PetscLogFlops(32*(a->s_nz*2 - A->m) - A->m);
330: return(0);
331: }
333: int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
334: {
335: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
336: Scalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
337: MatScalar *v;
338: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
341: VecSet(&zero,zz);
342: VecGetArray(xx,&x);
343: VecGetArray(zz,&z);
344:
345: v = a->a;
346: xb = x;
348: for (i=0; i<mbs; i++) {
349: n = ai[1] - ai[0]; /* length of i_th block row of A */
350: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
351: ib = aj + *ai;
352: jmin = 0;
353: if (*ib == i){ /* (diag of A)*x */
354: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
355: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
356: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
357: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
358: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
359: v += 25; jmin++;
360: }
361: for (j=jmin; j<n; j++) {
362: /* (strict lower triangular part of A)*x */
363: cval = ib[j]*5;
364: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
365: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
366: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
367: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
368: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
369: /* (strict upper triangular part of A)*x */
370: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
371: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
372: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
373: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
374: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
375: v += 25;
376: }
377: xb +=5; ai++;
378: }
380: VecRestoreArray(xx,&x);
381: VecRestoreArray(zz,&z);
382: PetscLogFlops(50*(a->s_nz*2 - A->m) - A->m);
383: return(0);
384: }
387: int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
388: {
389: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
390: Scalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
391: MatScalar *v;
392: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
395: VecSet(&zero,zz);
396: VecGetArray(xx,&x);
397: VecGetArray(zz,&z);
398:
399: v = a->a;
400: xb = x;
402: for (i=0; i<mbs; i++) {
403: n = ai[1] - ai[0]; /* length of i_th block row of A */
404: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
405: ib = aj + *ai;
406: jmin = 0;
407: if (*ib == i){ /* (diag of A)*x */
408: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
409: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
410: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
411: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
412: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
413: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
414: v += 36; jmin++;
415: }
416: for (j=jmin; j<n; j++) {
417: /* (strict lower triangular part of A)*x */
418: cval = ib[j]*6;
419: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
420: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
421: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
422: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
423: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
424: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
425: /* (strict upper triangular part of A)*x */
426: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
427: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
428: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
429: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
430: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
431: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
432: v += 36;
433: }
434: xb +=6; ai++;
435: }
437: VecRestoreArray(xx,&x);
438: VecRestoreArray(zz,&z);
439: PetscLogFlops(72*(a->s_nz*2 - A->m) - A->m);
440: return(0);
441: }
442: int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
443: {
444: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
445: Scalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
446: MatScalar *v;
447: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
450: VecSet(&zero,zz);
451: VecGetArray(xx,&x);
452: VecGetArray(zz,&z);
453:
454: v = a->a;
455: xb = x;
457: for (i=0; i<mbs; i++) {
458: n = ai[1] - ai[0]; /* length of i_th block row of A */
459: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
460: ib = aj + *ai;
461: jmin = 0;
462: if (*ib == i){ /* (diag of A)*x */
463: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
464: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
465: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
466: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
467: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
468: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
469: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
470: v += 49; jmin++;
471: }
472: for (j=jmin; j<n; j++) {
473: /* (strict lower triangular part of A)*x */
474: cval = ib[j]*7;
475: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
476: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
477: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
478: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
479: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
480: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
481: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
482: /* (strict upper triangular part of A)*x */
483: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
484: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
485: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
486: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
487: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
488: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
489: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
490: v += 49;
491: }
492: xb +=7; ai++;
493: }
494: VecRestoreArray(xx,&x);
495: VecRestoreArray(zz,&z);
496: PetscLogFlops(98*(a->s_nz*2 - A->m) - A->m);
497: return(0);
498: }
500: /*
501: This will not work with MatScalar == float because it calls the BLAS
502: */
503: int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
504: {
505: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
506: Scalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
507: MatScalar *v;
508: int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
509: int ncols,k;
512: VecSet(&zero,zz);
513: VecGetArray(xx,&x); x_ptr=x;
514: VecGetArray(zz,&z); z_ptr=z;
516: aj = a->j;
517: v = a->a;
518: ii = a->i;
520: if (!a->mult_work) {
521: PetscMalloc((A->m+1)*sizeof(Scalar),&a->mult_work);
522: }
523: work = a->mult_work;
524:
525: for (i=0; i<mbs; i++) {
526: n = ii[1] - ii[0]; ncols = n*bs;
527: workt = work; idx=aj+ii[0];
529: /* upper triangular part */
530: for (j=0; j<n; j++) {
531: xb = x_ptr + bs*(*idx++);
532: for (k=0; k<bs; k++) workt[k] = xb[k];
533: workt += bs;
534: }
535: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
536: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
537:
538: /* strict lower triangular part */
539: idx = aj+ii[0];
540: if (*idx == i){
541: ncols -= bs; v += bs2; idx++; n--;
542: }
543:
544: if (ncols > 0){
545: workt = work;
546: ierr = PetscMemzero(workt,ncols*sizeof(Scalar));
547: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
548: for (j=0; j<n; j++) {
549: zb = z_ptr + bs*(*idx++);
550: for (k=0; k<bs; k++) zb[k] += workt[k] ;
551: workt += bs;
552: }
553: }
554: x += bs; v += n*bs2; z += bs; ii++;
555: }
556:
557: VecRestoreArray(xx,&x);
558: VecRestoreArray(zz,&z);
559: PetscLogFlops(2*(a->s_nz*2 - A->m)*bs2 - A->m);
560: return(0);
561: }
563: int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
564: {
565: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
566: Scalar *x,*y,*z,*xb,x1;
567: MatScalar *v;
568: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
571: VecGetArray(xx,&x);
572: if (yy != xx) {
573: VecGetArray(yy,&y);
574: } else {
575: y = x;
576: }
577: if (zz != yy) {
578: VecCopy(yy,zz);
579: VecGetArray(zz,&z);
580: } else {
581: z = y;
582: }
584: v = a->a;
585: xb = x;
587: for (i=0; i<mbs; i++) {
588: n = ai[1] - ai[0]; /* length of i_th row of A */
589: x1 = xb[0];
590: ib = aj + *ai;
591: jmin = 0;
592: if (*ib == i) { /* (diag of A)*x */
593: z[i] += *v++ * x[*ib++]; jmin++;
594: }
595: for (j=jmin; j<n; j++) {
596: cval = *ib;
597: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
598: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
599: }
600: xb++; ai++;
601: }
603: VecRestoreArray(xx,&x);
604: if (yy != xx) VecRestoreArray(yy,&y);
605: if (zz != yy) VecRestoreArray(zz,&z);
606:
607: PetscLogFlops(2*(a->s_nz*2 - A->m));
608: return(0);
609: }
611: int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
612: {
613: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
614: Scalar *x,*y,*z,*xb,x1,x2;
615: MatScalar *v;
616: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
619: VecGetArray(xx,&x);
620: if (yy != xx) {
621: VecGetArray(yy,&y);
622: } else {
623: y = x;
624: }
625: if (zz != yy) {
626: VecCopy(yy,zz);
627: VecGetArray(zz,&z);
628: } else {
629: z = y;
630: }
632: v = a->a;
633: xb = x;
635: for (i=0; i<mbs; i++) {
636: n = ai[1] - ai[0]; /* length of i_th block row of A */
637: x1 = xb[0]; x2 = xb[1];
638: ib = aj + *ai;
639: jmin = 0;
640: if (*ib == i){ /* (diag of A)*x */
641: z[2*i] += v[0]*x1 + v[2]*x2;
642: z[2*i+1] += v[2]*x1 + v[3]*x2;
643: v += 4; jmin++;
644: }
645: for (j=jmin; j<n; j++) {
646: /* (strict lower triangular part of A)*x */
647: cval = ib[j]*2;
648: z[cval] += v[0]*x1 + v[1]*x2;
649: z[cval+1] += v[2]*x1 + v[3]*x2;
650: /* (strict upper triangular part of A)*x */
651: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
652: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
653: v += 4;
654: }
655: xb +=2; ai++;
656: }
658: VecRestoreArray(xx,&x);
659: if (yy != xx) VecRestoreArray(yy,&y);
660: if (zz != yy) VecRestoreArray(zz,&z);
662: PetscLogFlops(4*(a->s_nz*2 - A->m));
663: return(0);
664: }
666: int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
667: {
668: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
669: Scalar *x,*y,*z,*xb,x1,x2,x3;
670: MatScalar *v;
671: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
674: VecGetArray(xx,&x);
675: if (yy != xx) {
676: VecGetArray(yy,&y);
677: } else {
678: y = x;
679: }
680: if (zz != yy) {
681: VecCopy(yy,zz);
682: VecGetArray(zz,&z);
683: } else {
684: z = y;
685: }
687: v = a->a;
688: xb = x;
690: for (i=0; i<mbs; i++) {
691: n = ai[1] - ai[0]; /* length of i_th block row of A */
692: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
693: ib = aj + *ai;
694: jmin = 0;
695: if (*ib == i){ /* (diag of A)*x */
696: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
697: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
698: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
699: v += 9; jmin++;
700: }
701: for (j=jmin; j<n; j++) {
702: /* (strict lower triangular part of A)*x */
703: cval = ib[j]*3;
704: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
705: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
706: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
707: /* (strict upper triangular part of A)*x */
708: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
709: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
710: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
711: v += 9;
712: }
713: xb +=3; ai++;
714: }
716: VecRestoreArray(xx,&x);
717: if (yy != xx) VecRestoreArray(yy,&y);
718: if (zz != yy) VecRestoreArray(zz,&z);
720: PetscLogFlops(18*(a->s_nz*2 - A->m));
721: return(0);
722: }
724: int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
725: {
726: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
727: Scalar *x,*y,*z,*xb,x1,x2,x3,x4;
728: MatScalar *v;
729: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
732: VecGetArray(xx,&x);
733: if (yy != xx) {
734: VecGetArray(yy,&y);
735: } else {
736: y = x;
737: }
738: if (zz != yy) {
739: VecCopy(yy,zz);
740: VecGetArray(zz,&z);
741: } else {
742: z = y;
743: }
745: v = a->a;
746: xb = x;
748: for (i=0; i<mbs; i++) {
749: n = ai[1] - ai[0]; /* length of i_th block row of A */
750: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
751: ib = aj + *ai;
752: jmin = 0;
753: if (*ib == i){ /* (diag of A)*x */
754: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
755: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
756: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
757: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
758: v += 16; jmin++;
759: }
760: for (j=jmin; j<n; j++) {
761: /* (strict lower triangular part of A)*x */
762: cval = ib[j]*4;
763: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
764: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
765: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
766: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
767: /* (strict upper triangular part of A)*x */
768: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
769: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
770: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
771: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
772: v += 16;
773: }
774: xb +=4; ai++;
775: }
777: VecRestoreArray(xx,&x);
778: if (yy != xx) VecRestoreArray(yy,&y);
779: if (zz != yy) VecRestoreArray(zz,&z);
781: PetscLogFlops(32*(a->s_nz*2 - A->m));
782: return(0);
783: }
785: int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
786: {
787: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
788: Scalar *x,*y,*z,*xb,x1,x2,x3,x4,x5;
789: MatScalar *v;
790: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
793: VecGetArray(xx,&x);
794: if (yy != xx) {
795: VecGetArray(yy,&y);
796: } else {
797: y = x;
798: }
799: if (zz != yy) {
800: VecCopy(yy,zz);
801: VecGetArray(zz,&z);
802: } else {
803: z = y;
804: }
806: v = a->a;
807: xb = x;
809: for (i=0; i<mbs; i++) {
810: n = ai[1] - ai[0]; /* length of i_th block row of A */
811: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
812: ib = aj + *ai;
813: jmin = 0;
814: if (*ib == i){ /* (diag of A)*x */
815: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
816: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
817: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
818: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
819: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
820: v += 25; jmin++;
821: }
822: for (j=jmin; j<n; j++) {
823: /* (strict lower triangular part of A)*x */
824: cval = ib[j]*5;
825: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
826: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
827: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
828: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
829: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
830: /* (strict upper triangular part of A)*x */
831: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
832: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
833: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
834: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
835: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
836: v += 25;
837: }
838: xb +=5; ai++;
839: }
841: VecRestoreArray(xx,&x);
842: if (yy != xx) VecRestoreArray(yy,&y);
843: if (zz != yy) VecRestoreArray(zz,&z);
845: PetscLogFlops(50*(a->s_nz*2 - A->m));
846: return(0);
847: }
848: int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
849: {
850: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
851: Scalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
852: MatScalar *v;
853: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
856: VecGetArray(xx,&x);
857: if (yy != xx) {
858: VecGetArray(yy,&y);
859: } else {
860: y = x;
861: }
862: if (zz != yy) {
863: VecCopy(yy,zz);
864: VecGetArray(zz,&z);
865: } else {
866: z = y;
867: }
869: v = a->a;
870: xb = x;
872: for (i=0; i<mbs; i++) {
873: n = ai[1] - ai[0]; /* length of i_th block row of A */
874: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
875: ib = aj + *ai;
876: jmin = 0;
877: if (*ib == i){ /* (diag of A)*x */
878: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
879: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
880: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
881: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
882: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
883: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
884: v += 36; jmin++;
885: }
886: for (j=jmin; j<n; j++) {
887: /* (strict lower triangular part of A)*x */
888: cval = ib[j]*6;
889: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
890: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
891: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
892: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
893: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
894: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
895: /* (strict upper triangular part of A)*x */
896: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
897: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
898: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
899: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
900: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
901: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
902: v += 36;
903: }
904: xb +=6; ai++;
905: }
907: VecRestoreArray(xx,&x);
908: if (yy != xx) VecRestoreArray(yy,&y);
909: if (zz != yy) VecRestoreArray(zz,&z);
911: PetscLogFlops(72*(a->s_nz*2 - A->m));
912: return(0);
913: }
915: int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
916: {
917: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
918: Scalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
919: MatScalar *v;
920: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
923: VecGetArray(xx,&x);
924: if (yy != xx) {
925: VecGetArray(yy,&y);
926: } else {
927: y = x;
928: }
929: if (zz != yy) {
930: VecCopy(yy,zz);
931: VecGetArray(zz,&z);
932: } else {
933: z = y;
934: }
936: v = a->a;
937: xb = x;
939: for (i=0; i<mbs; i++) {
940: n = ai[1] - ai[0]; /* length of i_th block row of A */
941: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
942: ib = aj + *ai;
943: jmin = 0;
944: if (*ib == i){ /* (diag of A)*x */
945: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
946: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
947: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
948: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
949: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
950: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
951: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
952: v += 49; jmin++;
953: }
954: for (j=jmin; j<n; j++) {
955: /* (strict lower triangular part of A)*x */
956: cval = ib[j]*7;
957: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
958: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
959: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
960: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
961: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
962: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
963: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
964: /* (strict upper triangular part of A)*x */
965: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
966: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
967: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
968: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
969: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
970: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
971: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
972: v += 49;
973: }
974: xb +=7; ai++;
975: }
977: VecRestoreArray(xx,&x);
978: if (yy != xx) VecRestoreArray(yy,&y);
979: if (zz != yy) VecRestoreArray(zz,&z);
981: PetscLogFlops(98*(a->s_nz*2 - A->m));
982: return(0);
983: }
985: int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
986: {
987: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
988: Scalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
989: MatScalar *v;
990: int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
991: int ncols,k;
994: VecGetArray(xx,&x); x_ptr=x;
995: if (yy != xx) {
996: VecGetArray(yy,&y);
997: } else {
998: y = x;
999: }
1000: if (zz != yy) {
1001: VecCopy(yy,zz);
1002: VecGetArray(zz,&z); z_ptr=z;
1003: } else {
1004: z = y;
1005: }
1007: aj = a->j;
1008: v = a->a;
1009: ii = a->i;
1011: if (!a->mult_work) {
1012: PetscMalloc((A->m+1)*sizeof(Scalar),&a->mult_work);
1013: }
1014: work = a->mult_work;
1015:
1016:
1017: for (i=0; i<mbs; i++) {
1018: n = ii[1] - ii[0]; ncols = n*bs;
1019: workt = work; idx=aj+ii[0];
1021: /* upper triangular part */
1022: for (j=0; j<n; j++) {
1023: xb = x_ptr + bs*(*idx++);
1024: for (k=0; k<bs; k++) workt[k] = xb[k];
1025: workt += bs;
1026: }
1027: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1028: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1030: /* strict lower triangular part */
1031: idx = aj+ii[0];
1032: if (*idx == i){
1033: ncols -= bs; v += bs2; idx++; n--;
1034: }
1035: if (ncols > 0){
1036: workt = work;
1037: ierr = PetscMemzero(workt,ncols*sizeof(Scalar));
1038: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1039: for (j=0; j<n; j++) {
1040: zb = z_ptr + bs*(*idx++);
1041: /* idx++; */
1042: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1043: workt += bs;
1044: }
1045: }
1047: x += bs; v += n*bs2; z += bs; ii++;
1048: }
1050: VecRestoreArray(xx,&x);
1051: if (yy != xx) VecRestoreArray(yy,&y);
1052: if (zz != yy) VecRestoreArray(zz,&z);
1054: PetscLogFlops(2*(a->s_nz*2 - A->m));
1055: return(0);
1056: }
1058: int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1059: {
1061: SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1062: /* return(0); */
1063: }
1065: int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1067: {
1069: SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1070: /* return(0); */
1071: }
1073: int MatScale_SeqSBAIJ(Scalar *alpha,Mat inA)
1074: {
1075: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1076: int one = 1,totalnz = a->bs2*a->s_nz;
1079: BLscal_(&totalnz,alpha,a->a,&one);
1080: PetscLogFlops(totalnz);
1081: return(0);
1082: }
1084: int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1085: {
1086: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1087: MatScalar *v = a->a;
1088: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1089: int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1090: int *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1091:
1093: if (type == NORM_FROBENIUS) {
1094: for (k=0; k<mbs; k++){
1095: jmin = a->i[k]; jmax = a->i[k+1];
1096: col = aj + jmin;
1097: if (*col == k){ /* diagonal block */
1098: for (i=0; i<bs2; i++){
1099: #if defined(PETSC_USE_COMPLEX)
1100: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1101: #else
1102: sum_diag += (*v)*(*v); v++;
1103: #endif
1104: }
1105: jmin++;
1106: }
1107: for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */
1108: for (i=0; i<bs2; i++){
1109: #if defined(PETSC_USE_COMPLEX)
1110: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1111: #else
1112: sum_off += (*v)*(*v); v++;
1113: #endif
1114: }
1115: }
1116: }
1117: *norm = sqrt(sum_diag + 2*sum_off);
1119: } else if (type == NORM_INFINITY) { /* maximum row sum */
1120: PetscMalloc(mbs*sizeof(int),&il);
1121: PetscMalloc(mbs*sizeof(int),&jl);
1122: PetscMalloc(bs*sizeof(PetscReal),&sum);
1123: for (i=0; i<mbs; i++) {
1124: jl[i] = mbs; il[0] = 0;
1125: }
1127: *norm = 0.0;
1128: for (k=0; k<mbs; k++) { /* k_th block row */
1129: for (j=0; j<bs; j++) sum[j]=0.0;
1131: /*-- col sum --*/
1132: i = jl[k]; /* first |A(i,k)| to be added */
1133: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1134: at step k */
1135: while (i<mbs){
1136: nexti = jl[i]; /* next block row to be added */
1137: ik = il[i]; /* block index of A(i,k) in the array a */
1138: for (j=0; j<bs; j++){
1139: v = a->a + ik*bs2 + j*bs;
1140: for (k1=0; k1<bs; k1++) {
1141: sum[j] += PetscAbsScalar(*v); v++;
1142: }
1143: }
1144: /* update il, jl */
1145: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1146: jmax = a->i[i+1];
1147: if (jmin < jmax){
1148: il[i] = jmin;
1149: j = a->j[jmin];
1150: jl[i] = jl[j]; jl[j]=i;
1151: }
1152: i = nexti;
1153: }
1154:
1155: /*-- row sum --*/
1156: jmin = a->i[k]; jmax = a->i[k+1];
1157: for (i=jmin; i<jmax; i++) {
1158: for (j=0; j<bs; j++){
1159: v = a->a + i*bs2 + j;
1160: for (k1=0; k1<bs; k1++){
1161: sum[j] += PetscAbsScalar(*v);
1162: v += bs;
1163: }
1164: }
1165: }
1166: /* add k_th block row to il, jl */
1167: col = aj+jmin;
1168: if (*col == k) jmin++;
1169: if (jmin < jmax){
1170: il[k] = jmin;
1171: j = a->j[jmin];
1172: jl[k] = jl[j]; jl[j] = k;
1173: }
1174: for (j=0; j<bs; j++){
1175: if (sum[j] > *norm) *norm = sum[j];
1176: }
1177: }
1178: PetscFree(il);
1179: PetscFree(jl);
1180: PetscFree(sum);
1181: } else {
1182: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1183: }
1184: return(0);
1185: }
1187: int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1188: {
1189: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1190: int ierr;
1191: PetscTruth flag;
1194: PetscTypeCompare((PetscObject)A,MATSEQBAIJ,&flag);
1195: if (!flag) SETERRQ(PETSC_ERR_ARG_INCOMP,"Matrices must be same type");
1197: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1198: if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1199: *flg = PETSC_FALSE;
1200: return(0);
1201: }
1202:
1203: /* if the a->i are the same */
1204: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1205: if (*flg == PETSC_FALSE) {
1206: return(0);
1207: }
1208:
1209: /* if a->j are the same */
1210: PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);
1211: if (*flg == PETSC_FALSE) {
1212: return(0);
1213: }
1214: /* if a->a are the same */
1215: PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(Scalar),flg);
1216: return(0);
1217:
1218: }
1220: int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1221: {
1222: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1223: int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1224: Scalar *x,zero = 0.0;
1225: MatScalar *aa,*aa_j;
1228: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1229: bs = a->bs;
1230: aa = a->a;
1231: ai = a->i;
1232: aj = a->j;
1233: ambs = a->mbs;
1234: bs2 = a->bs2;
1236: VecSet(&zero,v);
1237: VecGetArray(v,&x);
1238: VecGetLocalSize(v,&n);
1239: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1240: for (i=0; i<ambs; i++) {
1241: j=ai[i];
1242: if (aj[j] == i) { /* if this is a diagonal element */
1243: row = i*bs;
1244: aa_j = aa + j*bs2;
1245: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1246: }
1247: }
1248: VecRestoreArray(v,&x);
1249: return(0);
1250: }
1252: int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1253: {
1254: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1255: Scalar *l,*r,x,*li,*ri;
1256: MatScalar *aa,*v;
1257: int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1260: ai = a->i;
1261: aj = a->j;
1262: aa = a->a;
1263: m = A->m;
1264: bs = a->bs;
1265: mbs = a->mbs;
1266: bs2 = a->bs2;
1268: if (ll != rr) {
1269: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be samen");
1270: }
1271: if (ll) {
1272: VecGetArray(ll,&l);
1273: VecGetLocalSize(ll,&lm);
1274: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1275: for (i=0; i<mbs; i++) { /* for each block row */
1276: M = ai[i+1] - ai[i];
1277: li = l + i*bs;
1278: v = aa + bs2*ai[i];
1279: for (j=0; j<M; j++) { /* for each block */
1280: for (k=0; k<bs2; k++) {
1281: (*v++) *= li[k%bs];
1282: }
1283: #ifdef CONT
1284: /* will be used to replace the above loop */
1285: ri = l + bs*aj[ai[i]+j];
1286: for (k=0; k<bs; k++) { /* column value */
1287: x = ri[k];
1288: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1289: }
1290: #endif
1292: }
1293: }
1294: VecRestoreArray(ll,&l);
1295: PetscLogFlops(2*a->s_nz);
1296: }
1297: /* will be deleted */
1298: if (rr) {
1299: VecGetArray(rr,&r);
1300: VecGetLocalSize(rr,&rn);
1301: if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1302: for (i=0; i<mbs; i++) { /* for each block row */
1303: M = ai[i+1] - ai[i];
1304: v = aa + bs2*ai[i];
1305: for (j=0; j<M; j++) { /* for each block */
1306: ri = r + bs*aj[ai[i]+j];
1307: for (k=0; k<bs; k++) {
1308: x = ri[k];
1309: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1310: }
1311: }
1312: }
1313: VecRestoreArray(rr,&r);
1314: PetscLogFlops(a->s_nz);
1315: }
1316: return(0);
1317: }
1319: int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1320: {
1321: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1324: info->rows_global = (double)A->m;
1325: info->columns_global = (double)A->m;
1326: info->rows_local = (double)A->m;
1327: info->columns_local = (double)A->m;
1328: info->block_size = a->bs2;
1329: info->nz_allocated = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1330: info->nz_used = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1331: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1332: info->assemblies = A->num_ass;
1333: info->mallocs = a->reallocs;
1334: info->memory = A->mem;
1335: if (A->factor) {
1336: info->fill_ratio_given = A->info.fill_ratio_given;
1337: info->fill_ratio_needed = A->info.fill_ratio_needed;
1338: info->factor_mallocs = A->info.factor_mallocs;
1339: } else {
1340: info->fill_ratio_given = 0;
1341: info->fill_ratio_needed = 0;
1342: info->factor_mallocs = 0;
1343: }
1344: return(0);
1345: }
1348: int MatZeroEntries_SeqSBAIJ(Mat A)
1349: {
1350: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1351: int ierr;
1354: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1355: return(0);
1356: }
1358: int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1359: {
1360: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1361: int ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1362: PetscReal atmp;
1363: MatScalar *aa;
1364: Scalar zero = 0.0,*x;
1365: int ncols,brow,bcol,krow,kcol;
1368: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1369: bs = a->bs;
1370: aa = a->a;
1371: ai = a->i;
1372: aj = a->j;
1373: mbs = a->mbs;
1375: VecSet(&zero,v);
1376: VecGetArray(v,&x);
1377: VecGetLocalSize(v,&n);
1378: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1379: for (i=0; i<mbs; i++) {
1380: ncols = ai[1] - ai[0]; ai++;
1381: brow = bs*i;
1382: for (j=0; j<ncols; j++){
1383: bcol = bs*(*aj);
1384: for (kcol=0; kcol<bs; kcol++){
1385: col = bcol + kcol; /* col index */
1386: for (krow=0; krow<bs; krow++){
1387: atmp = PetscAbsScalar(*aa); aa++;
1388: row = brow + krow; /* row index */
1389: /* printf("val[%d,%d]: %gn",row,col,atmp); */
1390: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1391: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1392: }
1393: }
1394: aj++;
1395: }
1396: }
1397: VecRestoreArray(v,&x);
1398: return(0);
1399: }