Actual source code: sbaij2.c
1: /*$Id: sbaij2.c,v 1.32 2001/08/07 03:03:01 balay Exp $*/
3: #include src/mat/impls/baij/seq/baij.h
4: #include src/vec/vecimpl.h
5: #include src/inline/spops.h
6: #include src/inline/ilu.h
7: #include petscbt.h
8: #include src/mat/impls/sbaij/seq/sbaij.h
12: int MatIncreaseOverlap_SeqSBAIJ(Mat A,int is_max,IS is[],int ov)
13: {
15: SETERRQ(1,"Function not yet written for SBAIJ format");
16: /* return(0); */
17: }
21: int MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
22: {
23: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
24: int *smap,i,k,kstart,kend,ierr,oldcols = a->mbs,*lens;
25: int row,mat_i,*mat_j,tcol,*mat_ilen;
26: int *irow,nrows,*ssmap,bs=a->bs,bs2=a->bs2;
27: int *aj = a->j,*ai = a->i;
28: MatScalar *mat_a;
29: Mat C;
30: PetscTruth flag;
33:
34: if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
35: ISSorted(iscol,(PetscTruth*)&i);
36: if (!i) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
38: ISGetIndices(isrow,&irow);
39: ISGetSize(isrow,&nrows);
40:
41: PetscMalloc((1+oldcols)*sizeof(int),&smap);
42: ssmap = smap;
43: PetscMalloc((1+nrows)*sizeof(int),&lens);
44: PetscMemzero(smap,oldcols*sizeof(int));
45: for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
46: /* determine lens of each row */
47: for (i=0; i<nrows; i++) {
48: kstart = ai[irow[i]];
49: kend = kstart + a->ilen[irow[i]];
50: lens[i] = 0;
51: for (k=kstart; k<kend; k++) {
52: if (ssmap[aj[k]]) {
53: lens[i]++;
54: }
55: }
56: }
57: /* Create and fill new matrix */
58: if (scall == MAT_REUSE_MATRIX) {
59: c = (Mat_SeqSBAIJ *)((*B)->data);
61: if (c->mbs!=nrows || c->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
62: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(int),&flag);
63: if (flag == PETSC_FALSE) {
64: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
65: }
66: PetscMemzero(c->ilen,c->mbs*sizeof(int));
67: C = *B;
68: } else {
69: MatCreateSeqSBAIJ(A->comm,bs,nrows*bs,nrows*bs,0,lens,&C);
70: }
71: c = (Mat_SeqSBAIJ *)(C->data);
72: for (i=0; i<nrows; i++) {
73: row = irow[i];
74: kstart = ai[row];
75: kend = kstart + a->ilen[row];
76: mat_i = c->i[i];
77: mat_j = c->j + mat_i;
78: mat_a = c->a + mat_i*bs2;
79: mat_ilen = c->ilen + i;
80: for (k=kstart; k<kend; k++) {
81: if ((tcol=ssmap[a->j[k]])) {
82: *mat_j++ = tcol - 1;
83: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
84: mat_a += bs2;
85: (*mat_ilen)++;
86: }
87: }
88: }
89:
90: /* Free work space */
91: PetscFree(smap);
92: PetscFree(lens);
93: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
94: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
95:
96: ISRestoreIndices(isrow,&irow);
97: *B = C;
98: return(0);
99: }
103: int MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,int cs,MatReuse scall,Mat *B)
104: {
105: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
106: IS is1;
107: int *vary,*iary,*irow,nrows,i,ierr,bs=a->bs,count;
110: if (isrow != iscol) SETERRQ(1,"MatGetSubmatrices_SeqSBAIJ: For symm. format, iscol must equal isro");
111:
112: ISGetIndices(isrow,&irow);
113: ISGetSize(isrow,&nrows);
114:
115: /* Verify if the indices corespond to each element in a block
116: and form the IS with compressed IS */
117: PetscMalloc(2*(a->mbs+1)*sizeof(int),&vary);
118: iary = vary + a->mbs;
119: PetscMemzero(vary,(a->mbs)*sizeof(int));
120: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
121:
122: count = 0;
123: for (i=0; i<a->mbs; i++) {
124: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(1,"Index set does not match blocks");
125: if (vary[i]==bs) iary[count++] = i;
126: }
127: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
128:
129: ISRestoreIndices(isrow,&irow);
130: PetscFree(vary);
132: MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,cs,scall,B);
133: ISDestroy(is1);
134: return(0);
135: }
139: int MatGetSubMatrices_SeqSBAIJ(Mat A,int n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
140: {
141: int ierr,i;
144: if (scall == MAT_INITIAL_MATRIX) {
145: PetscMalloc((n+1)*sizeof(Mat),B);
146: }
148: for (i=0; i<n; i++) {
149: MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],PETSC_DECIDE,scall,&(*B)[i]);
150: }
151: return(0);
152: }
154: /* -------------------------------------------------------*/
155: /* Should check that shapes of vectors and matrices match */
156: /* -------------------------------------------------------*/
157: #include petscblaslapack.h
161: int MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
162: {
163: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
164: PetscScalar *x,*z,*xb,x1,zero=0.0;
165: MatScalar *v;
166: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
169: VecSet(&zero,zz);
170: VecGetArray(xx,&x);
171: VecGetArray(zz,&z);
173: v = a->a;
174: xb = x;
175:
176: for (i=0; i<mbs; i++) {
177: n = ai[1] - ai[0]; /* length of i_th row of A */
178: x1 = xb[0];
179: ib = aj + *ai;
180: jmin = 0;
181: if (*ib == i) { /* (diag of A)*x */
182: z[i] += *v++ * x[*ib++];
183: jmin++;
184: }
185: for (j=jmin; j<n; j++) {
186: cval = *ib;
187: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
188: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
189: }
190: xb++; ai++;
191: }
193: VecRestoreArray(xx,&x);
194: VecRestoreArray(zz,&z);
195: PetscLogFlops(2*(a->s_nz*2 - A->m) - A->m); /* s_nz = (nz+m)/2 */
196: return(0);
197: }
201: int MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
202: {
203: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
204: PetscScalar *x,*z,*xb,x1,x2,zero=0.0;
205: MatScalar *v;
206: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
210: VecSet(&zero,zz);
211: VecGetArray(xx,&x);
212: VecGetArray(zz,&z);
213:
214: v = a->a;
215: xb = x;
217: for (i=0; i<mbs; i++) {
218: n = ai[1] - ai[0]; /* length of i_th block row of A */
219: x1 = xb[0]; x2 = xb[1];
220: ib = aj + *ai;
221: jmin = 0;
222: if (*ib == i){ /* (diag of A)*x */
223: z[2*i] += v[0]*x1 + v[2]*x2;
224: z[2*i+1] += v[2]*x1 + v[3]*x2;
225: v += 4; jmin++;
226: }
227: for (j=jmin; j<n; j++) {
228: /* (strict lower triangular part of A)*x */
229: cval = ib[j]*2;
230: z[cval] += v[0]*x1 + v[1]*x2;
231: z[cval+1] += v[2]*x1 + v[3]*x2;
232: /* (strict upper triangular part of A)*x */
233: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
234: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
235: v += 4;
236: }
237: xb +=2; ai++;
238: }
240: VecRestoreArray(xx,&x);
241: VecRestoreArray(zz,&z);
242: PetscLogFlops(8*(a->s_nz*2 - A->m) - A->m);
243: return(0);
244: }
248: int MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
249: {
250: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
251: PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0;
252: MatScalar *v;
253: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
257: VecSet(&zero,zz);
258: VecGetArray(xx,&x);
259: VecGetArray(zz,&z);
260:
261: v = a->a;
262: xb = x;
264: for (i=0; i<mbs; i++) {
265: n = ai[1] - ai[0]; /* length of i_th block row of A */
266: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
267: ib = aj + *ai;
268: jmin = 0;
269: if (*ib == i){ /* (diag of A)*x */
270: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
271: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
272: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
273: v += 9; jmin++;
274: }
275: for (j=jmin; j<n; j++) {
276: /* (strict lower triangular part of A)*x */
277: cval = ib[j]*3;
278: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
279: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
280: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
281: /* (strict upper triangular part of A)*x */
282: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
283: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
284: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
285: v += 9;
286: }
287: xb +=3; ai++;
288: }
290: VecRestoreArray(xx,&x);
291: VecRestoreArray(zz,&z);
292: PetscLogFlops(18*(a->s_nz*2 - A->m) - A->m);
293: return(0);
294: }
298: int MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
299: {
300: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
301: PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
302: MatScalar *v;
303: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
306: VecSet(&zero,zz);
307: VecGetArray(xx,&x);
308: VecGetArray(zz,&z);
309:
310: v = a->a;
311: xb = x;
313: for (i=0; i<mbs; i++) {
314: n = ai[1] - ai[0]; /* length of i_th block row of A */
315: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
316: ib = aj + *ai;
317: jmin = 0;
318: if (*ib == i){ /* (diag of A)*x */
319: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
320: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
321: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
322: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
323: v += 16; jmin++;
324: }
325: for (j=jmin; j<n; j++) {
326: /* (strict lower triangular part of A)*x */
327: cval = ib[j]*4;
328: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
329: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
330: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
331: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
332: /* (strict upper triangular part of A)*x */
333: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
334: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
335: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
336: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
337: v += 16;
338: }
339: xb +=4; ai++;
340: }
342: VecRestoreArray(xx,&x);
343: VecRestoreArray(zz,&z);
344: PetscLogFlops(32*(a->s_nz*2 - A->m) - A->m);
345: return(0);
346: }
350: int MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
351: {
352: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
353: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
354: MatScalar *v;
355: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
358: VecSet(&zero,zz);
359: VecGetArray(xx,&x);
360: VecGetArray(zz,&z);
361:
362: v = a->a;
363: xb = x;
365: for (i=0; i<mbs; i++) {
366: n = ai[1] - ai[0]; /* length of i_th block row of A */
367: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
368: ib = aj + *ai;
369: jmin = 0;
370: if (*ib == i){ /* (diag of A)*x */
371: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
372: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
373: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
374: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
375: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
376: v += 25; jmin++;
377: }
378: for (j=jmin; j<n; j++) {
379: /* (strict lower triangular part of A)*x */
380: cval = ib[j]*5;
381: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
382: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
383: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
384: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
385: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
386: /* (strict upper triangular part of A)*x */
387: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
388: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
389: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
390: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
391: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
392: v += 25;
393: }
394: xb +=5; ai++;
395: }
397: VecRestoreArray(xx,&x);
398: VecRestoreArray(zz,&z);
399: PetscLogFlops(50*(a->s_nz*2 - A->m) - A->m);
400: return(0);
401: }
406: int MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
407: {
408: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
409: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
410: MatScalar *v;
411: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
414: VecSet(&zero,zz);
415: VecGetArray(xx,&x);
416: VecGetArray(zz,&z);
417:
418: v = a->a;
419: xb = x;
421: for (i=0; i<mbs; i++) {
422: n = ai[1] - ai[0]; /* length of i_th block row of A */
423: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
424: ib = aj + *ai;
425: jmin = 0;
426: if (*ib == i){ /* (diag of A)*x */
427: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
428: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
429: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
430: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
431: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
432: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
433: v += 36; jmin++;
434: }
435: for (j=jmin; j<n; j++) {
436: /* (strict lower triangular part of A)*x */
437: cval = ib[j]*6;
438: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
439: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
440: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
441: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
442: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
443: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
444: /* (strict upper triangular part of A)*x */
445: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
446: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
447: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
448: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
449: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
450: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
451: v += 36;
452: }
453: xb +=6; ai++;
454: }
456: VecRestoreArray(xx,&x);
457: VecRestoreArray(zz,&z);
458: PetscLogFlops(72*(a->s_nz*2 - A->m) - A->m);
459: return(0);
460: }
463: int MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
464: {
465: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
466: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
467: MatScalar *v;
468: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
471: VecSet(&zero,zz);
472: VecGetArray(xx,&x);
473: VecGetArray(zz,&z);
474:
475: v = a->a;
476: xb = x;
478: for (i=0; i<mbs; i++) {
479: n = ai[1] - ai[0]; /* length of i_th block row of A */
480: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
481: ib = aj + *ai;
482: jmin = 0;
483: if (*ib == i){ /* (diag of A)*x */
484: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
485: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
486: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
487: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
488: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
489: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
490: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
491: v += 49; jmin++;
492: }
493: for (j=jmin; j<n; j++) {
494: /* (strict lower triangular part of A)*x */
495: cval = ib[j]*7;
496: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
497: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
498: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
499: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
500: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
501: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
502: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
503: /* (strict upper triangular part of A)*x */
504: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
505: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
506: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
507: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
508: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
509: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
510: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
511: v += 49;
512: }
513: xb +=7; ai++;
514: }
515: VecRestoreArray(xx,&x);
516: VecRestoreArray(zz,&z);
517: PetscLogFlops(98*(a->s_nz*2 - A->m) - A->m);
518: return(0);
519: }
521: /*
522: This will not work with MatScalar == float because it calls the BLAS
523: */
526: int MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
527: {
528: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
529: PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
530: MatScalar *v;
531: int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
532: int ncols,k;
535: VecSet(&zero,zz);
536: VecGetArray(xx,&x); x_ptr=x;
537: VecGetArray(zz,&z); z_ptr=z;
539: aj = a->j;
540: v = a->a;
541: ii = a->i;
543: if (!a->mult_work) {
544: PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);
545: }
546: work = a->mult_work;
547:
548: for (i=0; i<mbs; i++) {
549: n = ii[1] - ii[0]; ncols = n*bs;
550: workt = work; idx=aj+ii[0];
552: /* upper triangular part */
553: for (j=0; j<n; j++) {
554: xb = x_ptr + bs*(*idx++);
555: for (k=0; k<bs; k++) workt[k] = xb[k];
556: workt += bs;
557: }
558: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
559: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
560:
561: /* strict lower triangular part */
562: idx = aj+ii[0];
563: if (*idx == i){
564: ncols -= bs; v += bs2; idx++; n--;
565: }
566:
567: if (ncols > 0){
568: workt = work;
569: PetscMemzero(workt,ncols*sizeof(PetscScalar));
570: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
571: for (j=0; j<n; j++) {
572: zb = z_ptr + bs*(*idx++);
573: for (k=0; k<bs; k++) zb[k] += workt[k] ;
574: workt += bs;
575: }
576: }
577: x += bs; v += n*bs2; z += bs; ii++;
578: }
579:
580: VecRestoreArray(xx,&x);
581: VecRestoreArray(zz,&z);
582: PetscLogFlops(2*(a->s_nz*2 - A->m)*bs2 - A->m);
583: return(0);
584: }
588: int MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
589: {
590: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
591: PetscScalar *x,*y,*z,*xb,x1;
592: MatScalar *v;
593: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
596: VecGetArray(xx,&x);
597: if (yy != xx) {
598: VecGetArray(yy,&y);
599: } else {
600: y = x;
601: }
602: if (zz != yy) {
603: /* VecCopy(yy,zz); */
604: VecGetArray(zz,&z);
605: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
606: } else {
607: z = y;
608: }
610: v = a->a;
611: xb = x;
613: for (i=0; i<mbs; i++) {
614: n = ai[1] - ai[0]; /* length of i_th row of A */
615: x1 = xb[0];
616: ib = aj + *ai;
617: jmin = 0;
618: if (*ib == i) { /* (diag of A)*x */
619: z[i] += *v++ * x[*ib++]; jmin++;
620: }
621: for (j=jmin; j<n; j++) {
622: cval = *ib;
623: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
624: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
625: }
626: xb++; ai++;
627: }
629: VecRestoreArray(xx,&x);
630: if (yy != xx) VecRestoreArray(yy,&y);
631: if (zz != yy) VecRestoreArray(zz,&z);
632:
633: PetscLogFlops(2*(a->s_nz*2 - A->m));
634: return(0);
635: }
639: int MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
640: {
641: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
642: PetscScalar *x,*y,*z,*xb,x1,x2;
643: MatScalar *v;
644: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
647: VecGetArray(xx,&x);
648: if (yy != xx) {
649: VecGetArray(yy,&y);
650: } else {
651: y = x;
652: }
653: if (zz != yy) {
654: /* VecCopy(yy,zz); */
655: VecGetArray(zz,&z);
656: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
657: } else {
658: z = y;
659: }
661: v = a->a;
662: xb = x;
664: for (i=0; i<mbs; i++) {
665: n = ai[1] - ai[0]; /* length of i_th block row of A */
666: x1 = xb[0]; x2 = xb[1];
667: ib = aj + *ai;
668: jmin = 0;
669: if (*ib == i){ /* (diag of A)*x */
670: z[2*i] += v[0]*x1 + v[2]*x2;
671: z[2*i+1] += v[2]*x1 + v[3]*x2;
672: v += 4; jmin++;
673: }
674: for (j=jmin; j<n; j++) {
675: /* (strict lower triangular part of A)*x */
676: cval = ib[j]*2;
677: z[cval] += v[0]*x1 + v[1]*x2;
678: z[cval+1] += v[2]*x1 + v[3]*x2;
679: /* (strict upper triangular part of A)*x */
680: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
681: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
682: v += 4;
683: }
684: xb +=2; ai++;
685: }
687: VecRestoreArray(xx,&x);
688: if (yy != xx) VecRestoreArray(yy,&y);
689: if (zz != yy) VecRestoreArray(zz,&z);
691: PetscLogFlops(4*(a->s_nz*2 - A->m));
692: return(0);
693: }
697: int MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
698: {
699: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
700: PetscScalar *x,*y,*z,*xb,x1,x2,x3;
701: MatScalar *v;
702: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
705: VecGetArray(xx,&x);
706: if (yy != xx) {
707: VecGetArray(yy,&y);
708: } else {
709: y = x;
710: }
711: if (zz != yy) {
712: /* VecCopy(yy,zz); */
713: VecGetArray(zz,&z);
714: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
715: } else {
716: z = y;
717: }
719: v = a->a;
720: xb = x;
722: for (i=0; i<mbs; i++) {
723: n = ai[1] - ai[0]; /* length of i_th block row of A */
724: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
725: ib = aj + *ai;
726: jmin = 0;
727: if (*ib == i){ /* (diag of A)*x */
728: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
729: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
730: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
731: v += 9; jmin++;
732: }
733: for (j=jmin; j<n; j++) {
734: /* (strict lower triangular part of A)*x */
735: cval = ib[j]*3;
736: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
737: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
738: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
739: /* (strict upper triangular part of A)*x */
740: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
741: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
742: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
743: v += 9;
744: }
745: xb +=3; ai++;
746: }
748: VecRestoreArray(xx,&x);
749: if (yy != xx) VecRestoreArray(yy,&y);
750: if (zz != yy) VecRestoreArray(zz,&z);
752: PetscLogFlops(18*(a->s_nz*2 - A->m));
753: return(0);
754: }
758: int MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
759: {
760: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
761: PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4;
762: MatScalar *v;
763: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
766: VecGetArray(xx,&x);
767: if (yy != xx) {
768: VecGetArray(yy,&y);
769: } else {
770: y = x;
771: }
772: if (zz != yy) {
773: /* VecCopy(yy,zz); */
774: VecGetArray(zz,&z);
775: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
776: } else {
777: z = y;
778: }
780: v = a->a;
781: xb = x;
783: for (i=0; i<mbs; i++) {
784: n = ai[1] - ai[0]; /* length of i_th block row of A */
785: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
786: ib = aj + *ai;
787: jmin = 0;
788: if (*ib == i){ /* (diag of A)*x */
789: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
790: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
791: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
792: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
793: v += 16; jmin++;
794: }
795: for (j=jmin; j<n; j++) {
796: /* (strict lower triangular part of A)*x */
797: cval = ib[j]*4;
798: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
799: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
800: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
801: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
802: /* (strict upper triangular part of A)*x */
803: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
804: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
805: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
806: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
807: v += 16;
808: }
809: xb +=4; ai++;
810: }
812: VecRestoreArray(xx,&x);
813: if (yy != xx) VecRestoreArray(yy,&y);
814: if (zz != yy) VecRestoreArray(zz,&z);
816: PetscLogFlops(32*(a->s_nz*2 - A->m));
817: return(0);
818: }
822: int MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
823: {
824: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
825: PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5;
826: MatScalar *v;
827: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
830: VecGetArray(xx,&x);
831: if (yy != xx) {
832: VecGetArray(yy,&y);
833: } else {
834: y = x;
835: }
836: if (zz != yy) {
837: /* VecCopy(yy,zz); */
838: VecGetArray(zz,&z);
839: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
840: } else {
841: z = y;
842: }
844: v = a->a;
845: xb = x;
847: for (i=0; i<mbs; i++) {
848: n = ai[1] - ai[0]; /* length of i_th block row of A */
849: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
850: ib = aj + *ai;
851: jmin = 0;
852: if (*ib == i){ /* (diag of A)*x */
853: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
854: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
855: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
856: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
857: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
858: v += 25; jmin++;
859: }
860: for (j=jmin; j<n; j++) {
861: /* (strict lower triangular part of A)*x */
862: cval = ib[j]*5;
863: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
864: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
865: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
866: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
867: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
868: /* (strict upper triangular part of A)*x */
869: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
870: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
871: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
872: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
873: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
874: v += 25;
875: }
876: xb +=5; ai++;
877: }
879: VecRestoreArray(xx,&x);
880: if (yy != xx) VecRestoreArray(yy,&y);
881: if (zz != yy) VecRestoreArray(zz,&z);
883: PetscLogFlops(50*(a->s_nz*2 - A->m));
884: return(0);
885: }
888: int MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
889: {
890: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
891: PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6;
892: MatScalar *v;
893: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
896: VecGetArray(xx,&x);
897: if (yy != xx) {
898: VecGetArray(yy,&y);
899: } else {
900: y = x;
901: }
902: if (zz != yy) {
903: /* VecCopy(yy,zz); */
904: VecGetArray(zz,&z);
905: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
906: } else {
907: z = y;
908: }
910: v = a->a;
911: xb = x;
913: for (i=0; i<mbs; i++) {
914: n = ai[1] - ai[0]; /* length of i_th block row of A */
915: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
916: ib = aj + *ai;
917: jmin = 0;
918: if (*ib == i){ /* (diag of A)*x */
919: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
920: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
921: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
922: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
923: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
924: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
925: v += 36; jmin++;
926: }
927: for (j=jmin; j<n; j++) {
928: /* (strict lower triangular part of A)*x */
929: cval = ib[j]*6;
930: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
931: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
932: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
933: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
934: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
935: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
936: /* (strict upper triangular part of A)*x */
937: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
938: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
939: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
940: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
941: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
942: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
943: v += 36;
944: }
945: xb +=6; ai++;
946: }
948: VecRestoreArray(xx,&x);
949: if (yy != xx) VecRestoreArray(yy,&y);
950: if (zz != yy) VecRestoreArray(zz,&z);
952: PetscLogFlops(72*(a->s_nz*2 - A->m));
953: return(0);
954: }
958: int MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
959: {
960: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
961: PetscScalar *x,*y,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
962: MatScalar *v;
963: int mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,ierr,*ib,cval,j,jmin;
966: VecGetArray(xx,&x);
967: if (yy != xx) {
968: VecGetArray(yy,&y);
969: } else {
970: y = x;
971: }
972: if (zz != yy) {
973: /* VecCopy(yy,zz); */
974: VecGetArray(zz,&z);
975: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
976: } else {
977: z = y;
978: }
980: v = a->a;
981: xb = x;
983: for (i=0; i<mbs; i++) {
984: n = ai[1] - ai[0]; /* length of i_th block row of A */
985: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
986: ib = aj + *ai;
987: jmin = 0;
988: if (*ib == i){ /* (diag of A)*x */
989: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
990: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
991: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
992: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
993: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
994: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
995: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
996: v += 49; jmin++;
997: }
998: for (j=jmin; j<n; j++) {
999: /* (strict lower triangular part of A)*x */
1000: cval = ib[j]*7;
1001: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1002: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1003: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1004: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1005: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1006: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1007: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1008: /* (strict upper triangular part of A)*x */
1009: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1010: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1011: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1012: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1013: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1014: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1015: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1016: v += 49;
1017: }
1018: xb +=7; ai++;
1019: }
1021: VecRestoreArray(xx,&x);
1022: if (yy != xx) VecRestoreArray(yy,&y);
1023: if (zz != yy) VecRestoreArray(zz,&z);
1025: PetscLogFlops(98*(a->s_nz*2 - A->m));
1026: return(0);
1027: }
1031: int MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1032: {
1033: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1034: PetscScalar *x,*x_ptr,*y,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1035: MatScalar *v;
1036: int ierr,mbs=a->mbs,i,*idx,*aj,*ii,bs=a->bs,j,n,bs2=a->bs2;
1037: int ncols,k;
1040: VecGetArray(xx,&x); x_ptr=x;
1041: if (yy != xx) {
1042: VecGetArray(yy,&y);
1043: } else {
1044: y = x;
1045: }
1046: if (zz != yy) {
1047: /* VecCopy(yy,zz); */
1048: VecGetArray(zz,&z); z_ptr=z;
1049: PetscMemcpy(z,y,yy->n*sizeof(PetscScalar));
1050: } else {
1051: z = y;
1052: }
1054: aj = a->j;
1055: v = a->a;
1056: ii = a->i;
1058: if (!a->mult_work) {
1059: PetscMalloc((A->m+1)*sizeof(PetscScalar),&a->mult_work);
1060: }
1061: work = a->mult_work;
1062:
1063:
1064: for (i=0; i<mbs; i++) {
1065: n = ii[1] - ii[0]; ncols = n*bs;
1066: workt = work; idx=aj+ii[0];
1068: /* upper triangular part */
1069: for (j=0; j<n; j++) {
1070: xb = x_ptr + bs*(*idx++);
1071: for (k=0; k<bs; k++) workt[k] = xb[k];
1072: workt += bs;
1073: }
1074: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1075: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1077: /* strict lower triangular part */
1078: idx = aj+ii[0];
1079: if (*idx == i){
1080: ncols -= bs; v += bs2; idx++; n--;
1081: }
1082: if (ncols > 0){
1083: workt = work;
1084: PetscMemzero(workt,ncols*sizeof(PetscScalar));
1085: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1086: for (j=0; j<n; j++) {
1087: zb = z_ptr + bs*(*idx++);
1088: /* idx++; */
1089: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1090: workt += bs;
1091: }
1092: }
1094: x += bs; v += n*bs2; z += bs; ii++;
1095: }
1097: VecRestoreArray(xx,&x);
1098: if (yy != xx) VecRestoreArray(yy,&y);
1099: if (zz != yy) VecRestoreArray(zz,&z);
1101: PetscLogFlops(2*(a->s_nz*2 - A->m));
1102: return(0);
1103: }
1107: int MatMultTranspose_SeqSBAIJ(Mat A,Vec xx,Vec zz)
1108: {
1110: SETERRQ(1,"Matrix is symmetric. Call MatMult().");
1111: /* return(0); */
1112: }
1116: int MatMultTransposeAdd_SeqSBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1118: {
1120: SETERRQ(1,"Matrix is symmetric. Call MatMultAdd().");
1121: /* return(0); */
1122: }
1126: int MatScale_SeqSBAIJ(const PetscScalar *alpha,Mat inA)
1127: {
1128: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1129: int one = 1,totalnz = a->bs2*a->s_nz;
1132: BLscal_(&totalnz,(PetscScalar*)alpha,a->a,&one);
1133: PetscLogFlops(totalnz);
1134: return(0);
1135: }
1139: int MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1140: {
1141: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1142: MatScalar *v = a->a;
1143: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1144: int i,j,k,bs = a->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1145: int *jl,*il,jmin,jmax,ierr,nexti,ik,*col;
1146:
1148: if (type == NORM_FROBENIUS) {
1149: for (k=0; k<mbs; k++){
1150: jmin = a->i[k]; jmax = a->i[k+1];
1151: col = aj + jmin;
1152: if (*col == k){ /* diagonal block */
1153: for (i=0; i<bs2; i++){
1154: #if defined(PETSC_USE_COMPLEX)
1155: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1156: #else
1157: sum_diag += (*v)*(*v); v++;
1158: #endif
1159: }
1160: jmin++;
1161: }
1162: for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */
1163: for (i=0; i<bs2; i++){
1164: #if defined(PETSC_USE_COMPLEX)
1165: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1166: #else
1167: sum_off += (*v)*(*v); v++;
1168: #endif
1169: }
1170: }
1171: }
1172: *norm = sqrt(sum_diag + 2*sum_off);
1174: } else if (type == NORM_INFINITY) { /* maximum row sum */
1175: PetscMalloc(mbs*sizeof(int),&il);
1176: PetscMalloc(mbs*sizeof(int),&jl);
1177: PetscMalloc(bs*sizeof(PetscReal),&sum);
1178: for (i=0; i<mbs; i++) {
1179: jl[i] = mbs; il[0] = 0;
1180: }
1182: *norm = 0.0;
1183: for (k=0; k<mbs; k++) { /* k_th block row */
1184: for (j=0; j<bs; j++) sum[j]=0.0;
1186: /*-- col sum --*/
1187: i = jl[k]; /* first |A(i,k)| to be added */
1188: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1189: at step k */
1190: while (i<mbs){
1191: nexti = jl[i]; /* next block row to be added */
1192: ik = il[i]; /* block index of A(i,k) in the array a */
1193: for (j=0; j<bs; j++){
1194: v = a->a + ik*bs2 + j*bs;
1195: for (k1=0; k1<bs; k1++) {
1196: sum[j] += PetscAbsScalar(*v); v++;
1197: }
1198: }
1199: /* update il, jl */
1200: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1201: jmax = a->i[i+1];
1202: if (jmin < jmax){
1203: il[i] = jmin;
1204: j = a->j[jmin];
1205: jl[i] = jl[j]; jl[j]=i;
1206: }
1207: i = nexti;
1208: }
1209:
1210: /*-- row sum --*/
1211: jmin = a->i[k]; jmax = a->i[k+1];
1212: for (i=jmin; i<jmax; i++) {
1213: for (j=0; j<bs; j++){
1214: v = a->a + i*bs2 + j;
1215: for (k1=0; k1<bs; k1++){
1216: sum[j] += PetscAbsScalar(*v);
1217: v += bs;
1218: }
1219: }
1220: }
1221: /* add k_th block row to il, jl */
1222: col = aj+jmin;
1223: if (*col == k) jmin++;
1224: if (jmin < jmax){
1225: il[k] = jmin;
1226: j = a->j[jmin];
1227: jl[k] = jl[j]; jl[j] = k;
1228: }
1229: for (j=0; j<bs; j++){
1230: if (sum[j] > *norm) *norm = sum[j];
1231: }
1232: }
1233: PetscFree(il);
1234: PetscFree(jl);
1235: PetscFree(sum);
1236: } else {
1237: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1238: }
1239: return(0);
1240: }
1244: int MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1245: {
1246: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1247: int ierr;
1251: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1252: if ((A->m != B->m) || (A->n != B->n) || (a->bs != b->bs)|| (a->s_nz != b->s_nz)) {
1253: *flg = PETSC_FALSE;
1254: return(0);
1255: }
1256:
1257: /* if the a->i are the same */
1258: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(int),flg);
1259: if (*flg == PETSC_FALSE) {
1260: return(0);
1261: }
1262:
1263: /* if a->j are the same */
1264: PetscMemcmp(a->j,b->j,(a->s_nz)*sizeof(int),flg);
1265: if (*flg == PETSC_FALSE) {
1266: return(0);
1267: }
1268: /* if a->a are the same */
1269: PetscMemcmp(a->a,b->a,(a->s_nz)*(a->bs)*(a->bs)*sizeof(PetscScalar),flg);
1270:
1271: return(0);
1272: }
1276: int MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1277: {
1278: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1279: int ierr,i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1280: PetscScalar *x,zero = 0.0;
1281: MatScalar *aa,*aa_j;
1284: bs = a->bs;
1285: if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1286:
1287: aa = a->a;
1288: ai = a->i;
1289: aj = a->j;
1290: ambs = a->mbs;
1291: bs2 = a->bs2;
1293: VecSet(&zero,v);
1294: VecGetArray(v,&x);
1295: VecGetLocalSize(v,&n);
1296: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1297: for (i=0; i<ambs; i++) {
1298: j=ai[i];
1299: if (aj[j] == i) { /* if this is a diagonal element */
1300: row = i*bs;
1301: aa_j = aa + j*bs2;
1302: if (A->factor && bs==1){
1303: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1304: } else {
1305: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1306: }
1307: }
1308: }
1309:
1310: VecRestoreArray(v,&x);
1311: return(0);
1312: }
1316: int MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1317: {
1318: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1319: PetscScalar *l,*r,x,*li,*ri;
1320: MatScalar *aa,*v;
1321: int ierr,i,j,k,lm,rn,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1324: ai = a->i;
1325: aj = a->j;
1326: aa = a->a;
1327: m = A->m;
1328: bs = a->bs;
1329: mbs = a->mbs;
1330: bs2 = a->bs2;
1332: if (ll != rr) {
1333: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1334: }
1335: if (ll) {
1336: VecGetArray(ll,&l);
1337: VecGetLocalSize(ll,&lm);
1338: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1339: for (i=0; i<mbs; i++) { /* for each block row */
1340: M = ai[i+1] - ai[i];
1341: li = l + i*bs;
1342: v = aa + bs2*ai[i];
1343: for (j=0; j<M; j++) { /* for each block */
1344: for (k=0; k<bs2; k++) {
1345: (*v++) *= li[k%bs];
1346: }
1347: #ifdef CONT
1348: /* will be used to replace the above loop */
1349: ri = l + bs*aj[ai[i]+j];
1350: for (k=0; k<bs; k++) { /* column value */
1351: x = ri[k];
1352: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1353: }
1354: #endif
1356: }
1357: }
1358: VecRestoreArray(ll,&l);
1359: PetscLogFlops(2*a->s_nz);
1360: }
1361: /* will be deleted */
1362: if (rr) {
1363: VecGetArray(rr,&r);
1364: VecGetLocalSize(rr,&rn);
1365: if (rn != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
1366: for (i=0; i<mbs; i++) { /* for each block row */
1367: M = ai[i+1] - ai[i];
1368: v = aa + bs2*ai[i];
1369: for (j=0; j<M; j++) { /* for each block */
1370: ri = r + bs*aj[ai[i]+j];
1371: for (k=0; k<bs; k++) {
1372: x = ri[k];
1373: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
1374: }
1375: }
1376: }
1377: VecRestoreArray(rr,&r);
1378: PetscLogFlops(a->s_nz);
1379: }
1380: return(0);
1381: }
1385: int MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1386: {
1387: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1390: info->rows_global = (double)A->m;
1391: info->columns_global = (double)A->m;
1392: info->rows_local = (double)A->m;
1393: info->columns_local = (double)A->m;
1394: info->block_size = a->bs2;
1395: info->nz_allocated = a->s_maxnz; /*num. of nonzeros in upper triangular part */
1396: info->nz_used = a->bs2*a->s_nz; /*num. of nonzeros in upper triangular part */
1397: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1398: info->assemblies = A->num_ass;
1399: info->mallocs = a->reallocs;
1400: info->memory = A->mem;
1401: if (A->factor) {
1402: info->fill_ratio_given = A->info.fill_ratio_given;
1403: info->fill_ratio_needed = A->info.fill_ratio_needed;
1404: info->factor_mallocs = A->info.factor_mallocs;
1405: } else {
1406: info->fill_ratio_given = 0;
1407: info->fill_ratio_needed = 0;
1408: info->factor_mallocs = 0;
1409: }
1410: return(0);
1411: }
1416: int MatZeroEntries_SeqSBAIJ(Mat A)
1417: {
1418: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1419: int ierr;
1422: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1423: return(0);
1424: }
1428: int MatGetRowMax_SeqSBAIJ(Mat A,Vec v)
1429: {
1430: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1431: int ierr,i,j,n,row,col,bs,*ai,*aj,mbs;
1432: PetscReal atmp;
1433: MatScalar *aa;
1434: PetscScalar zero = 0.0,*x;
1435: int ncols,brow,bcol,krow,kcol;
1438: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1439: bs = a->bs;
1440: aa = a->a;
1441: ai = a->i;
1442: aj = a->j;
1443: mbs = a->mbs;
1445: VecSet(&zero,v);
1446: VecGetArray(v,&x);
1447: VecGetLocalSize(v,&n);
1448: if (n != A->m) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1449: for (i=0; i<mbs; i++) {
1450: ncols = ai[1] - ai[0]; ai++;
1451: brow = bs*i;
1452: for (j=0; j<ncols; j++){
1453: bcol = bs*(*aj);
1454: for (kcol=0; kcol<bs; kcol++){
1455: col = bcol + kcol; /* col index */
1456: for (krow=0; krow<bs; krow++){
1457: atmp = PetscAbsScalar(*aa); aa++;
1458: row = brow + krow; /* row index */
1459: /* printf("val[%d,%d]: %g\n",row,col,atmp); */
1460: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1461: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1462: }
1463: }
1464: aj++;
1465: }
1466: }
1467: VecRestoreArray(v,&x);
1468: return(0);
1469: }