Actual source code: sbaij2.c
petsc-3.7.0 2016-04-25
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <../src/mat/impls/sbaij/seq/sbaij.h>
6: #include <petscblaslapack.h>
10: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11: {
12: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
14: PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
15: const PetscInt *idx;
16: PetscBT table_out,table_in;
19: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
20: mbs = a->mbs;
21: ai = a->i;
22: aj = a->j;
23: bs = A->rmap->bs;
24: PetscBTCreate(mbs,&table_out);
25: PetscMalloc1(mbs+1,&nidx);
26: PetscMalloc1(A->rmap->N+1,&nidx2);
27: PetscBTCreate(mbs,&table_in);
29: for (i=0; i<is_max; i++) { /* for each is */
30: isz = 0;
31: PetscBTMemzero(mbs,table_out);
33: /* Extract the indices, assume there can be duplicate entries */
34: ISGetIndices(is[i],&idx);
35: ISGetLocalSize(is[i],&n);
37: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
38: bcol_max = 0;
39: for (j=0; j<n; ++j) {
40: brow = idx[j]/bs; /* convert the indices into block indices */
41: if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
42: if (!PetscBTLookupSet(table_out,brow)) {
43: nidx[isz++] = brow;
44: if (bcol_max < brow) bcol_max = brow;
45: }
46: }
47: ISRestoreIndices(is[i],&idx);
48: ISDestroy(&is[i]);
50: k = 0;
51: for (j=0; j<ov; j++) { /* for each overlap */
52: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
53: PetscBTMemzero(mbs,table_in);
54: for (l=k; l<isz; l++) { PetscBTSet(table_in,nidx[l]); }
56: n = isz; /* length of the updated is[i] */
57: for (brow=0; brow<mbs; brow++) {
58: start = ai[brow]; end = ai[brow+1];
59: if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
60: for (l = start; l<end; l++) {
61: bcol = aj[l];
62: if (!PetscBTLookupSet(table_out,bcol)) {
63: nidx[isz++] = bcol;
64: if (bcol_max < bcol) bcol_max = bcol;
65: }
66: }
67: k++;
68: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
69: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
70: for (l = start; l<end; l++) {
71: bcol = aj[l];
72: if (bcol > bcol_max) break;
73: if (PetscBTLookup(table_in,bcol)) {
74: if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
75: break; /* for l = start; l<end ; l++) */
76: }
77: }
78: }
79: }
80: } /* for each overlap */
82: /* expand the Index Set */
83: for (j=0; j<isz; j++) {
84: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
85: }
86: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
87: }
88: PetscBTDestroy(&table_out);
89: PetscFree(nidx);
90: PetscFree(nidx2);
91: PetscBTDestroy(&table_in);
92: return(0);
93: }
95: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
96: Zero some ops' to avoid invalid usse */
99: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
100: {
104: MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);
105: Bseq->ops->mult = 0;
106: Bseq->ops->multadd = 0;
107: Bseq->ops->multtranspose = 0;
108: Bseq->ops->multtransposeadd = 0;
109: Bseq->ops->lufactor = 0;
110: Bseq->ops->choleskyfactor = 0;
111: Bseq->ops->lufactorsymbolic = 0;
112: Bseq->ops->choleskyfactorsymbolic = 0;
113: Bseq->ops->getinertia = 0;
114: return(0);
115: }
119: PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
120: {
124: MatGetSubMatrix_SeqBAIJ(A,isrow,iscol,scall,B);
126: if (isrow != iscol) {
127: PetscBool isequal;
128: ISEqual(isrow,iscol,&isequal);
129: if (!isequal) {
130: MatSeqSBAIJZeroOps_Private(*B);
131: }
132: }
133: return(0);
134: }
138: PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
139: {
141: PetscInt i;
142: PetscBool flg;
145: MatGetSubMatrices_SeqBAIJ(A,n,irow,icol,scall,B);
146: for (i=0; i<n; i++) {
147: ISEqual(irow[i],icol[i],&flg);
148: if (!flg) {
149: MatSeqSBAIJZeroOps_Private(*B[i]);
150: }
151: }
152: return(0);
153: }
155: /* -------------------------------------------------------*/
156: /* Should check that shapes of vectors and matrices match */
157: /* -------------------------------------------------------*/
161: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
162: {
163: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
164: PetscScalar *z,x1,x2,zero=0.0;
165: const PetscScalar *x,*xb;
166: const MatScalar *v;
167: PetscErrorCode ierr;
168: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
169: const PetscInt *aj=a->j,*ai=a->i,*ib;
170: PetscInt nonzerorow=0;
173: VecSet(zz,zero);
174: VecGetArrayRead(xx,&x);
175: VecGetArray(zz,&z);
177: v = a->a;
178: xb = x;
180: for (i=0; i<mbs; i++) {
181: n = ai[1] - ai[0]; /* length of i_th block row of A */
182: x1 = xb[0]; x2 = xb[1];
183: ib = aj + *ai;
184: jmin = 0;
185: nonzerorow += (n>0);
186: if (*ib == i) { /* (diag of A)*x */
187: z[2*i] += v[0]*x1 + v[2]*x2;
188: z[2*i+1] += v[2]*x1 + v[3]*x2;
189: v += 4; jmin++;
190: }
191: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
192: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
193: for (j=jmin; j<n; j++) {
194: /* (strict lower triangular part of A)*x */
195: cval = ib[j]*2;
196: z[cval] += v[0]*x1 + v[1]*x2;
197: z[cval+1] += v[2]*x1 + v[3]*x2;
198: /* (strict upper triangular part of A)*x */
199: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
200: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
201: v += 4;
202: }
203: xb +=2; ai++;
204: }
206: VecRestoreArrayRead(xx,&x);
207: VecRestoreArray(zz,&z);
208: PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
209: return(0);
210: }
214: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
215: {
216: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
217: PetscScalar *z,x1,x2,x3,zero=0.0;
218: const PetscScalar *x,*xb;
219: const MatScalar *v;
220: PetscErrorCode ierr;
221: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
222: const PetscInt *aj = a->j,*ai = a->i,*ib;
223: PetscInt nonzerorow=0;
226: VecSet(zz,zero);
227: VecGetArrayRead(xx,&x);
228: VecGetArray(zz,&z);
230: v = a->a;
231: xb = x;
233: for (i=0; i<mbs; i++) {
234: n = ai[1] - ai[0]; /* length of i_th block row of A */
235: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
236: ib = aj + *ai;
237: jmin = 0;
238: nonzerorow += (n>0);
239: if (*ib == i) { /* (diag of A)*x */
240: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
241: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
242: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
243: v += 9; jmin++;
244: }
245: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
246: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
247: for (j=jmin; j<n; j++) {
248: /* (strict lower triangular part of A)*x */
249: cval = ib[j]*3;
250: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
251: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
252: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
253: /* (strict upper triangular part of A)*x */
254: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
255: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
256: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
257: v += 9;
258: }
259: xb +=3; ai++;
260: }
262: VecRestoreArrayRead(xx,&x);
263: VecRestoreArray(zz,&z);
264: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
265: return(0);
266: }
270: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
271: {
272: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
273: PetscScalar *z,x1,x2,x3,x4,zero=0.0;
274: const PetscScalar *x,*xb;
275: const MatScalar *v;
276: PetscErrorCode ierr;
277: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
278: const PetscInt *aj = a->j,*ai = a->i,*ib;
279: PetscInt nonzerorow = 0;
282: VecSet(zz,zero);
283: VecGetArrayRead(xx,&x);
284: VecGetArray(zz,&z);
286: v = a->a;
287: xb = x;
289: for (i=0; i<mbs; i++) {
290: n = ai[1] - ai[0]; /* length of i_th block row of A */
291: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
292: ib = aj + *ai;
293: jmin = 0;
294: nonzerorow += (n>0);
295: if (*ib == i) { /* (diag of A)*x */
296: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
297: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
298: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
299: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
300: v += 16; jmin++;
301: }
302: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
303: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
304: for (j=jmin; j<n; j++) {
305: /* (strict lower triangular part of A)*x */
306: cval = ib[j]*4;
307: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
308: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
309: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
310: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
311: /* (strict upper triangular part of A)*x */
312: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
313: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
314: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
315: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
316: v += 16;
317: }
318: xb +=4; ai++;
319: }
321: VecRestoreArrayRead(xx,&x);
322: VecRestoreArray(zz,&z);
323: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
324: return(0);
325: }
329: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
330: {
331: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
332: PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0;
333: const PetscScalar *x,*xb;
334: const MatScalar *v;
335: PetscErrorCode ierr;
336: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
337: const PetscInt *aj = a->j,*ai = a->i,*ib;
338: PetscInt nonzerorow=0;
341: VecSet(zz,zero);
342: VecGetArrayRead(xx,&x);
343: VecGetArray(zz,&z);
345: v = a->a;
346: xb = x;
348: for (i=0; i<mbs; i++) {
349: n = ai[1] - ai[0]; /* length of i_th block row of A */
350: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
351: ib = aj + *ai;
352: jmin = 0;
353: nonzerorow += (n>0);
354: if (*ib == i) { /* (diag of A)*x */
355: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
356: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
357: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
358: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
359: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
360: v += 25; jmin++;
361: }
362: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
363: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
364: for (j=jmin; j<n; j++) {
365: /* (strict lower triangular part of A)*x */
366: cval = ib[j]*5;
367: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
368: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
369: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
370: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
371: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
372: /* (strict upper triangular part of A)*x */
373: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
374: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
375: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
376: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
377: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
378: v += 25;
379: }
380: xb +=5; ai++;
381: }
383: VecRestoreArrayRead(xx,&x);
384: VecRestoreArray(zz,&z);
385: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
386: return(0);
387: }
392: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
393: {
394: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
395: PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0;
396: const PetscScalar *x,*xb;
397: const MatScalar *v;
398: PetscErrorCode ierr;
399: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
400: const PetscInt *aj=a->j,*ai=a->i,*ib;
401: PetscInt nonzerorow=0;
404: VecSet(zz,zero);
405: VecGetArrayRead(xx,&x);
406: VecGetArray(zz,&z);
408: v = a->a;
409: xb = x;
411: for (i=0; i<mbs; i++) {
412: n = ai[1] - ai[0]; /* length of i_th block row of A */
413: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
414: ib = aj + *ai;
415: jmin = 0;
416: nonzerorow += (n>0);
417: if (*ib == i) { /* (diag of A)*x */
418: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
419: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
420: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
421: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
422: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
423: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
424: v += 36; jmin++;
425: }
426: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
427: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
428: for (j=jmin; j<n; j++) {
429: /* (strict lower triangular part of A)*x */
430: cval = ib[j]*6;
431: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
432: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
433: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
434: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
435: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
436: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
437: /* (strict upper triangular part of A)*x */
438: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
439: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
440: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
441: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
442: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
443: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
444: v += 36;
445: }
446: xb +=6; ai++;
447: }
449: VecRestoreArrayRead(xx,&x);
450: VecRestoreArray(zz,&z);
451: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
452: return(0);
453: }
456: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
457: {
458: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
459: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
460: const PetscScalar *x,*xb;
461: const MatScalar *v;
462: PetscErrorCode ierr;
463: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
464: const PetscInt *aj=a->j,*ai=a->i,*ib;
465: PetscInt nonzerorow=0;
468: VecSet(zz,zero);
469: VecGetArrayRead(xx,&x);
470: VecGetArray(zz,&z);
472: v = a->a;
473: xb = x;
475: for (i=0; i<mbs; i++) {
476: n = ai[1] - ai[0]; /* length of i_th block row of A */
477: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
478: ib = aj + *ai;
479: jmin = 0;
480: nonzerorow += (n>0);
481: if (*ib == i) { /* (diag of A)*x */
482: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
483: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
484: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
485: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
486: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
487: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
488: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
489: v += 49; jmin++;
490: }
491: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
492: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
493: for (j=jmin; j<n; j++) {
494: /* (strict lower triangular part of A)*x */
495: cval = ib[j]*7;
496: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
497: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
498: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
499: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
500: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
501: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
502: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
503: /* (strict upper triangular part of A)*x */
504: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
505: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
506: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
507: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
508: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
509: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
510: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
511: v += 49;
512: }
513: xb +=7; ai++;
514: }
515: VecRestoreArrayRead(xx,&x);
516: VecRestoreArray(zz,&z);
517: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
518: return(0);
519: }
521: /*
522: This will not work with MatScalar == float because it calls the BLAS
523: */
526: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
527: {
528: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
529: PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0;
530: const PetscScalar *x,*x_ptr,*xb;
531: const MatScalar *v;
532: PetscErrorCode ierr;
533: PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
534: const PetscInt *idx,*aj,*ii;
535: PetscInt nonzerorow=0;
538: VecSet(zz,zero);
539: VecGetArrayRead(xx,&x);x_ptr = x;
540: VecGetArray(zz,&z); z_ptr=z;
542: aj = a->j;
543: v = a->a;
544: ii = a->i;
546: if (!a->mult_work) {
547: PetscMalloc1(A->rmap->N+1,&a->mult_work);
548: }
549: work = a->mult_work;
551: for (i=0; i<mbs; i++) {
552: n = ii[1] - ii[0]; ncols = n*bs;
553: workt = work; idx=aj+ii[0];
554: nonzerorow += (n>0);
556: /* upper triangular part */
557: for (j=0; j<n; j++) {
558: xb = x_ptr + bs*(*idx++);
559: for (k=0; k<bs; k++) workt[k] = xb[k];
560: workt += bs;
561: }
562: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
563: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
565: /* strict lower triangular part */
566: idx = aj+ii[0];
567: if (*idx == i) {
568: ncols -= bs; v += bs2; idx++; n--;
569: }
571: if (ncols > 0) {
572: workt = work;
573: PetscMemzero(workt,ncols*sizeof(PetscScalar));
574: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
575: for (j=0; j<n; j++) {
576: zb = z_ptr + bs*(*idx++);
577: for (k=0; k<bs; k++) zb[k] += workt[k];
578: workt += bs;
579: }
580: }
581: x += bs; v += n*bs2; z += bs; ii++;
582: }
584: VecRestoreArrayRead(xx,&x);
585: VecRestoreArray(zz,&z);
586: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
587: return(0);
588: }
592: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
593: {
594: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
595: PetscScalar *z,x1;
596: const PetscScalar *x,*xb;
597: const MatScalar *v;
598: PetscErrorCode ierr;
599: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
600: const PetscInt *aj=a->j,*ai=a->i,*ib;
601: PetscInt nonzerorow=0;
604: VecCopy(yy,zz);
605: VecGetArrayRead(xx,&x);
606: VecGetArray(zz,&z);
607: v = a->a;
608: xb = x;
610: for (i=0; i<mbs; i++) {
611: n = ai[1] - ai[0]; /* length of i_th row of A */
612: x1 = xb[0];
613: ib = aj + *ai;
614: jmin = 0;
615: nonzerorow += (n>0);
616: if (*ib == i) { /* (diag of A)*x */
617: z[i] += *v++ * x[*ib++]; jmin++;
618: }
619: for (j=jmin; j<n; j++) {
620: cval = *ib;
621: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
622: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
623: }
624: xb++; ai++;
625: }
627: VecRestoreArrayRead(xx,&x);
628: VecRestoreArray(zz,&z);
630: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
631: return(0);
632: }
636: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
637: {
638: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
639: PetscScalar *z,x1,x2;
640: const PetscScalar *x,*xb;
641: const MatScalar *v;
642: PetscErrorCode ierr;
643: PetscInt mbs =a->mbs,i,n,cval,j,jmin;
644: const PetscInt *aj=a->j,*ai=a->i,*ib;
645: PetscInt nonzerorow=0;
648: VecCopy(yy,zz);
649: VecGetArrayRead(xx,&x);
650: VecGetArray(zz,&z);
652: v = a->a;
653: xb = x;
655: for (i=0; i<mbs; i++) {
656: n = ai[1] - ai[0]; /* length of i_th block row of A */
657: x1 = xb[0]; x2 = xb[1];
658: ib = aj + *ai;
659: jmin = 0;
660: nonzerorow += (n>0);
661: if (*ib == i) { /* (diag of A)*x */
662: z[2*i] += v[0]*x1 + v[2]*x2;
663: z[2*i+1] += v[2]*x1 + v[3]*x2;
664: v += 4; jmin++;
665: }
666: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
667: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
668: for (j=jmin; j<n; j++) {
669: /* (strict lower triangular part of A)*x */
670: cval = ib[j]*2;
671: z[cval] += v[0]*x1 + v[1]*x2;
672: z[cval+1] += v[2]*x1 + v[3]*x2;
673: /* (strict upper triangular part of A)*x */
674: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
675: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
676: v += 4;
677: }
678: xb +=2; ai++;
679: }
680: VecRestoreArrayRead(xx,&x);
681: VecRestoreArray(zz,&z);
683: PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
684: return(0);
685: }
689: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
690: {
691: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
692: PetscScalar *z,x1,x2,x3;
693: const PetscScalar *x,*xb;
694: const MatScalar *v;
695: PetscErrorCode ierr;
696: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
697: const PetscInt *aj=a->j,*ai=a->i,*ib;
698: PetscInt nonzerorow=0;
701: VecCopy(yy,zz);
702: VecGetArrayRead(xx,&x);
703: VecGetArray(zz,&z);
705: v = a->a;
706: xb = x;
708: for (i=0; i<mbs; i++) {
709: n = ai[1] - ai[0]; /* length of i_th block row of A */
710: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
711: ib = aj + *ai;
712: jmin = 0;
713: nonzerorow += (n>0);
714: if (*ib == i) { /* (diag of A)*x */
715: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
716: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
717: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
718: v += 9; jmin++;
719: }
720: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
721: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
722: for (j=jmin; j<n; j++) {
723: /* (strict lower triangular part of A)*x */
724: cval = ib[j]*3;
725: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
726: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
727: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
728: /* (strict upper triangular part of A)*x */
729: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
730: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
731: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
732: v += 9;
733: }
734: xb +=3; ai++;
735: }
737: VecRestoreArrayRead(xx,&x);
738: VecRestoreArray(zz,&z);
740: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
741: return(0);
742: }
746: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
747: {
748: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
749: PetscScalar *z,x1,x2,x3,x4;
750: const PetscScalar *x,*xb;
751: const MatScalar *v;
752: PetscErrorCode ierr;
753: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
754: const PetscInt *aj=a->j,*ai=a->i,*ib;
755: PetscInt nonzerorow=0;
758: VecCopy(yy,zz);
759: VecGetArrayRead(xx,&x);
760: VecGetArray(zz,&z);
762: v = a->a;
763: xb = x;
765: for (i=0; i<mbs; i++) {
766: n = ai[1] - ai[0]; /* length of i_th block row of A */
767: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
768: ib = aj + *ai;
769: jmin = 0;
770: nonzerorow += (n>0);
771: if (*ib == i) { /* (diag of A)*x */
772: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
773: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
774: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
775: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
776: v += 16; jmin++;
777: }
778: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
779: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
780: for (j=jmin; j<n; j++) {
781: /* (strict lower triangular part of A)*x */
782: cval = ib[j]*4;
783: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
784: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
785: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
786: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
787: /* (strict upper triangular part of A)*x */
788: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
789: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
790: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
791: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
792: v += 16;
793: }
794: xb +=4; ai++;
795: }
797: VecRestoreArrayRead(xx,&x);
798: VecRestoreArray(zz,&z);
800: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
801: return(0);
802: }
806: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
807: {
808: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
809: PetscScalar *z,x1,x2,x3,x4,x5;
810: const PetscScalar *x,*xb;
811: const MatScalar *v;
812: PetscErrorCode ierr;
813: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
814: const PetscInt *aj=a->j,*ai=a->i,*ib;
815: PetscInt nonzerorow=0;
818: VecCopy(yy,zz);
819: VecGetArrayRead(xx,&x);
820: VecGetArray(zz,&z);
822: v = a->a;
823: xb = x;
825: for (i=0; i<mbs; i++) {
826: n = ai[1] - ai[0]; /* length of i_th block row of A */
827: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
828: ib = aj + *ai;
829: jmin = 0;
830: nonzerorow += (n>0);
831: if (*ib == i) { /* (diag of A)*x */
832: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
833: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
834: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
835: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
836: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
837: v += 25; jmin++;
838: }
839: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
840: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
841: for (j=jmin; j<n; j++) {
842: /* (strict lower triangular part of A)*x */
843: cval = ib[j]*5;
844: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
845: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
846: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
847: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
848: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
849: /* (strict upper triangular part of A)*x */
850: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
851: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
852: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
853: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
854: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
855: v += 25;
856: }
857: xb +=5; ai++;
858: }
860: VecRestoreArrayRead(xx,&x);
861: VecRestoreArray(zz,&z);
863: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
864: return(0);
865: }
868: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
869: {
870: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
871: PetscScalar *z,x1,x2,x3,x4,x5,x6;
872: const PetscScalar *x,*xb;
873: const MatScalar *v;
874: PetscErrorCode ierr;
875: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
876: const PetscInt *aj=a->j,*ai=a->i,*ib;
877: PetscInt nonzerorow=0;
880: VecCopy(yy,zz);
881: VecGetArrayRead(xx,&x);
882: VecGetArray(zz,&z);
884: v = a->a;
885: xb = x;
887: for (i=0; i<mbs; i++) {
888: n = ai[1] - ai[0]; /* length of i_th block row of A */
889: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
890: ib = aj + *ai;
891: jmin = 0;
892: nonzerorow += (n>0);
893: if (*ib == i) { /* (diag of A)*x */
894: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
895: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
896: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
897: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
898: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
899: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
900: v += 36; jmin++;
901: }
902: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
903: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
904: for (j=jmin; j<n; j++) {
905: /* (strict lower triangular part of A)*x */
906: cval = ib[j]*6;
907: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
908: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
909: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
910: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
911: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
912: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
913: /* (strict upper triangular part of A)*x */
914: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
915: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
916: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
917: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
918: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
919: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
920: v += 36;
921: }
922: xb +=6; ai++;
923: }
925: VecRestoreArrayRead(xx,&x);
926: VecRestoreArray(zz,&z);
928: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
929: return(0);
930: }
934: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
935: {
936: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
937: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7;
938: const PetscScalar *x,*xb;
939: const MatScalar *v;
940: PetscErrorCode ierr;
941: PetscInt mbs = a->mbs,i,n,cval,j,jmin;
942: const PetscInt *aj=a->j,*ai=a->i,*ib;
943: PetscInt nonzerorow=0;
946: VecCopy(yy,zz);
947: VecGetArrayRead(xx,&x);
948: VecGetArray(zz,&z);
950: v = a->a;
951: xb = x;
953: for (i=0; i<mbs; i++) {
954: n = ai[1] - ai[0]; /* length of i_th block row of A */
955: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
956: ib = aj + *ai;
957: jmin = 0;
958: nonzerorow += (n>0);
959: if (*ib == i) { /* (diag of A)*x */
960: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
961: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
962: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
963: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
964: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
965: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
966: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
967: v += 49; jmin++;
968: }
969: PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
970: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
971: for (j=jmin; j<n; j++) {
972: /* (strict lower triangular part of A)*x */
973: cval = ib[j]*7;
974: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
975: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
976: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
977: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
978: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
979: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
980: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
981: /* (strict upper triangular part of A)*x */
982: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
983: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
984: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
985: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
986: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
987: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
988: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
989: v += 49;
990: }
991: xb +=7; ai++;
992: }
994: VecRestoreArrayRead(xx,&x);
995: VecRestoreArray(zz,&z);
997: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
998: return(0);
999: }
1003: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1004: {
1005: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1006: PetscScalar *z,*z_ptr=0,*zb,*work,*workt;
1007: const PetscScalar *x,*x_ptr,*xb;
1008: const MatScalar *v;
1009: PetscErrorCode ierr;
1010: PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1011: const PetscInt *idx,*aj,*ii;
1012: PetscInt nonzerorow=0;
1015: VecCopy(yy,zz);
1016: VecGetArrayRead(xx,&x); x_ptr=x;
1017: VecGetArray(zz,&z); z_ptr=z;
1019: aj = a->j;
1020: v = a->a;
1021: ii = a->i;
1023: if (!a->mult_work) {
1024: PetscMalloc1(A->rmap->n+1,&a->mult_work);
1025: }
1026: work = a->mult_work;
1029: for (i=0; i<mbs; i++) {
1030: n = ii[1] - ii[0]; ncols = n*bs;
1031: workt = work; idx=aj+ii[0];
1032: nonzerorow += (n>0);
1034: /* upper triangular part */
1035: for (j=0; j<n; j++) {
1036: xb = x_ptr + bs*(*idx++);
1037: for (k=0; k<bs; k++) workt[k] = xb[k];
1038: workt += bs;
1039: }
1040: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1041: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1043: /* strict lower triangular part */
1044: idx = aj+ii[0];
1045: if (*idx == i) {
1046: ncols -= bs; v += bs2; idx++; n--;
1047: }
1048: if (ncols > 0) {
1049: workt = work;
1050: PetscMemzero(workt,ncols*sizeof(PetscScalar));
1051: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1052: for (j=0; j<n; j++) {
1053: zb = z_ptr + bs*(*idx++);
1054: for (k=0; k<bs; k++) zb[k] += workt[k];
1055: workt += bs;
1056: }
1057: }
1059: x += bs; v += n*bs2; z += bs; ii++;
1060: }
1062: VecRestoreArrayRead(xx,&x);
1063: VecRestoreArray(zz,&z);
1065: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1066: return(0);
1067: }
1071: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1072: {
1073: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1074: PetscScalar oalpha = alpha;
1076: PetscBLASInt one = 1,totalnz;
1079: PetscBLASIntCast(a->bs2*a->nz,&totalnz);
1080: PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1081: PetscLogFlops(totalnz);
1082: return(0);
1083: }
1087: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1088: {
1089: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1090: const MatScalar *v = a->a;
1091: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1092: PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1093: PetscErrorCode ierr;
1094: const PetscInt *aj=a->j,*col;
1097: if (type == NORM_FROBENIUS) {
1098: for (k=0; k<mbs; k++) {
1099: jmin = a->i[k]; jmax = a->i[k+1];
1100: col = aj + jmin;
1101: if (*col == k) { /* diagonal block */
1102: for (i=0; i<bs2; i++) {
1103: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1104: }
1105: jmin++;
1106: }
1107: for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */
1108: for (i=0; i<bs2; i++) {
1109: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1110: }
1111: }
1112: }
1113: *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1114: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1115: PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);
1116: for (i=0; i<mbs; i++) jl[i] = mbs;
1117: il[0] = 0;
1119: *norm = 0.0;
1120: for (k=0; k<mbs; k++) { /* k_th block row */
1121: for (j=0; j<bs; j++) sum[j]=0.0;
1122: /*-- col sum --*/
1123: i = jl[k]; /* first |A(i,k)| to be added */
1124: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1125: at step k */
1126: while (i<mbs) {
1127: nexti = jl[i]; /* next block row to be added */
1128: ik = il[i]; /* block index of A(i,k) in the array a */
1129: for (j=0; j<bs; j++) {
1130: v = a->a + ik*bs2 + j*bs;
1131: for (k1=0; k1<bs; k1++) {
1132: sum[j] += PetscAbsScalar(*v); v++;
1133: }
1134: }
1135: /* update il, jl */
1136: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1137: jmax = a->i[i+1];
1138: if (jmin < jmax) {
1139: il[i] = jmin;
1140: j = a->j[jmin];
1141: jl[i] = jl[j]; jl[j]=i;
1142: }
1143: i = nexti;
1144: }
1145: /*-- row sum --*/
1146: jmin = a->i[k]; jmax = a->i[k+1];
1147: for (i=jmin; i<jmax; i++) {
1148: for (j=0; j<bs; j++) {
1149: v = a->a + i*bs2 + j;
1150: for (k1=0; k1<bs; k1++) {
1151: sum[j] += PetscAbsScalar(*v); v += bs;
1152: }
1153: }
1154: }
1155: /* add k_th block row to il, jl */
1156: col = aj+jmin;
1157: if (*col == k) jmin++;
1158: if (jmin < jmax) {
1159: il[k] = jmin;
1160: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1161: }
1162: for (j=0; j<bs; j++) {
1163: if (sum[j] > *norm) *norm = sum[j];
1164: }
1165: }
1166: PetscFree3(sum,il,jl);
1167: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1168: return(0);
1169: }
1173: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1174: {
1175: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1179: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1180: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1181: *flg = PETSC_FALSE;
1182: return(0);
1183: }
1185: /* if the a->i are the same */
1186: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1187: if (!*flg) return(0);
1189: /* if a->j are the same */
1190: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1191: if (!*flg) return(0);
1193: /* if a->a are the same */
1194: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);
1195: return(0);
1196: }
1200: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1201: {
1202: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1203: PetscErrorCode ierr;
1204: PetscInt i,j,k,row,bs,ambs,bs2;
1205: const PetscInt *ai,*aj;
1206: PetscScalar *x,zero = 0.0;
1207: const MatScalar *aa,*aa_j;
1210: bs = A->rmap->bs;
1211: if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1213: aa = a->a;
1214: ambs = a->mbs;
1216: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1217: PetscInt *diag=a->diag;
1218: aa = a->a;
1219: ambs = a->mbs;
1220: VecGetArray(v,&x);
1221: for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1222: VecRestoreArray(v,&x);
1223: return(0);
1224: }
1226: ai = a->i;
1227: aj = a->j;
1228: bs2 = a->bs2;
1229: VecSet(v,zero);
1230: VecGetArray(v,&x);
1231: for (i=0; i<ambs; i++) {
1232: j=ai[i];
1233: if (aj[j] == i) { /* if this is a diagonal element */
1234: row = i*bs;
1235: aa_j = aa + j*bs2;
1236: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1237: }
1238: }
1239: VecRestoreArray(v,&x);
1240: return(0);
1241: }
1245: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1246: {
1247: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1248: PetscScalar x;
1249: const PetscScalar *l,*li,*ri;
1250: MatScalar *aa,*v;
1251: PetscErrorCode ierr;
1252: PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1253: PetscBool flg;
1256: if (ll != rr) {
1257: VecEqual(ll,rr,&flg);
1258: if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1259: }
1260: if (!ll) return(0);
1261: ai = a->i;
1262: aj = a->j;
1263: aa = a->a;
1264: m = A->rmap->N;
1265: bs = A->rmap->bs;
1266: mbs = a->mbs;
1267: bs2 = a->bs2;
1269: VecGetArrayRead(ll,&l);
1270: VecGetLocalSize(ll,&lm);
1271: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1272: for (i=0; i<mbs; i++) { /* for each block row */
1273: M = ai[i+1] - ai[i];
1274: li = l + i*bs;
1275: v = aa + bs2*ai[i];
1276: for (j=0; j<M; j++) { /* for each block */
1277: ri = l + bs*aj[ai[i]+j];
1278: for (k=0; k<bs; k++) {
1279: x = ri[k];
1280: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1281: }
1282: }
1283: }
1284: VecRestoreArrayRead(ll,&l);
1285: PetscLogFlops(2.0*a->nz);
1286: return(0);
1287: }
1291: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1292: {
1293: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1296: info->block_size = a->bs2;
1297: info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1298: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1299: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1300: info->assemblies = A->num_ass;
1301: info->mallocs = A->info.mallocs;
1302: info->memory = ((PetscObject)A)->mem;
1303: if (A->factortype) {
1304: info->fill_ratio_given = A->info.fill_ratio_given;
1305: info->fill_ratio_needed = A->info.fill_ratio_needed;
1306: info->factor_mallocs = A->info.factor_mallocs;
1307: } else {
1308: info->fill_ratio_given = 0;
1309: info->fill_ratio_needed = 0;
1310: info->factor_mallocs = 0;
1311: }
1312: return(0);
1313: }
1318: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1319: {
1320: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1324: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1325: return(0);
1326: }
1330: /*
1331: This code does not work since it only checks the upper triangular part of
1332: the matrix. Hence it is not listed in the function table.
1333: */
1334: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1335: {
1336: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1337: PetscErrorCode ierr;
1338: PetscInt i,j,n,row,col,bs,mbs;
1339: const PetscInt *ai,*aj;
1340: PetscReal atmp;
1341: const MatScalar *aa;
1342: PetscScalar *x;
1343: PetscInt ncols,brow,bcol,krow,kcol;
1346: if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1347: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1348: bs = A->rmap->bs;
1349: aa = a->a;
1350: ai = a->i;
1351: aj = a->j;
1352: mbs = a->mbs;
1354: VecSet(v,0.0);
1355: VecGetArray(v,&x);
1356: VecGetLocalSize(v,&n);
1357: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1358: for (i=0; i<mbs; i++) {
1359: ncols = ai[1] - ai[0]; ai++;
1360: brow = bs*i;
1361: for (j=0; j<ncols; j++) {
1362: bcol = bs*(*aj);
1363: for (kcol=0; kcol<bs; kcol++) {
1364: col = bcol + kcol; /* col index */
1365: for (krow=0; krow<bs; krow++) {
1366: atmp = PetscAbsScalar(*aa); aa++;
1367: row = brow + krow; /* row index */
1368: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1369: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1370: }
1371: }
1372: aj++;
1373: }
1374: }
1375: VecRestoreArray(v,&x);
1376: return(0);
1377: }