Actual source code: baij2.c
petsc-3.5.4 2015-05-23
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc-private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
6: #if defined(PETSC_THREADCOMM_ACTIVE)
7: #include <petscthreadcomm.h>
8: #endif
12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
17: const PetscInt *idx;
18: PetscInt start,end,*ai,*aj,bs,*nidx2;
19: PetscBT table;
22: m = a->mbs;
23: ai = a->i;
24: aj = a->j;
25: bs = A->rmap->bs;
27: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29: PetscBTCreate(m,&table);
30: PetscMalloc1((m+1),&nidx);
31: PetscMalloc1((A->rmap->N+1),&nidx2);
33: for (i=0; i<is_max; i++) {
34: /* Initialise the two local arrays */
35: isz = 0;
36: PetscBTMemzero(m,table);
38: /* Extract the indices, assume there can be duplicate entries */
39: ISGetIndices(is[i],&idx);
40: ISGetLocalSize(is[i],&n);
42: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43: for (j=0; j<n; ++j) {
44: ival = idx[j]/bs; /* convert the indices into block indices */
45: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47: }
48: ISRestoreIndices(is[i],&idx);
49: ISDestroy(&is[i]);
51: k = 0;
52: for (j=0; j<ov; j++) { /* for each overlap*/
53: n = isz;
54: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
55: row = nidx[k];
56: start = ai[row];
57: end = ai[row+1];
58: for (l = start; l<end; l++) {
59: val = aj[l];
60: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61: }
62: }
63: }
64: /* expand the Index Set */
65: for (j=0; j<isz; j++) {
66: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67: }
68: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
69: }
70: PetscBTDestroy(&table);
71: PetscFree(nidx);
72: PetscFree(nidx2);
73: return(0);
74: }
78: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
79: {
80: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
82: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
83: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
84: const PetscInt *irow,*icol;
85: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
86: PetscInt *aj = a->j,*ai = a->i;
87: MatScalar *mat_a;
88: Mat C;
89: PetscBool flag,sorted;
92: ISSorted(iscol,&sorted);
93: if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
95: ISGetIndices(isrow,&irow);
96: ISGetIndices(iscol,&icol);
97: ISGetLocalSize(isrow,&nrows);
98: ISGetLocalSize(iscol,&ncols);
100: PetscCalloc1((1+oldcols),&smap);
101: ssmap = smap;
102: PetscMalloc1((1+nrows),&lens);
103: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104: /* determine lens of each row */
105: for (i=0; i<nrows; i++) {
106: kstart = ai[irow[i]];
107: kend = kstart + a->ilen[irow[i]];
108: lens[i] = 0;
109: for (k=kstart; k<kend; k++) {
110: if (ssmap[aj[k]]) lens[i]++;
111: }
112: }
113: /* Create and fill new matrix */
114: if (scall == MAT_REUSE_MATRIX) {
115: c = (Mat_SeqBAIJ*)((*B)->data);
117: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
118: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
119: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
120: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
121: C = *B;
122: } else {
123: MatCreate(PetscObjectComm((PetscObject)A),&C);
124: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
125: MatSetType(C,((PetscObject)A)->type_name);
126: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
127: }
128: c = (Mat_SeqBAIJ*)(C->data);
129: for (i=0; i<nrows; i++) {
130: row = irow[i];
131: kstart = ai[row];
132: kend = kstart + a->ilen[row];
133: mat_i = c->i[i];
134: mat_j = c->j + mat_i;
135: mat_a = c->a + mat_i*bs2;
136: mat_ilen = c->ilen + i;
137: for (k=kstart; k<kend; k++) {
138: if ((tcol=ssmap[a->j[k]])) {
139: *mat_j++ = tcol - 1;
140: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
141: mat_a += bs2;
142: (*mat_ilen)++;
143: }
144: }
145: }
147: /* Free work space */
148: ISRestoreIndices(iscol,&icol);
149: PetscFree(smap);
150: PetscFree(lens);
151: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
152: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
154: ISRestoreIndices(isrow,&irow);
155: *B = C;
156: return(0);
157: }
161: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
162: {
163: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
164: IS is1,is2;
166: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
167: const PetscInt *irow,*icol;
170: ISGetIndices(isrow,&irow);
171: ISGetIndices(iscol,&icol);
172: ISGetLocalSize(isrow,&nrows);
173: ISGetLocalSize(iscol,&ncols);
175: /* Verify if the indices corespond to each element in a block
176: and form the IS with compressed IS */
177: PetscMalloc2(a->mbs,&vary,a->mbs,&iary);
178: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
179: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
180: count = 0;
181: for (i=0; i<a->mbs; i++) {
182: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
183: if (vary[i]==bs) iary[count++] = i;
184: }
185: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
187: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
188: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
189: count = 0;
190: for (i=0; i<a->mbs; i++) {
191: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
192: if (vary[i]==bs) iary[count++] = i;
193: }
194: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
195: ISRestoreIndices(isrow,&irow);
196: ISRestoreIndices(iscol,&icol);
197: PetscFree2(vary,iary);
199: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
200: ISDestroy(&is1);
201: ISDestroy(&is2);
202: return(0);
203: }
207: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
208: {
210: PetscInt i;
213: if (scall == MAT_INITIAL_MATRIX) {
214: PetscMalloc1((n+1),B);
215: }
217: for (i=0; i<n; i++) {
218: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
219: }
220: return(0);
221: }
224: /* -------------------------------------------------------*/
225: /* Should check that shapes of vectors and matrices match */
226: /* -------------------------------------------------------*/
228: #if defined(PETSC_THREADCOMM_ACTIVE)
229: PetscErrorCode MatMult_SeqBAIJ_1_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
230: {
231: PetscErrorCode ierr;
232: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
233: PetscScalar *z;
234: const PetscScalar *x;
235: const MatScalar *aa;
236: PetscInt *trstarts=A->rmap->trstarts;
237: PetscInt n,start,end,i;
238: const PetscInt *aj,*ai;
239: PetscScalar sum;
241: VecGetArrayRead(xx,&x);
242: VecGetArray(zz,&z);
243: start = trstarts[thread_id];
244: end = trstarts[thread_id+1];
245: ai = a->i;
246: for (i=start; i<end; i++) {
247: n = ai[i+1] - ai[i];
248: aj = a->j + ai[i];
249: aa = a->a + ai[i];
250: sum = 0.0;
251: PetscSparseDensePlusDot(sum,x,aa,aj,n);
252: z[i] = sum;
253: }
254: VecRestoreArrayRead(xx,&x);
255: VecRestoreArray(zz,&z);
256: return 0;
257: }
261: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
262: {
263: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
264: PetscScalar *z,sum;
265: const PetscScalar *x;
266: const MatScalar *v;
267: PetscErrorCode ierr;
268: PetscInt mbs,i,n;
269: const PetscInt *idx,*ii,*ridx=NULL;
270: PetscBool usecprow=a->compressedrow.use;
274: if (usecprow) {
275: VecGetArrayRead(xx,&x);
276: VecGetArray(zz,&z);
277: mbs = a->compressedrow.nrows;
278: ii = a->compressedrow.i;
279: ridx = a->compressedrow.rindex;
280: PetscMemzero(z,mbs*sizeof(PetscScalar));
281: for (i=0; i<mbs; i++) {
282: n = ii[i+1] - ii[i];
283: v = a->a + ii[i];
284: idx = a->j + ii[i];
285: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
286: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
287: sum = 0.0;
288: PetscSparseDensePlusDot(sum,x,v,idx,n);
289: z[ridx[i]] = sum;
290: }
291: VecRestoreArrayRead(xx,&x);
292: VecRestoreArray(zz,&z);
293: } else {
294: PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_1_Kernel,3,A,xx,zz);
295: }
296: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
297: return(0);
298: }
299: #else
302: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
303: {
304: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
305: PetscScalar *z,sum;
306: const PetscScalar *x;
307: const MatScalar *v;
308: PetscErrorCode ierr;
309: PetscInt mbs,i,n;
310: const PetscInt *idx,*ii,*ridx=NULL;
311: PetscBool usecprow=a->compressedrow.use;
314: VecGetArrayRead(xx,&x);
315: VecGetArray(zz,&z);
317: if (usecprow) {
318: mbs = a->compressedrow.nrows;
319: ii = a->compressedrow.i;
320: ridx = a->compressedrow.rindex;
321: PetscMemzero(z,mbs*sizeof(PetscScalar));
322: } else {
323: mbs = a->mbs;
324: ii = a->i;
325: }
327: for (i=0; i<mbs; i++) {
328: n = ii[1] - ii[0];
329: v = a->a + ii[0];
330: idx = a->j + ii[0];
331: ii++;
332: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
333: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
334: sum = 0.0;
335: PetscSparseDensePlusDot(sum,x,v,idx,n);
336: if (usecprow) {
337: z[ridx[i]] = sum;
338: } else {
339: z[i] = sum;
340: }
341: }
342: VecRestoreArrayRead(xx,&x);
343: VecRestoreArray(zz,&z);
344: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
345: return(0);
346: }
347: #endif
349: #if defined(PETSC_THREADCOMM_ACTIVE)
350: PetscErrorCode MatMult_SeqBAIJ_2_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
351: {
352: PetscErrorCode ierr;
353: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
354: PetscScalar *z,x1,x2,sum1,sum2;
355: const PetscScalar *x,*xb;
356: const MatScalar *aa;
357: PetscInt *trstarts=A->rmap->trstarts;
358: PetscInt n,start,end,i,j;
359: const PetscInt *aj,*ai;
361: VecGetArrayRead(xx,&x);
362: VecGetArray(zz,&z);
363: start = trstarts[thread_id] / 2;
364: end = trstarts[thread_id+1] / 2;
365: ai = a->i;
366: for (i=start; i<end; i++) {
367: n = ai[i+1] - ai[i];
368: aj = a->j + ai[i];
369: aa = a->a + ai[i]*4;
370: sum1 = 0.0; sum2 = 0.0;
371: for (j=0; j<n; j++) {
372: xb = x + 2*aj[j]; x1 = xb[0]; x2 = xb[1];
373: sum1 += aa[4*j]*x1 + aa[4*j+2]*x2;
374: sum2 += aa[4*j+1]*x1 + aa[4*j+3]*x2;
375: }
376: z[2*i] = sum1; z[2*i+1] = sum2;
377: }
378: VecRestoreArrayRead(xx,&x);
379: VecRestoreArray(zz,&z);
380: return 0;
381: }
385: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
386: {
387: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
388: PetscScalar *z,x1,x2,sum1,sum2;
389: const PetscScalar *x,*xb;
390: const MatScalar *v;
391: PetscErrorCode ierr;
392: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
393: PetscBool usecprow=a->compressedrow.use;
397: if (usecprow) {
398: VecGetArrayRead(xx,&x);
399: VecGetArray(zz,&z);
400: mbs = a->compressedrow.nrows;
401: ii = a->compressedrow.i;
402: ridx = a->compressedrow.rindex;
403: for (i=0; i<mbs; i++) {
404: n = ii[i+1] - ii[i];
405: idx = a->j + ii[i];
406: v = a->a + ii[i]*4;
407: sum1 = 0.0; sum2 = 0.0;
408: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
409: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410: for (j=0; j<n; j++) {
411: xb = x + 2*idx[j]; x1 = xb[0]; x2 = xb[1];
412: sum1 += v[4*j]*x1 + v[4*j+2]*x2;
413: sum2 += v[4*j+1]*x1 + v[4*j+3]*x2;
414: }
415: z[2*ridx[i]] = sum1; z[2*ridx[i]+1] = sum2;
416: }
417: VecRestoreArrayRead(xx,&x);
418: VecRestoreArray(zz,&z);
419: } else {
420: PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_2_Kernel,3,A,xx,zz);
421: }
422: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
423: return(0);
424: }
425: #else
428: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
429: {
430: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
431: PetscScalar *z = 0,sum1,sum2,*zarray;
432: const PetscScalar *x,*xb;
433: PetscScalar x1,x2;
434: const MatScalar *v;
435: PetscErrorCode ierr;
436: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
437: PetscBool usecprow=a->compressedrow.use;
440: VecGetArrayRead(xx,&x);
441: VecGetArray(zz,&zarray);
443: idx = a->j;
444: v = a->a;
445: if (usecprow) {
446: mbs = a->compressedrow.nrows;
447: ii = a->compressedrow.i;
448: ridx = a->compressedrow.rindex;
449: } else {
450: mbs = a->mbs;
451: ii = a->i;
452: z = zarray;
453: }
455: for (i=0; i<mbs; i++) {
456: n = ii[1] - ii[0]; ii++;
457: sum1 = 0.0; sum2 = 0.0;
458: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
459: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
460: for (j=0; j<n; j++) {
461: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
462: sum1 += v[0]*x1 + v[2]*x2;
463: sum2 += v[1]*x1 + v[3]*x2;
464: v += 4;
465: }
466: if (usecprow) z = zarray + 2*ridx[i];
467: z[0] = sum1; z[1] = sum2;
468: if (!usecprow) z += 2;
469: }
470: VecRestoreArrayRead(xx,&x);
471: VecRestoreArray(zz,&zarray);
472: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
473: return(0);
474: }
475: #endif
477: #if defined(PETSC_THREADCOMM_ACTIVE)
478: PetscErrorCode MatMult_SeqBAIJ_3_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
479: {
480: PetscErrorCode ierr;
481: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
482: PetscScalar *z,x1,x2,x3,sum1,sum2,sum3;
483: const PetscScalar *x,*xb;
484: const MatScalar *aa;
485: PetscInt *trstarts=A->rmap->trstarts;
486: PetscInt n,start,end,i,j;
487: const PetscInt *aj,*ai;
489: VecGetArrayRead(xx,&x);
490: VecGetArray(zz,&z);
491: start = trstarts[thread_id] / 3;
492: end = trstarts[thread_id+1] / 3;
493: ai = a->i;
494: for (i=start; i<end; i++) {
495: n = ai[i+1] - ai[i];
496: aj = a->j + ai[i];
497: aa = a->a + ai[i]*9;
498: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
499: for (j=0; j<n; j++) {
500: xb = x + 3*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
501: sum1 += aa[9*j]*x1 + aa[9*j+3]*x2 + aa[9*j+6]*x3;
502: sum2 += aa[9*j+1]*x1 + aa[9*j+4]*x2 + aa[9*j+7]*x3;
503: sum3 += aa[9*j+2]*x1 + aa[9*j+5]*x2 + aa[9*j+8]*x3;
504: }
505: z[3*i] = sum1; z[3*i+1] = sum2; z[3*i+2] = sum3;
506: }
507: VecRestoreArrayRead(xx,&x);
508: VecRestoreArray(zz,&z);
509: return 0;
510: }
514: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
515: {
516: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
517: PetscScalar *z,sum1,sum2,sum3,x1,x2,x3;
518: const PetscScalar *x,*xb;
519: const MatScalar *v;
520: PetscErrorCode ierr;
521: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
522: PetscBool usecprow=a->compressedrow.use;
525: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
526: #pragma disjoint(*v,*z,*xb)
527: #endif
531: if (usecprow) {
532: VecGetArrayRead(xx,&x);
533: VecGetArray(zz,&z);
534: mbs = a->compressedrow.nrows;
535: ii = a->compressedrow.i;
536: ridx = a->compressedrow.rindex;
537: for (i=0; i<mbs; i++) {
538: n = ii[i+1] - ii[i];
539: idx = a->j + ii[i];
540: v = a->a + ii[i]*9;
541: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
542: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
543: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
544: for (j=0; j<n; j++) {
545: xb = x + 3*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
546: sum1 += v[9*j]*x1 + v[9*j+3]*x2 + v[9*j+6]*x3;
547: sum2 += v[9*j+1]*x1 + v[9*j+4]*x2 + v[9*j+7]*x3;
548: sum3 += v[9*j+2]*x1 + v[9*j+5]*x2 + v[9*j+8]*x3;
549: }
550: z[3*ridx[i]] = sum1; z[3*ridx[i]+1] = sum2; z[3*ridx[i]+2] = sum3;
551: }
552: VecRestoreArrayRead(xx,&x);
553: VecRestoreArray(zz,&z);
554: } else {
555: PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_3_Kernel,3,A,xx,zz);
556: }
557: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
558: return(0);
559: }
560: #else
563: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
564: {
565: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
566: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
567: const PetscScalar *x,*xb;
568: const MatScalar *v;
569: PetscErrorCode ierr;
570: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
571: PetscBool usecprow=a->compressedrow.use;
574: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
575: #pragma disjoint(*v,*z,*xb)
576: #endif
579: VecGetArrayRead(xx,&x);
580: VecGetArray(zz,&zarray);
582: idx = a->j;
583: v = a->a;
584: if (usecprow) {
585: mbs = a->compressedrow.nrows;
586: ii = a->compressedrow.i;
587: ridx = a->compressedrow.rindex;
588: } else {
589: mbs = a->mbs;
590: ii = a->i;
591: z = zarray;
592: }
594: for (i=0; i<mbs; i++) {
595: n = ii[1] - ii[0]; ii++;
596: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
597: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
598: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
599: for (j=0; j<n; j++) {
600: xb = x + 3*(*idx++);
601: x1 = xb[0];
602: x2 = xb[1];
603: x3 = xb[2];
605: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
606: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
607: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
608: v += 9;
609: }
610: if (usecprow) z = zarray + 3*ridx[i];
611: z[0] = sum1; z[1] = sum2; z[2] = sum3;
612: if (!usecprow) z += 3;
613: }
614: VecRestoreArrayRead(xx,&x);
615: VecRestoreArray(zz,&zarray);
616: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
617: return(0);
618: }
619: #endif
621: #if defined(PETSC_THREADCOMM_ACTIVE)
622: PetscErrorCode MatMult_SeqBAIJ_4_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
623: {
624: PetscErrorCode ierr;
625: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
626: PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
627: const PetscScalar *x,*xb;
628: const MatScalar *aa;
629: PetscInt *trstarts=A->rmap->trstarts;
630: PetscInt n,start,end,i,j;
631: const PetscInt *aj,*ai;
633: VecGetArrayRead(xx,&x);
634: VecGetArray(zz,&z);
635: start = trstarts[thread_id] / 4;
636: end = trstarts[thread_id+1] / 4;
637: ai = a->i;
638: for (i=start; i<end; i++) {
639: n = ai[i+1] - ai[i];
640: aj = a->j + ai[i];
641: aa = a->a + ai[i]*16;
642: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
643: for (j=0; j<n; j++) {
644: xb = x + 4*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
645: sum1 += aa[16*j]*x1 + aa[16*j+4]*x2 + aa[16*j+8]*x3 + aa[16*j+12]*x4;
646: sum2 += aa[16*j+1]*x1 + aa[16*j+5]*x2 + aa[16*j+9]*x3 + aa[16*j+13]*x4;
647: sum3 += aa[16*j+2]*x1 + aa[16*j+6]*x2 + aa[16*j+10]*x3 + aa[16*j+14]*x4;
648: sum4 += aa[16*j+3]*x1 + aa[16*j+7]*x2 + aa[16*j+11]*x3 + aa[16*j+15]*x4;
649: }
650: z[4*i] = sum1; z[4*i+1] = sum2;
651: z[4*i+2] = sum3; z[4*i+3] = sum4;
652: }
653: VecRestoreArrayRead(xx,&x);
654: VecRestoreArray(zz,&z);
655: return 0;
656: }
660: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
661: {
662: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
663: PetscScalar *z,x1,x2,x3,x4,sum1,sum2,sum3,sum4;
664: const PetscScalar *x,*xb;
665: const MatScalar *v;
666: PetscErrorCode ierr;
667: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
668: PetscBool usecprow=a->compressedrow.use;
672: if (usecprow) {
673: VecGetArrayRead(xx,&x);
674: VecGetArray(zz,&z);
675: mbs = a->compressedrow.nrows;
676: ii = a->compressedrow.i;
677: ridx = a->compressedrow.rindex;
678: for (i=0; i<mbs; i++) {
679: n = ii[i+1] - ii[1];
680: idx = a->j + ii[i];
681: v = a->a + ii[i]*16;
682: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
683: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
684: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
685: for (j=0; j<n; j++) {
686: xb = x + 4*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
687: sum1 += v[16*j]*x1 + v[16*j+4]*x2 + v[16*j+8]*x3 + v[16*j+12]*x4;
688: sum2 += v[16*j+1]*x1 + v[16*j+5]*x2 + v[16*j+9]*x3 + v[16*j+13]*x4;
689: sum3 += v[16*j+2]*x1 + v[16*j+6]*x2 + v[16*j+10]*x3 + v[16*j+14]*x4;
690: sum4 += v[16*j+3]*x1 + v[16*j+7]*x2 + v[16*j+11]*x3 + v[16*j+15]*x4;
691: }
692: z[4*ridx[i]] = sum1; z[4*ridx[i]+1] = sum2;
693: z[4*ridx[i]+2] = sum3; z[4*ridx[i]+3] = sum4;
694: }
695: VecRestoreArrayRead(xx,&x);
696: VecRestoreArray(zz,&z);
697: } else {
698: PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_4_Kernel,3,A,xx,zz);
699: }
700: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
701: return(0);
702: }
703: #else
706: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
707: {
708: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
709: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
710: const PetscScalar *x,*xb;
711: const MatScalar *v;
712: PetscErrorCode ierr;
713: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
714: PetscBool usecprow=a->compressedrow.use;
717: VecGetArrayRead(xx,&x);
718: VecGetArray(zz,&zarray);
720: idx = a->j;
721: v = a->a;
722: if (usecprow) {
723: mbs = a->compressedrow.nrows;
724: ii = a->compressedrow.i;
725: ridx = a->compressedrow.rindex;
726: } else {
727: mbs = a->mbs;
728: ii = a->i;
729: z = zarray;
730: }
732: for (i=0; i<mbs; i++) {
733: n = ii[1] - ii[0];
734: ii++;
735: sum1 = 0.0;
736: sum2 = 0.0;
737: sum3 = 0.0;
738: sum4 = 0.0;
740: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
741: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
742: for (j=0; j<n; j++) {
743: xb = x + 4*(*idx++);
744: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
745: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
746: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
747: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
748: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
749: v += 16;
750: }
751: if (usecprow) z = zarray + 4*ridx[i];
752: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
753: if (!usecprow) z += 4;
754: }
755: VecRestoreArrayRead(xx,&x);
756: VecRestoreArray(zz,&zarray);
757: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
758: return(0);
759: }
760: #endif
762: #if defined(PETSC_THREADCOMM_ACTIVE)
763: PetscErrorCode MatMult_SeqBAIJ_5_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
764: {
765: PetscErrorCode ierr;
766: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
767: PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
768: const PetscScalar *x,*xb;
769: const MatScalar *aa;
770: PetscInt *trstarts=A->rmap->trstarts;
771: PetscInt n,start,end,i,j;
772: const PetscInt *aj,*ai;
774: VecGetArrayRead(xx,&x);
775: VecGetArray(zz,&z);
776: start = trstarts[thread_id] / 5;
777: end = trstarts[thread_id+1] / 5;
778: ai = a->i;
779: for (i=start; i<end; i++) {
780: n = ai[i+1] - ai[i];
781: aj = a->j + ai[i];
782: aa = a->a + ai[i]*25;
783: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
784: for (j=0; j<n; j++) {
785: xb = x + 5*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
786: sum1 += aa[25*j]*x1 + aa[25*j+5]*x2 + aa[25*j+10]*x3 + aa[25*j+15]*x4 + aa[25*j+20]*x5;
787: sum2 += aa[25*j+1]*x1 + aa[25*j+6]*x2 + aa[25*j+11]*x3 + aa[25*j+16]*x4 + aa[25*j+21]*x5;
788: sum3 += aa[25*j+2]*x1 + aa[25*j+7]*x2 + aa[25*j+12]*x3 + aa[25*j+17]*x4 + aa[25*j+22]*x5;
789: sum4 += aa[25*j+3]*x1 + aa[25*j+8]*x2 + aa[25*j+13]*x3 + aa[25*j+18]*x4 + aa[25*j+23]*x5;
790: sum5 += aa[25*j+4]*x1 + aa[25*j+9]*x2 + aa[25*j+14]*x3 + aa[25*j+19]*x4 + aa[25*j+24]*x5;
791: }
792: z[5*i] = sum1; z[5*i+1] = sum2; z[5*i+2] = sum3;
793: z[5*i+3] = sum4; z[5*i+4] = sum5;
794: }
795: VecRestoreArrayRead(xx,&x);
796: VecRestoreArray(zz,&z);
797: return 0;
798: }
802: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
803: {
804: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
805: PetscScalar *z,x1,x2,x3,x4,x5,sum1,sum2,sum3,sum4,sum5;
806: const PetscScalar *xb,*x;
807: const MatScalar *v;
808: PetscErrorCode ierr;
809: const PetscInt *idx,*ii,*ridx=NULL;
810: PetscInt mbs,i,j,n;
811: PetscBool usecprow=a->compressedrow.use;
815: if (usecprow) {
816: VecGetArrayRead(xx,&x);
817: VecGetArray(zz,&z);
818: mbs = a->compressedrow.nrows;
819: ii = a->compressedrow.i;
820: ridx = a->compressedrow.rindex;
821: for (i=0; i<mbs; i++) {
822: n = ii[i+1] - ii[i];
823: idx = a->j + ii[i];
824: v = a->a + ii[i]*25;
825: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
826: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
827: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
828: for (j=0; j<n; j++) {
829: xb = x + 5*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
830: sum1 += v[25*j]*x1 + v[25*j+5]*x2 + v[25*j+10]*x3 + v[25*j+15]*x4 + v[25*j+20]*x5;
831: sum2 += v[25*j+1]*x1 + v[25*j+6]*x2 + v[25*j+11]*x3 + v[25*j+16]*x4 + v[25*j+21]*x5;
832: sum3 += v[25*j+2]*x1 + v[25*j+7]*x2 + v[25*j+12]*x3 + v[25*j+17]*x4 + v[25*j+22]*x5;
833: sum4 += v[25*j+3]*x1 + v[25*j+8]*x2 + v[25*j+13]*x3 + v[25*j+18]*x4 + v[25*j+23]*x5;
834: sum5 += v[25*j+4]*x1 + v[25*j+9]*x2 + v[25*j+14]*x3 + v[25*j+19]*x4 + v[25*j+24]*x5;
835: }
836: z[5*ridx[i]] = sum1; z[5*ridx[i]+1] = sum2; z[5*ridx[i]+2] = sum3;
837: z[5*ridx[i]+3] = sum4; z[5*ridx[i]+4] = sum5;
838: }
839: VecRestoreArrayRead(xx,&x);
840: VecRestoreArray(zz,&z);
841: } else {
842: PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_5_Kernel,3,A,xx,zz);
843: }
844: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
845: return(0);
846: }
847: #else
850: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
851: {
852: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
853: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
854: const PetscScalar *xb,*x;
855: const MatScalar *v;
856: PetscErrorCode ierr;
857: const PetscInt *idx,*ii,*ridx=NULL;
858: PetscInt mbs,i,j,n;
859: PetscBool usecprow=a->compressedrow.use;
862: VecGetArrayRead(xx,&x);
863: VecGetArray(zz,&zarray);
865: idx = a->j;
866: v = a->a;
867: if (usecprow) {
868: mbs = a->compressedrow.nrows;
869: ii = a->compressedrow.i;
870: ridx = a->compressedrow.rindex;
871: } else {
872: mbs = a->mbs;
873: ii = a->i;
874: z = zarray;
875: }
877: for (i=0; i<mbs; i++) {
878: n = ii[1] - ii[0]; ii++;
879: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
880: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
881: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
882: for (j=0; j<n; j++) {
883: xb = x + 5*(*idx++);
884: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
885: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
886: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
887: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
888: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
889: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
890: v += 25;
891: }
892: if (usecprow) z = zarray + 5*ridx[i];
893: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
894: if (!usecprow) z += 5;
895: }
896: VecRestoreArrayRead(xx,&x);
897: VecRestoreArray(zz,&zarray);
898: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
899: return(0);
900: }
901: #endif
904: #if defined(PETSC_THREADCOMM_ACTIVE)
905: PetscErrorCode MatMult_SeqBAIJ_6_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
906: {
907: PetscErrorCode ierr;
908: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
909: PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
910: const PetscScalar *x,*xb;
911: const MatScalar *aa;
912: PetscInt *trstarts=A->rmap->trstarts;
913: PetscInt n,start,end,i,j;
914: const PetscInt *aj,*ai;
916: VecGetArrayRead(xx,&x);
917: VecGetArray(zz,&z);
918: start = trstarts[thread_id] / 6;
919: end = trstarts[thread_id+1] / 6;
920: ai = a->i;
921: for (i=start; i<end; i++) {
922: n = ai[i+1] - ai[i];
923: aj = a->j + ai[i];
924: aa = a->a + ai[i]*36;
925: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
926: for (j=0; j<n; j++) {
927: xb = x + 6*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
928: sum1 += aa[36*j]*x1 + aa[36*j+6]*x2 + aa[36*j+12]*x3 + aa[36*j+18]*x4 + aa[36*j+24]*x5 + aa[36*j+30]*x6;
929: sum2 += aa[36*j+1]*x1 + aa[36*j+7]*x2 + aa[36*j+13]*x3 + aa[36*j+19]*x4 + aa[36*j+25]*x5 + aa[36*j+31]*x6;
930: sum3 += aa[36*j+2]*x1 + aa[36*j+8]*x2 + aa[36*j+14]*x3 + aa[36*j+20]*x4 + aa[36*j+26]*x5 + aa[36*j+32]*x6;
931: sum4 += aa[36*j+3]*x1 + aa[36*j+9]*x2 + aa[36*j+15]*x3 + aa[36*j+21]*x4 + aa[36*j+27]*x5 + aa[36*j+33]*x6;
932: sum5 += aa[36*j+4]*x1 + aa[36*j+10]*x2 + aa[36*j+16]*x3 + aa[36*j+22]*x4 + aa[36*j+28]*x5 + aa[36*j+34]*x6;
933: sum6 += aa[36*j+5]*x1 + aa[36*j+11]*x2 + aa[36*j+17]*x3 + aa[36*j+23]*x4 + aa[36*j+29]*x5 + aa[36*j+35]*x6;
934: }
935: z[6*i] = sum1; z[6*i+1] = sum2; z[6*i+2] = sum3;
936: z[6*i+3] = sum4; z[6*i+4] = sum5; z[6*i+5] = sum6;
937: }
938: VecRestoreArrayRead(xx,&x);
939: VecRestoreArray(zz,&z);
940: return 0;
941: }
945: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
946: {
947: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
948: PetscScalar *z,x1,x2,x3,x4,x5,x6,sum1,sum2,sum3,sum4,sum5,sum6;
949: const PetscScalar *x,*xb;
950: const MatScalar *v;
951: PetscErrorCode ierr;
952: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
953: PetscBool usecprow=a->compressedrow.use;
957: if (usecprow) {
958: VecGetArrayRead(xx,&x);
959: VecGetArray(zz,&z);
960: mbs = a->compressedrow.nrows;
961: ii = a->compressedrow.i;
962: ridx = a->compressedrow.rindex;
963: for (i=0; i<mbs; i++) {
964: n = ii[i+1] - ii[i];
965: idx = a->j + ii[i];
966: v = a->a + ii[i]*36;
967: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
968: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
969: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
970: for (j=0; j<n; j++) {
971: xb = x + 6*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
972: sum1 += v[36*j]*x1 + v[36*j+6]*x2 + v[36*j+12]*x3 + v[36*j+18]*x4 + v[36*j+24]*x5 + v[36*j+30]*x6;
973: sum2 += v[36*j+1]*x1 + v[36*j+7]*x2 + v[36*j+13]*x3 + v[36*j+19]*x4 + v[36*j+25]*x5 + v[36*j+31]*x6;
974: sum3 += v[36*j+2]*x1 + v[36*j+8]*x2 + v[36*j+14]*x3 + v[36*j+20]*x4 + v[36*j+26]*x5 + v[36*j+32]*x6;
975: sum4 += v[36*j+3]*x1 + v[36*j+9]*x2 + v[36*j+15]*x3 + v[36*j+21]*x4 + v[36*j+27]*x5 + v[36*j+33]*x6;
976: sum5 += v[36*j+4]*x1 + v[36*j+10]*x2 + v[36*j+16]*x3 + v[36*j+22]*x4 + v[36*j+28]*x5 + v[36*j+34]*x6;
977: sum6 += v[36*j+5]*x1 + v[36*j+11]*x2 + v[36*j+17]*x3 + v[36*j+23]*x4 + v[36*j+29]*x5 + v[36*j+35]*x6;
978: }
979: z[6*ridx[i]] = sum1; z[6*ridx[i]+1] = sum2; z[6*ridx[i]+2] = sum3;
980: z[6*ridx[i]+3] = sum4; z[6*ridx[i]+4] = sum5; z[6*ridx[i]+5] = sum6;
981: }
982: VecRestoreArrayRead(xx,&x);
983: VecRestoreArray(zz,&z);
984: } else {
985: PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_6_Kernel,3,A,xx,zz);
986: }
987: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
988: return(0);
989: }
990: #else
993: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
994: {
995: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
996: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
997: const PetscScalar *x,*xb;
998: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
999: const MatScalar *v;
1000: PetscErrorCode ierr;
1001: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
1002: PetscBool usecprow=a->compressedrow.use;
1005: VecGetArrayRead(xx,&x);
1006: VecGetArray(zz,&zarray);
1008: idx = a->j;
1009: v = a->a;
1010: if (usecprow) {
1011: mbs = a->compressedrow.nrows;
1012: ii = a->compressedrow.i;
1013: ridx = a->compressedrow.rindex;
1014: } else {
1015: mbs = a->mbs;
1016: ii = a->i;
1017: z = zarray;
1018: }
1020: for (i=0; i<mbs; i++) {
1021: n = ii[1] - ii[0];
1022: ii++;
1023: sum1 = 0.0;
1024: sum2 = 0.0;
1025: sum3 = 0.0;
1026: sum4 = 0.0;
1027: sum5 = 0.0;
1028: sum6 = 0.0;
1030: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1031: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1032: for (j=0; j<n; j++) {
1033: xb = x + 6*(*idx++);
1034: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1035: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1036: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1037: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1038: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1039: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1040: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1041: v += 36;
1042: }
1043: if (usecprow) z = zarray + 6*ridx[i];
1044: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1045: if (!usecprow) z += 6;
1046: }
1048: VecRestoreArrayRead(xx,&x);
1049: VecRestoreArray(zz,&zarray);
1050: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
1051: return(0);
1052: }
1053: #endif
1055: #if defined(PETSC_THREADCOMM_ACTIVE)
1056: PetscErrorCode MatMult_SeqBAIJ_7_Kernel(PetscInt thread_id,Mat A,Vec xx,Vec zz)
1057: {
1058: PetscErrorCode ierr;
1059: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1060: PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1061: const PetscScalar *x,*xb;
1062: const MatScalar *aa;
1063: PetscInt *trstarts=A->rmap->trstarts;
1064: PetscInt n,start,end,i,j;
1065: const PetscInt *aj,*ai;
1067: VecGetArrayRead(xx,&x);
1068: VecGetArray(zz,&z);
1069: start = trstarts[thread_id] / 7;
1070: end = trstarts[thread_id+1] / 7;
1071: ai = a->i;
1072: for (i=start; i<end; i++) {
1073: n = ai[i+1] - ai[i];
1074: aj = a->j + ai[i];
1075: aa = a->a + ai[i]*49;
1076: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1077: for (j=0; j<n; j++) {
1078: xb = x + 7*aj[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1079: sum1 += aa[49*j]*x1 + aa[49*j+7]*x2 + aa[49*j+14]*x3 + aa[49*j+21]*x4 + aa[49*j+28]*x5 + aa[49*j+35]*x6 + aa[49*j+42]*x7;
1080: sum2 += aa[49*j+1]*x1 + aa[49*j+8]*x2 + aa[49*j+15]*x3 + aa[49*j+22]*x4 + aa[49*j+29]*x5 + aa[49*j+36]*x6 + aa[49*j+43]*x7;
1081: sum3 += aa[49*j+2]*x1 + aa[49*j+9]*x2 + aa[49*j+16]*x3 + aa[49*j+23]*x4 + aa[49*j+30]*x5 + aa[49*j+37]*x6 + aa[49*j+44]*x7;
1082: sum4 += aa[49*j+3]*x1 + aa[49*j+10]*x2 + aa[49*j+17]*x3 + aa[49*j+24]*x4 + aa[49*j+31]*x5 + aa[49*j+38]*x6 + aa[49*j+45]*x7;
1083: sum5 += aa[49*j+4]*x1 + aa[49*j+11]*x2 + aa[49*j+18]*x3 + aa[49*j+25]*x4 + aa[49*j+32]*x5 + aa[49*j+39]*x6 + aa[49*j+46]*x7;
1084: sum6 += aa[49*j+5]*x1 + aa[49*j+12]*x2 + aa[49*j+19]*x3 + aa[49*j+26]*x4 + aa[49*j+33]*x5 + aa[49*j+40]*x6 + aa[49*j+47]*x7;
1085: sum7 += aa[49*j+6]*x1 + aa[49*j+13]*x2 + aa[49*j+20]*x3 + aa[49*j+27]*x4 + aa[49*j+34]*x5 + aa[49*j+41]*x6 + aa[49*j+48]*x7;
1086: }
1087: z[7*i] = sum1; z[7*i+1] = sum2; z[7*i+2] = sum3; z[7*i+3] = sum4;
1088: z[7*i+4] = sum5; z[7*i+5] = sum6; z[7*i+6] = sum7;
1089: }
1090: VecRestoreArrayRead(xx,&x);
1091: VecRestoreArray(zz,&z);
1092: return 0;
1093: }
1097: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1098: {
1099: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1100: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1101: const PetscScalar *x,*xb;
1102: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
1103: const MatScalar *v;
1104: PetscErrorCode ierr;
1105: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
1106: PetscBool usecprow=a->compressedrow.use;
1110: if (usecprow) {
1111: VecGetArrayRead(xx,&x);
1112: VecGetArray(zz,&zarray);
1113: mbs = a->compressedrow.nrows;
1114: ii = a->compressedrow.i;
1115: ridx = a->compressedrow.rindex;
1116: for (i=0; i<mbs; i++) {
1117: n = ii[i+1] - ii[i];
1118: idx = a->j + ii[i];
1119: v = a->a + ii[i]*49;
1120: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1121: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1122: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1123: for (j=0; j<n; j++) {
1124: xb = x + 7*idx[j]; x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1125: sum1 += v[49*j]*x1 + v[49*j+7]*x2 + v[49*j+14]*x3 + v[49*j+21]*x4 + v[49*j+28]*x5 + v[49*j+35]*x6 + v[49*j+42]*x7;
1126: sum2 += v[49*j+1]*x1 + v[49*j+8]*x2 + v[49*j+15]*x3 + v[49*j+22]*x4 + v[49*j+29]*x5 + v[49*j+36]*x6 + v[49*j+43]*x7;
1127: sum3 += v[49*j+2]*x1 + v[49*j+9]*x2 + v[49*j+16]*x3 + v[49*j+23]*x4 + v[49*j+30]*x5 + v[49*j+37]*x6 + v[49*j+44]*x7;
1128: sum4 += v[49*j+3]*x1 + v[49*j+10]*x2 + v[49*j+17]*x3 + v[49*j+24]*x4 + v[49*j+31]*x5 + v[49*j+38]*x6 + v[49*j+45]*x7;
1129: sum5 += v[49*j+4]*x1 + v[49*j+11]*x2 + v[49*j+18]*x3 + v[49*j+25]*x4 + v[49*j+32]*x5 + v[49*j+39]*x6 + v[49*j+46]*x7;
1130: sum6 += v[49*j+5]*x1 + v[49*j+12]*x2 + v[49*j+19]*x3 + v[49*j+26]*x4 + v[49*j+33]*x5 + v[49*j+40]*x6 + v[49*j+47]*x7;
1131: sum7 += v[49*j+6]*x1 + v[49*j+13]*x2 + v[49*j+20]*x3 + v[49*j+27]*x4 + v[49*j+34]*x5 + v[49*j+41]*x6 + v[49*j+48]*x7;
1132: }
1133: z[7*ridx[i]] = sum1; z[7*ridx[i]+1] = sum2; z[7*ridx[i]+2] = sum3; z[7*ridx[i]+3] = sum4;
1134: z[7*ridx[i]+4] = sum5; z[7*ridx[i]+5] = sum6; z[7*ridx[i]+6] = sum7;
1135: }
1136: VecRestoreArrayRead(xx,&x);
1137: VecRestoreArray(zz,&zarray);
1138: } else {
1139: PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatMult_SeqBAIJ_7_Kernel,3,A,xx,zz);
1140: }
1141: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
1142: return(0);
1143: }
1144: #else
1147: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
1148: {
1149: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1150: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1151: const PetscScalar *x,*xb;
1152: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
1153: const MatScalar *v;
1154: PetscErrorCode ierr;
1155: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
1156: PetscBool usecprow=a->compressedrow.use;
1159: VecGetArrayRead(xx,&x);
1160: VecGetArray(zz,&zarray);
1162: idx = a->j;
1163: v = a->a;
1164: if (usecprow) {
1165: mbs = a->compressedrow.nrows;
1166: ii = a->compressedrow.i;
1167: ridx = a->compressedrow.rindex;
1168: } else {
1169: mbs = a->mbs;
1170: ii = a->i;
1171: z = zarray;
1172: }
1174: for (i=0; i<mbs; i++) {
1175: n = ii[1] - ii[0];
1176: ii++;
1177: sum1 = 0.0;
1178: sum2 = 0.0;
1179: sum3 = 0.0;
1180: sum4 = 0.0;
1181: sum5 = 0.0;
1182: sum6 = 0.0;
1183: sum7 = 0.0;
1185: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1186: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1187: for (j=0; j<n; j++) {
1188: xb = x + 7*(*idx++);
1189: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1190: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1191: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1192: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1193: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1194: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1195: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1196: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1197: v += 49;
1198: }
1199: if (usecprow) z = zarray + 7*ridx[i];
1200: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1201: if (!usecprow) z += 7;
1202: }
1204: VecRestoreArrayRead(xx,&x);
1205: VecRestoreArray(zz,&zarray);
1206: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
1207: return(0);
1208: }
1209: #endif
1211: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1212: /* Default MatMult for block size 15 */
1216: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1217: {
1218: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1219: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1220: const PetscScalar *x,*xb;
1221: PetscScalar *zarray,xv;
1222: const MatScalar *v;
1223: PetscErrorCode ierr;
1224: const PetscInt *ii,*ij=a->j,*idx;
1225: PetscInt mbs,i,j,k,n,*ridx=NULL;
1226: PetscBool usecprow=a->compressedrow.use;
1229: VecGetArrayRead(xx,&x);
1230: VecGetArray(zz,&zarray);
1232: v = a->a;
1233: if (usecprow) {
1234: mbs = a->compressedrow.nrows;
1235: ii = a->compressedrow.i;
1236: ridx = a->compressedrow.rindex;
1237: } else {
1238: mbs = a->mbs;
1239: ii = a->i;
1240: z = zarray;
1241: }
1243: for (i=0; i<mbs; i++) {
1244: n = ii[i+1] - ii[i];
1245: idx = ij + ii[i];
1246: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1247: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1249: for (j=0; j<n; j++) {
1250: xb = x + 15*(idx[j]);
1252: for (k=0; k<15; k++) {
1253: xv = xb[k];
1254: sum1 += v[0]*xv;
1255: sum2 += v[1]*xv;
1256: sum3 += v[2]*xv;
1257: sum4 += v[3]*xv;
1258: sum5 += v[4]*xv;
1259: sum6 += v[5]*xv;
1260: sum7 += v[6]*xv;
1261: sum8 += v[7]*xv;
1262: sum9 += v[8]*xv;
1263: sum10 += v[9]*xv;
1264: sum11 += v[10]*xv;
1265: sum12 += v[11]*xv;
1266: sum13 += v[12]*xv;
1267: sum14 += v[13]*xv;
1268: sum15 += v[14]*xv;
1269: v += 15;
1270: }
1271: }
1272: if (usecprow) z = zarray + 15*ridx[i];
1273: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1274: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1276: if (!usecprow) z += 15;
1277: }
1279: VecRestoreArrayRead(xx,&x);
1280: VecRestoreArray(zz,&zarray);
1281: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1282: return(0);
1283: }
1285: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1288: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1289: {
1290: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1291: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1292: const PetscScalar *x,*xb;
1293: PetscScalar x1,x2,x3,x4,*zarray;
1294: const MatScalar *v;
1295: PetscErrorCode ierr;
1296: const PetscInt *ii,*ij=a->j,*idx;
1297: PetscInt mbs,i,j,n,*ridx=NULL;
1298: PetscBool usecprow=a->compressedrow.use;
1301: VecGetArrayRead(xx,&x);
1302: VecGetArray(zz,&zarray);
1304: v = a->a;
1305: if (usecprow) {
1306: mbs = a->compressedrow.nrows;
1307: ii = a->compressedrow.i;
1308: ridx = a->compressedrow.rindex;
1309: } else {
1310: mbs = a->mbs;
1311: ii = a->i;
1312: z = zarray;
1313: }
1315: for (i=0; i<mbs; i++) {
1316: n = ii[i+1] - ii[i];
1317: idx = ij + ii[i];
1318: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1319: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1321: for (j=0; j<n; j++) {
1322: xb = x + 15*(idx[j]);
1323: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1325: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1326: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1327: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1328: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1329: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1330: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1331: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1332: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1333: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1334: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1335: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1336: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1337: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1338: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1339: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1341: v += 60;
1343: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1345: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1346: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1347: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1348: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1349: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1350: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1351: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1352: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1353: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1354: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1355: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1356: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1357: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1358: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1359: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1360: v += 60;
1362: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1363: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1364: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1365: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1366: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1367: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1368: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1369: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1370: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1371: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1372: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1373: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1374: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1375: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1376: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1377: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1378: v += 60;
1380: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1381: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1382: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1383: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1384: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1385: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1386: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1387: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1388: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1389: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1390: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1391: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1392: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1393: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1394: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1395: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1396: v += 45;
1397: }
1398: if (usecprow) z = zarray + 15*ridx[i];
1399: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1400: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1402: if (!usecprow) z += 15;
1403: }
1405: VecRestoreArrayRead(xx,&x);
1406: VecRestoreArray(zz,&zarray);
1407: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1408: return(0);
1409: }
1411: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1414: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1415: {
1416: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1417: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1418: const PetscScalar *x,*xb;
1419: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1420: const MatScalar *v;
1421: PetscErrorCode ierr;
1422: const PetscInt *ii,*ij=a->j,*idx;
1423: PetscInt mbs,i,j,n,*ridx=NULL;
1424: PetscBool usecprow=a->compressedrow.use;
1427: VecGetArrayRead(xx,&x);
1428: VecGetArray(zz,&zarray);
1430: v = a->a;
1431: if (usecprow) {
1432: mbs = a->compressedrow.nrows;
1433: ii = a->compressedrow.i;
1434: ridx = a->compressedrow.rindex;
1435: } else {
1436: mbs = a->mbs;
1437: ii = a->i;
1438: z = zarray;
1439: }
1441: for (i=0; i<mbs; i++) {
1442: n = ii[i+1] - ii[i];
1443: idx = ij + ii[i];
1444: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1445: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1447: for (j=0; j<n; j++) {
1448: xb = x + 15*(idx[j]);
1449: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1450: x8 = xb[7];
1452: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1453: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1454: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1455: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1456: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1457: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1458: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1459: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1460: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1461: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1462: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1463: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1464: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1465: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1466: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1467: v += 120;
1469: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1471: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1472: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1473: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1474: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1475: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1476: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1477: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1478: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1479: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1480: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1481: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1482: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1483: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1484: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1485: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1486: v += 105;
1487: }
1488: if (usecprow) z = zarray + 15*ridx[i];
1489: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1490: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1492: if (!usecprow) z += 15;
1493: }
1495: VecRestoreArrayRead(xx,&x);
1496: VecRestoreArray(zz,&zarray);
1497: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1498: return(0);
1499: }
1501: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1505: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1506: {
1507: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1508: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1509: const PetscScalar *x,*xb;
1510: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1511: const MatScalar *v;
1512: PetscErrorCode ierr;
1513: const PetscInt *ii,*ij=a->j,*idx;
1514: PetscInt mbs,i,j,n,*ridx=NULL;
1515: PetscBool usecprow=a->compressedrow.use;
1518: VecGetArrayRead(xx,&x);
1519: VecGetArray(zz,&zarray);
1521: v = a->a;
1522: if (usecprow) {
1523: mbs = a->compressedrow.nrows;
1524: ii = a->compressedrow.i;
1525: ridx = a->compressedrow.rindex;
1526: } else {
1527: mbs = a->mbs;
1528: ii = a->i;
1529: z = zarray;
1530: }
1532: for (i=0; i<mbs; i++) {
1533: n = ii[i+1] - ii[i];
1534: idx = ij + ii[i];
1535: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1536: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1538: for (j=0; j<n; j++) {
1539: xb = x + 15*(idx[j]);
1540: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1541: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1543: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1544: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1545: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1546: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1547: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1548: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1549: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1550: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1551: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1552: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1553: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1554: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1555: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1556: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1557: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1558: v += 225;
1559: }
1560: if (usecprow) z = zarray + 15*ridx[i];
1561: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1562: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1564: if (!usecprow) z += 15;
1565: }
1567: VecRestoreArrayRead(xx,&x);
1568: VecRestoreArray(zz,&zarray);
1569: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1570: return(0);
1571: }
1574: /*
1575: This will not work with MatScalar == float because it calls the BLAS
1576: */
1579: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1580: {
1581: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1582: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
1583: MatScalar *v;
1585: PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1586: PetscInt ncols,k,*ridx=NULL;
1587: PetscBool usecprow=a->compressedrow.use;
1590: VecGetArray(xx,&x);
1591: VecGetArray(zz,&zarray);
1593: idx = a->j;
1594: v = a->a;
1595: if (usecprow) {
1596: mbs = a->compressedrow.nrows;
1597: ii = a->compressedrow.i;
1598: ridx = a->compressedrow.rindex;
1599: } else {
1600: mbs = a->mbs;
1601: ii = a->i;
1602: z = zarray;
1603: }
1605: if (!a->mult_work) {
1606: k = PetscMax(A->rmap->n,A->cmap->n);
1607: PetscMalloc1((k+1),&a->mult_work);
1608: }
1609: work = a->mult_work;
1610: for (i=0; i<mbs; i++) {
1611: n = ii[1] - ii[0]; ii++;
1612: ncols = n*bs;
1613: workt = work;
1614: for (j=0; j<n; j++) {
1615: xb = x + bs*(*idx++);
1616: for (k=0; k<bs; k++) workt[k] = xb[k];
1617: workt += bs;
1618: }
1619: if (usecprow) z = zarray + bs*ridx[i];
1620: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1621: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1622: v += n*bs2;
1623: if (!usecprow) z += bs;
1624: }
1625: VecRestoreArray(xx,&x);
1626: VecRestoreArray(zz,&zarray);
1627: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1628: return(0);
1629: }
1633: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1634: {
1635: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1636: const PetscScalar *x;
1637: PetscScalar *y,*z,sum;
1638: const MatScalar *v;
1639: PetscErrorCode ierr;
1640: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1641: const PetscInt *idx,*ii;
1642: PetscBool usecprow=a->compressedrow.use;
1645: VecGetArrayRead(xx,&x);
1646: VecGetArray(yy,&y);
1647: if (zz != yy) {
1648: VecGetArray(zz,&z);
1649: } else {
1650: z = y;
1651: }
1653: idx = a->j;
1654: v = a->a;
1655: if (usecprow) {
1656: if (zz != yy) {
1657: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1658: }
1659: mbs = a->compressedrow.nrows;
1660: ii = a->compressedrow.i;
1661: ridx = a->compressedrow.rindex;
1662: } else {
1663: ii = a->i;
1664: }
1666: for (i=0; i<mbs; i++) {
1667: n = ii[1] - ii[0];
1668: ii++;
1669: if (!usecprow) {
1670: sum = y[i];
1671: } else {
1672: sum = y[ridx[i]];
1673: }
1674: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1675: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1676: PetscSparseDensePlusDot(sum,x,v,idx,n);
1677: v += n;
1678: idx += n;
1679: if (usecprow) {
1680: z[ridx[i]] = sum;
1681: } else {
1682: z[i] = sum;
1683: }
1684: }
1685: VecRestoreArrayRead(xx,&x);
1686: VecRestoreArray(yy,&y);
1687: if (zz != yy) {
1688: VecRestoreArray(zz,&z);
1689: }
1690: PetscLogFlops(2.0*a->nz);
1691: return(0);
1692: }
1696: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1697: {
1698: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1699: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2;
1700: PetscScalar x1,x2,*yarray,*zarray;
1701: MatScalar *v;
1703: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1704: PetscBool usecprow=a->compressedrow.use;
1707: VecGetArray(xx,&x);
1708: VecGetArray(yy,&yarray);
1709: if (zz != yy) {
1710: VecGetArray(zz,&zarray);
1711: } else {
1712: zarray = yarray;
1713: }
1715: idx = a->j;
1716: v = a->a;
1717: if (usecprow) {
1718: if (zz != yy) {
1719: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1720: }
1721: mbs = a->compressedrow.nrows;
1722: ii = a->compressedrow.i;
1723: ridx = a->compressedrow.rindex;
1724: if (zz != yy) {
1725: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1726: }
1727: } else {
1728: ii = a->i;
1729: y = yarray;
1730: z = zarray;
1731: }
1733: for (i=0; i<mbs; i++) {
1734: n = ii[1] - ii[0]; ii++;
1735: if (usecprow) {
1736: z = zarray + 2*ridx[i];
1737: y = yarray + 2*ridx[i];
1738: }
1739: sum1 = y[0]; sum2 = y[1];
1740: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1741: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1742: for (j=0; j<n; j++) {
1743: xb = x + 2*(*idx++);
1744: x1 = xb[0];
1745: x2 = xb[1];
1747: sum1 += v[0]*x1 + v[2]*x2;
1748: sum2 += v[1]*x1 + v[3]*x2;
1749: v += 4;
1750: }
1751: z[0] = sum1; z[1] = sum2;
1752: if (!usecprow) {
1753: z += 2; y += 2;
1754: }
1755: }
1756: VecRestoreArray(xx,&x);
1757: VecRestoreArray(yy,&yarray);
1758: if (zz != yy) {
1759: VecRestoreArray(zz,&zarray);
1760: }
1761: PetscLogFlops(4.0*a->nz);
1762: return(0);
1763: }
1767: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1768: {
1769: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1770: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1771: MatScalar *v;
1773: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1774: PetscBool usecprow=a->compressedrow.use;
1777: VecGetArray(xx,&x);
1778: VecGetArray(yy,&yarray);
1779: if (zz != yy) {
1780: VecGetArray(zz,&zarray);
1781: } else {
1782: zarray = yarray;
1783: }
1785: idx = a->j;
1786: v = a->a;
1787: if (usecprow) {
1788: if (zz != yy) {
1789: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1790: }
1791: mbs = a->compressedrow.nrows;
1792: ii = a->compressedrow.i;
1793: ridx = a->compressedrow.rindex;
1794: } else {
1795: ii = a->i;
1796: y = yarray;
1797: z = zarray;
1798: }
1800: for (i=0; i<mbs; i++) {
1801: n = ii[1] - ii[0]; ii++;
1802: if (usecprow) {
1803: z = zarray + 3*ridx[i];
1804: y = yarray + 3*ridx[i];
1805: }
1806: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1807: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1808: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1809: for (j=0; j<n; j++) {
1810: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1811: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1812: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1813: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1814: v += 9;
1815: }
1816: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1817: if (!usecprow) {
1818: z += 3; y += 3;
1819: }
1820: }
1821: VecRestoreArray(xx,&x);
1822: VecRestoreArray(yy,&yarray);
1823: if (zz != yy) {
1824: VecRestoreArray(zz,&zarray);
1825: }
1826: PetscLogFlops(18.0*a->nz);
1827: return(0);
1828: }
1832: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1833: {
1834: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1835: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1836: MatScalar *v;
1838: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1839: PetscBool usecprow=a->compressedrow.use;
1842: VecGetArray(xx,&x);
1843: VecGetArray(yy,&yarray);
1844: if (zz != yy) {
1845: VecGetArray(zz,&zarray);
1846: } else {
1847: zarray = yarray;
1848: }
1850: idx = a->j;
1851: v = a->a;
1852: if (usecprow) {
1853: if (zz != yy) {
1854: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1855: }
1856: mbs = a->compressedrow.nrows;
1857: ii = a->compressedrow.i;
1858: ridx = a->compressedrow.rindex;
1859: } else {
1860: ii = a->i;
1861: y = yarray;
1862: z = zarray;
1863: }
1865: for (i=0; i<mbs; i++) {
1866: n = ii[1] - ii[0]; ii++;
1867: if (usecprow) {
1868: z = zarray + 4*ridx[i];
1869: y = yarray + 4*ridx[i];
1870: }
1871: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1872: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1873: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1874: for (j=0; j<n; j++) {
1875: xb = x + 4*(*idx++);
1876: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1877: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1878: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1879: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1880: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1881: v += 16;
1882: }
1883: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1884: if (!usecprow) {
1885: z += 4; y += 4;
1886: }
1887: }
1888: VecRestoreArray(xx,&x);
1889: VecRestoreArray(yy,&yarray);
1890: if (zz != yy) {
1891: VecRestoreArray(zz,&zarray);
1892: }
1893: PetscLogFlops(32.0*a->nz);
1894: return(0);
1895: }
1899: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1900: {
1901: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1902: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1903: PetscScalar *yarray,*zarray;
1904: MatScalar *v;
1906: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1907: PetscBool usecprow=a->compressedrow.use;
1910: VecGetArray(xx,&x);
1911: VecGetArray(yy,&yarray);
1912: if (zz != yy) {
1913: VecGetArray(zz,&zarray);
1914: } else {
1915: zarray = yarray;
1916: }
1918: idx = a->j;
1919: v = a->a;
1920: if (usecprow) {
1921: if (zz != yy) {
1922: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1923: }
1924: mbs = a->compressedrow.nrows;
1925: ii = a->compressedrow.i;
1926: ridx = a->compressedrow.rindex;
1927: } else {
1928: ii = a->i;
1929: y = yarray;
1930: z = zarray;
1931: }
1933: for (i=0; i<mbs; i++) {
1934: n = ii[1] - ii[0]; ii++;
1935: if (usecprow) {
1936: z = zarray + 5*ridx[i];
1937: y = yarray + 5*ridx[i];
1938: }
1939: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1940: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1941: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1942: for (j=0; j<n; j++) {
1943: xb = x + 5*(*idx++);
1944: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1945: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1946: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1947: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1948: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1949: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1950: v += 25;
1951: }
1952: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1953: if (!usecprow) {
1954: z += 5; y += 5;
1955: }
1956: }
1957: VecRestoreArray(xx,&x);
1958: VecRestoreArray(yy,&yarray);
1959: if (zz != yy) {
1960: VecRestoreArray(zz,&zarray);
1961: }
1962: PetscLogFlops(50.0*a->nz);
1963: return(0);
1964: }
1967: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1968: {
1969: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1970: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1971: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1972: MatScalar *v;
1974: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
1975: PetscBool usecprow=a->compressedrow.use;
1978: VecGetArray(xx,&x);
1979: VecGetArray(yy,&yarray);
1980: if (zz != yy) {
1981: VecGetArray(zz,&zarray);
1982: } else {
1983: zarray = yarray;
1984: }
1986: idx = a->j;
1987: v = a->a;
1988: if (usecprow) {
1989: if (zz != yy) {
1990: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1991: }
1992: mbs = a->compressedrow.nrows;
1993: ii = a->compressedrow.i;
1994: ridx = a->compressedrow.rindex;
1995: } else {
1996: ii = a->i;
1997: y = yarray;
1998: z = zarray;
1999: }
2001: for (i=0; i<mbs; i++) {
2002: n = ii[1] - ii[0]; ii++;
2003: if (usecprow) {
2004: z = zarray + 6*ridx[i];
2005: y = yarray + 6*ridx[i];
2006: }
2007: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2008: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2009: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2010: for (j=0; j<n; j++) {
2011: xb = x + 6*(*idx++);
2012: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2013: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2014: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2015: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2016: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2017: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2018: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2019: v += 36;
2020: }
2021: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2022: if (!usecprow) {
2023: z += 6; y += 6;
2024: }
2025: }
2026: VecRestoreArray(xx,&x);
2027: VecRestoreArray(yy,&yarray);
2028: if (zz != yy) {
2029: VecRestoreArray(zz,&zarray);
2030: }
2031: PetscLogFlops(72.0*a->nz);
2032: return(0);
2033: }
2037: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2038: {
2039: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2040: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2041: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2042: MatScalar *v;
2044: PetscInt mbs =a->mbs,i,*idx,*ii,j,n,*ridx=NULL;
2045: PetscBool usecprow=a->compressedrow.use;
2048: VecGetArray(xx,&x);
2049: VecGetArray(yy,&yarray);
2050: if (zz != yy) {
2051: VecGetArray(zz,&zarray);
2052: } else {
2053: zarray = yarray;
2054: }
2056: idx = a->j;
2057: v = a->a;
2058: if (usecprow) {
2059: if (zz != yy) {
2060: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
2061: }
2062: mbs = a->compressedrow.nrows;
2063: ii = a->compressedrow.i;
2064: ridx = a->compressedrow.rindex;
2065: } else {
2066: ii = a->i;
2067: y = yarray;
2068: z = zarray;
2069: }
2071: for (i=0; i<mbs; i++) {
2072: n = ii[1] - ii[0]; ii++;
2073: if (usecprow) {
2074: z = zarray + 7*ridx[i];
2075: y = yarray + 7*ridx[i];
2076: }
2077: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2078: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2079: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2080: for (j=0; j<n; j++) {
2081: xb = x + 7*(*idx++);
2082: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2083: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2084: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2085: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2086: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2087: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2088: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2089: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2090: v += 49;
2091: }
2092: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2093: if (!usecprow) {
2094: z += 7; y += 7;
2095: }
2096: }
2097: VecRestoreArray(xx,&x);
2098: VecRestoreArray(yy,&yarray);
2099: if (zz != yy) {
2100: VecRestoreArray(zz,&zarray);
2101: }
2102: PetscLogFlops(98.0*a->nz);
2103: return(0);
2104: }
2108: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2109: {
2110: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2111: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
2112: MatScalar *v;
2114: PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
2115: PetscInt ncols,k,*ridx=NULL;
2116: PetscBool usecprow=a->compressedrow.use;
2119: VecCopy(yy,zz);
2120: VecGetArray(xx,&x);
2121: VecGetArray(zz,&zarray);
2123: idx = a->j;
2124: v = a->a;
2125: if (usecprow) {
2126: mbs = a->compressedrow.nrows;
2127: ii = a->compressedrow.i;
2128: ridx = a->compressedrow.rindex;
2129: } else {
2130: mbs = a->mbs;
2131: ii = a->i;
2132: z = zarray;
2133: }
2135: if (!a->mult_work) {
2136: k = PetscMax(A->rmap->n,A->cmap->n);
2137: PetscMalloc1((k+1),&a->mult_work);
2138: }
2139: work = a->mult_work;
2140: for (i=0; i<mbs; i++) {
2141: n = ii[1] - ii[0]; ii++;
2142: ncols = n*bs;
2143: workt = work;
2144: for (j=0; j<n; j++) {
2145: xb = x + bs*(*idx++);
2146: for (k=0; k<bs; k++) workt[k] = xb[k];
2147: workt += bs;
2148: }
2149: if (usecprow) z = zarray + bs*ridx[i];
2150: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2151: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
2152: v += n*bs2;
2153: if (!usecprow) z += bs;
2154: }
2155: VecRestoreArray(xx,&x);
2156: VecRestoreArray(zz,&zarray);
2157: PetscLogFlops(2.0*a->nz*bs2);
2158: return(0);
2159: }
2163: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2164: {
2165: PetscScalar zero = 0.0;
2169: VecSet(zz,zero);
2170: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2171: return(0);
2172: }
2176: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2177: {
2178: PetscScalar zero = 0.0;
2182: VecSet(zz,zero);
2183: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2184: return(0);
2185: }
2189: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2190: {
2191: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2192: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2193: MatScalar *v;
2194: PetscErrorCode ierr;
2195: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2196: Mat_CompressedRow cprow = a->compressedrow;
2197: PetscBool usecprow=cprow.use;
2200: if (yy != zz) { VecCopy(yy,zz); }
2201: VecGetArray(xx,&x);
2202: VecGetArray(zz,&z);
2204: idx = a->j;
2205: v = a->a;
2206: if (usecprow) {
2207: mbs = cprow.nrows;
2208: ii = cprow.i;
2209: ridx = cprow.rindex;
2210: } else {
2211: mbs=a->mbs;
2212: ii = a->i;
2213: xb = x;
2214: }
2216: switch (bs) {
2217: case 1:
2218: for (i=0; i<mbs; i++) {
2219: if (usecprow) xb = x + ridx[i];
2220: x1 = xb[0];
2221: ib = idx + ii[0];
2222: n = ii[1] - ii[0]; ii++;
2223: for (j=0; j<n; j++) {
2224: rval = ib[j];
2225: z[rval] += PetscConj(*v) * x1;
2226: v++;
2227: }
2228: if (!usecprow) xb++;
2229: }
2230: break;
2231: case 2:
2232: for (i=0; i<mbs; i++) {
2233: if (usecprow) xb = x + 2*ridx[i];
2234: x1 = xb[0]; x2 = xb[1];
2235: ib = idx + ii[0];
2236: n = ii[1] - ii[0]; ii++;
2237: for (j=0; j<n; j++) {
2238: rval = ib[j]*2;
2239: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2240: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2241: v += 4;
2242: }
2243: if (!usecprow) xb += 2;
2244: }
2245: break;
2246: case 3:
2247: for (i=0; i<mbs; i++) {
2248: if (usecprow) xb = x + 3*ridx[i];
2249: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2250: ib = idx + ii[0];
2251: n = ii[1] - ii[0]; ii++;
2252: for (j=0; j<n; j++) {
2253: rval = ib[j]*3;
2254: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2255: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2256: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2257: v += 9;
2258: }
2259: if (!usecprow) xb += 3;
2260: }
2261: break;
2262: case 4:
2263: for (i=0; i<mbs; i++) {
2264: if (usecprow) xb = x + 4*ridx[i];
2265: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2266: ib = idx + ii[0];
2267: n = ii[1] - ii[0]; ii++;
2268: for (j=0; j<n; j++) {
2269: rval = ib[j]*4;
2270: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2271: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2272: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2273: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2274: v += 16;
2275: }
2276: if (!usecprow) xb += 4;
2277: }
2278: break;
2279: case 5:
2280: for (i=0; i<mbs; i++) {
2281: if (usecprow) xb = x + 5*ridx[i];
2282: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2283: x4 = xb[3]; x5 = xb[4];
2284: ib = idx + ii[0];
2285: n = ii[1] - ii[0]; ii++;
2286: for (j=0; j<n; j++) {
2287: rval = ib[j]*5;
2288: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2289: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2290: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2291: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2292: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2293: v += 25;
2294: }
2295: if (!usecprow) xb += 5;
2296: }
2297: break;
2298: default: { /* block sizes larger than 5 by 5 are handled by BLAS */
2299: PetscInt ncols,k;
2300: PetscScalar *work,*workt,*xtmp;
2302: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2303: if (!a->mult_work) {
2304: k = PetscMax(A->rmap->n,A->cmap->n);
2305: PetscMalloc1((k+1),&a->mult_work);
2306: }
2307: work = a->mult_work;
2308: xtmp = x;
2309: for (i=0; i<mbs; i++) {
2310: n = ii[1] - ii[0]; ii++;
2311: ncols = n*bs;
2312: PetscMemzero(work,ncols*sizeof(PetscScalar));
2313: if (usecprow) xtmp = x + bs*ridx[i];
2314: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2315: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2316: v += n*bs2;
2317: if (!usecprow) xtmp += bs;
2318: workt = work;
2319: for (j=0; j<n; j++) {
2320: zb = z + bs*(*idx++);
2321: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2322: workt += bs;
2323: }
2324: }
2325: }
2326: }
2327: VecRestoreArray(xx,&x);
2328: VecRestoreArray(zz,&z);
2329: PetscLogFlops(2.0*a->nz*a->bs2);
2330: return(0);
2331: }
2335: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2336: {
2337: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2338: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
2339: MatScalar *v;
2340: PetscErrorCode ierr;
2341: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=NULL;
2342: Mat_CompressedRow cprow = a->compressedrow;
2343: PetscBool usecprow=cprow.use;
2346: if (yy != zz) { VecCopy(yy,zz); }
2347: VecGetArray(xx,&x);
2348: VecGetArray(zz,&z);
2350: idx = a->j;
2351: v = a->a;
2352: if (usecprow) {
2353: mbs = cprow.nrows;
2354: ii = cprow.i;
2355: ridx = cprow.rindex;
2356: } else {
2357: mbs=a->mbs;
2358: ii = a->i;
2359: xb = x;
2360: }
2362: switch (bs) {
2363: case 1:
2364: for (i=0; i<mbs; i++) {
2365: if (usecprow) xb = x + ridx[i];
2366: x1 = xb[0];
2367: ib = idx + ii[0];
2368: n = ii[1] - ii[0]; ii++;
2369: for (j=0; j<n; j++) {
2370: rval = ib[j];
2371: z[rval] += *v * x1;
2372: v++;
2373: }
2374: if (!usecprow) xb++;
2375: }
2376: break;
2377: case 2:
2378: for (i=0; i<mbs; i++) {
2379: if (usecprow) xb = x + 2*ridx[i];
2380: x1 = xb[0]; x2 = xb[1];
2381: ib = idx + ii[0];
2382: n = ii[1] - ii[0]; ii++;
2383: for (j=0; j<n; j++) {
2384: rval = ib[j]*2;
2385: z[rval++] += v[0]*x1 + v[1]*x2;
2386: z[rval++] += v[2]*x1 + v[3]*x2;
2387: v += 4;
2388: }
2389: if (!usecprow) xb += 2;
2390: }
2391: break;
2392: case 3:
2393: for (i=0; i<mbs; i++) {
2394: if (usecprow) xb = x + 3*ridx[i];
2395: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2396: ib = idx + ii[0];
2397: n = ii[1] - ii[0]; ii++;
2398: for (j=0; j<n; j++) {
2399: rval = ib[j]*3;
2400: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2401: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2402: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2403: v += 9;
2404: }
2405: if (!usecprow) xb += 3;
2406: }
2407: break;
2408: case 4:
2409: for (i=0; i<mbs; i++) {
2410: if (usecprow) xb = x + 4*ridx[i];
2411: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2412: ib = idx + ii[0];
2413: n = ii[1] - ii[0]; ii++;
2414: for (j=0; j<n; j++) {
2415: rval = ib[j]*4;
2416: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2417: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2418: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2419: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2420: v += 16;
2421: }
2422: if (!usecprow) xb += 4;
2423: }
2424: break;
2425: case 5:
2426: for (i=0; i<mbs; i++) {
2427: if (usecprow) xb = x + 5*ridx[i];
2428: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2429: x4 = xb[3]; x5 = xb[4];
2430: ib = idx + ii[0];
2431: n = ii[1] - ii[0]; ii++;
2432: for (j=0; j<n; j++) {
2433: rval = ib[j]*5;
2434: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2435: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2436: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2437: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2438: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2439: v += 25;
2440: }
2441: if (!usecprow) xb += 5;
2442: }
2443: break;
2444: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2445: PetscInt ncols,k;
2446: PetscScalar *work,*workt,*xtmp;
2448: if (!a->mult_work) {
2449: k = PetscMax(A->rmap->n,A->cmap->n);
2450: PetscMalloc1((k+1),&a->mult_work);
2451: }
2452: work = a->mult_work;
2453: xtmp = x;
2454: for (i=0; i<mbs; i++) {
2455: n = ii[1] - ii[0]; ii++;
2456: ncols = n*bs;
2457: PetscMemzero(work,ncols*sizeof(PetscScalar));
2458: if (usecprow) xtmp = x + bs*ridx[i];
2459: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2460: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2461: v += n*bs2;
2462: if (!usecprow) xtmp += bs;
2463: workt = work;
2464: for (j=0; j<n; j++) {
2465: zb = z + bs*(*idx++);
2466: for (k=0; k<bs; k++) zb[k] += workt[k];
2467: workt += bs;
2468: }
2469: }
2470: }
2471: }
2472: VecRestoreArray(xx,&x);
2473: VecRestoreArray(zz,&z);
2474: PetscLogFlops(2.0*a->nz*a->bs2);
2475: return(0);
2476: }
2480: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2481: {
2482: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2483: PetscInt totalnz = a->bs2*a->nz;
2484: PetscScalar oalpha = alpha;
2486: PetscBLASInt one = 1,tnz;
2489: PetscBLASIntCast(totalnz,&tnz);
2490: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2491: PetscLogFlops(totalnz);
2492: return(0);
2493: }
2497: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2498: {
2500: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2501: MatScalar *v = a->a;
2502: PetscReal sum = 0.0;
2503: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2506: if (type == NORM_FROBENIUS) {
2507: for (i=0; i< bs2*nz; i++) {
2508: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2509: }
2510: *norm = PetscSqrtReal(sum);
2511: } else if (type == NORM_1) { /* maximum column sum */
2512: PetscReal *tmp;
2513: PetscInt *bcol = a->j;
2514: PetscCalloc1((A->cmap->n+1),&tmp);
2515: for (i=0; i<nz; i++) {
2516: for (j=0; j<bs; j++) {
2517: k1 = bs*(*bcol) + j; /* column index */
2518: for (k=0; k<bs; k++) {
2519: tmp[k1] += PetscAbsScalar(*v); v++;
2520: }
2521: }
2522: bcol++;
2523: }
2524: *norm = 0.0;
2525: for (j=0; j<A->cmap->n; j++) {
2526: if (tmp[j] > *norm) *norm = tmp[j];
2527: }
2528: PetscFree(tmp);
2529: } else if (type == NORM_INFINITY) { /* maximum row sum */
2530: *norm = 0.0;
2531: for (k=0; k<bs; k++) {
2532: for (j=0; j<a->mbs; j++) {
2533: v = a->a + bs2*a->i[j] + k;
2534: sum = 0.0;
2535: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2536: for (k1=0; k1<bs; k1++) {
2537: sum += PetscAbsScalar(*v);
2538: v += bs;
2539: }
2540: }
2541: if (sum > *norm) *norm = sum;
2542: }
2543: }
2544: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2545: return(0);
2546: }
2551: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2552: {
2553: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2557: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2558: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2559: *flg = PETSC_FALSE;
2560: return(0);
2561: }
2563: /* if the a->i are the same */
2564: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
2565: if (!*flg) return(0);
2567: /* if a->j are the same */
2568: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
2569: if (!*flg) return(0);
2571: /* if a->a are the same */
2572: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
2573: return(0);
2575: }
2579: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2580: {
2581: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2583: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2584: PetscScalar *x,zero = 0.0;
2585: MatScalar *aa,*aa_j;
2588: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2589: bs = A->rmap->bs;
2590: aa = a->a;
2591: ai = a->i;
2592: aj = a->j;
2593: ambs = a->mbs;
2594: bs2 = a->bs2;
2596: VecSet(v,zero);
2597: VecGetArray(v,&x);
2598: VecGetLocalSize(v,&n);
2599: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2600: for (i=0; i<ambs; i++) {
2601: for (j=ai[i]; j<ai[i+1]; j++) {
2602: if (aj[j] == i) {
2603: row = i*bs;
2604: aa_j = aa+j*bs2;
2605: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2606: break;
2607: }
2608: }
2609: }
2610: VecRestoreArray(v,&x);
2611: return(0);
2612: }
2616: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2617: {
2618: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2619: const PetscScalar *l,*r,*li,*ri;
2620: PetscScalar x;
2621: MatScalar *aa, *v;
2622: PetscErrorCode ierr;
2623: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2624: const PetscInt *ai,*aj;
2627: ai = a->i;
2628: aj = a->j;
2629: aa = a->a;
2630: m = A->rmap->n;
2631: n = A->cmap->n;
2632: bs = A->rmap->bs;
2633: mbs = a->mbs;
2634: bs2 = a->bs2;
2635: if (ll) {
2636: VecGetArrayRead(ll,&l);
2637: VecGetLocalSize(ll,&lm);
2638: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2639: for (i=0; i<mbs; i++) { /* for each block row */
2640: M = ai[i+1] - ai[i];
2641: li = l + i*bs;
2642: v = aa + bs2*ai[i];
2643: for (j=0; j<M; j++) { /* for each block */
2644: for (k=0; k<bs2; k++) {
2645: (*v++) *= li[k%bs];
2646: }
2647: }
2648: }
2649: VecRestoreArrayRead(ll,&l);
2650: PetscLogFlops(a->nz);
2651: }
2653: if (rr) {
2654: VecGetArrayRead(rr,&r);
2655: VecGetLocalSize(rr,&rn);
2656: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2657: for (i=0; i<mbs; i++) { /* for each block row */
2658: iai = ai[i];
2659: M = ai[i+1] - iai;
2660: v = aa + bs2*iai;
2661: for (j=0; j<M; j++) { /* for each block */
2662: ri = r + bs*aj[iai+j];
2663: for (k=0; k<bs; k++) {
2664: x = ri[k];
2665: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2666: v += bs;
2667: }
2668: }
2669: }
2670: VecRestoreArrayRead(rr,&r);
2671: PetscLogFlops(a->nz);
2672: }
2673: return(0);
2674: }
2679: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2680: {
2681: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2684: info->block_size = a->bs2;
2685: info->nz_allocated = a->bs2*a->maxnz;
2686: info->nz_used = a->bs2*a->nz;
2687: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2688: info->assemblies = A->num_ass;
2689: info->mallocs = A->info.mallocs;
2690: info->memory = ((PetscObject)A)->mem;
2691: if (A->factortype) {
2692: info->fill_ratio_given = A->info.fill_ratio_given;
2693: info->fill_ratio_needed = A->info.fill_ratio_needed;
2694: info->factor_mallocs = A->info.factor_mallocs;
2695: } else {
2696: info->fill_ratio_given = 0;
2697: info->fill_ratio_needed = 0;
2698: info->factor_mallocs = 0;
2699: }
2700: return(0);
2701: }
2704: #if defined(PETSC_THREADCOMM_ACTIVE)
2705: PetscErrorCode MatZeroEntries_SeqBAIJ_Kernel(PetscInt thread_id,Mat A)
2706: {
2708: PetscInt *trstarts=A->rmap->trstarts;
2709: PetscInt n,start,end;
2710: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2712: start = trstarts[thread_id];
2713: end = trstarts[thread_id+1];
2714: n = a->i[end] - a->i[start];
2715: PetscMemzero(a->a+a->bs2*a->i[start],a->bs2*n*sizeof(PetscScalar));
2716: return 0;
2717: }
2721: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2722: {
2725:
2727: PetscThreadCommRunKernel(PetscObjectComm((PetscObject)A),(PetscThreadKernel)MatZeroEntries_SeqBAIJ_Kernel,1,A);
2728: return(0);
2729: }
2730: #else
2733: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2734: {
2735: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2739: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2740: return(0);
2741: }
2742: #endif