Actual source code: baij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
8: #include <immintrin.h>
9: #endif
11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
12: {
13: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
14: PetscInt row, i, j, k, l, m, n, *nidx, isz, val, ival;
15: const PetscInt *idx;
16: PetscInt start, end, *ai, *aj, bs;
17: PetscBT table;
19: PetscFunctionBegin;
20: m = a->mbs;
21: ai = a->i;
22: aj = a->j;
23: bs = A->rmap->bs;
25: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
27: PetscCall(PetscBTCreate(m, &table));
28: PetscCall(PetscMalloc1(m + 1, &nidx));
30: for (i = 0; i < is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscCall(PetscBTMemzero(m, table));
35: /* Extract the indices, assume there can be duplicate entries */
36: PetscCall(ISGetIndices(is[i], &idx));
37: PetscCall(ISGetLocalSize(is[i], &n));
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j = 0; j < n; ++j) {
41: ival = idx[j] / bs; /* convert the indices into block indices */
42: PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
43: if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
44: }
45: PetscCall(ISRestoreIndices(is[i], &idx));
46: PetscCall(ISDestroy(&is[i]));
48: k = 0;
49: for (j = 0; j < ov; j++) { /* for each overlap*/
50: n = isz;
51: for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row + 1];
55: for (l = start; l < end; l++) {
56: val = aj[l];
57: if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
58: }
59: }
60: }
61: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
62: }
63: PetscCall(PetscBTDestroy(&table));
64: PetscCall(PetscFree(nidx));
65: PetscFunctionReturn(PETSC_SUCCESS);
66: }
68: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
69: {
70: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *c;
71: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
72: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
73: const PetscInt *irow, *icol;
74: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
75: PetscInt *aj = a->j, *ai = a->i;
76: MatScalar *mat_a;
77: Mat C;
78: PetscBool flag;
80: PetscFunctionBegin;
81: PetscCall(ISGetIndices(isrow, &irow));
82: PetscCall(ISGetIndices(iscol, &icol));
83: PetscCall(ISGetLocalSize(isrow, &nrows));
84: PetscCall(ISGetLocalSize(iscol, &ncols));
86: PetscCall(PetscCalloc1(1 + oldcols, &smap));
87: ssmap = smap;
88: PetscCall(PetscMalloc1(1 + nrows, &lens));
89: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
90: /* determine lens of each row */
91: for (i = 0; i < nrows; i++) {
92: kstart = ai[irow[i]];
93: kend = kstart + a->ilen[irow[i]];
94: lens[i] = 0;
95: for (k = kstart; k < kend; k++) {
96: if (ssmap[aj[k]]) lens[i]++;
97: }
98: }
99: /* Create and fill new matrix */
100: if (scall == MAT_REUSE_MATRIX) {
101: c = (Mat_SeqBAIJ *)((*B)->data);
103: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
104: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
105: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
106: PetscCall(PetscArrayzero(c->ilen, c->mbs));
107: C = *B;
108: } else {
109: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
110: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
111: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
112: PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
113: }
114: c = (Mat_SeqBAIJ *)(C->data);
115: for (i = 0; i < nrows; i++) {
116: row = irow[i];
117: kstart = ai[row];
118: kend = kstart + a->ilen[row];
119: mat_i = c->i[i];
120: mat_j = c->j ? c->j + mat_i : NULL; /* mustn't add to NULL, that is UB */
121: mat_a = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
122: mat_ilen = c->ilen + i;
123: for (k = kstart; k < kend; k++) {
124: if ((tcol = ssmap[a->j[k]])) {
125: *mat_j++ = tcol - 1;
126: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
127: mat_a += bs2;
128: (*mat_ilen)++;
129: }
130: }
131: }
132: /* sort */
133: if (c->j && c->a) {
134: MatScalar *work;
135: PetscCall(PetscMalloc1(bs2, &work));
136: for (i = 0; i < nrows; i++) {
137: PetscInt ilen;
138: mat_i = c->i[i];
139: mat_j = c->j + mat_i;
140: mat_a = c->a + mat_i * bs2;
141: ilen = c->ilen[i];
142: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
143: }
144: PetscCall(PetscFree(work));
145: }
147: /* Free work space */
148: PetscCall(ISRestoreIndices(iscol, &icol));
149: PetscCall(PetscFree(smap));
150: PetscCall(PetscFree(lens));
151: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
152: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
154: PetscCall(ISRestoreIndices(isrow, &irow));
155: *B = C;
156: PetscFunctionReturn(PETSC_SUCCESS);
157: }
159: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
160: {
161: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
162: IS is1, is2;
163: PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
164: const PetscInt *irow, *icol;
166: PetscFunctionBegin;
167: PetscCall(ISGetIndices(isrow, &irow));
168: PetscCall(ISGetIndices(iscol, &icol));
169: PetscCall(ISGetLocalSize(isrow, &nrows));
170: PetscCall(ISGetLocalSize(iscol, &ncols));
172: /* Verify if the indices correspond to each element in a block
173: and form the IS with compressed IS */
174: maxmnbs = PetscMax(a->mbs, a->nbs);
175: PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
176: PetscCall(PetscArrayzero(vary, a->mbs));
177: for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
178: for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
179: count = 0;
180: for (i = 0; i < nrows; i++) {
181: j = irow[i] / bs;
182: if ((vary[j]--) == bs) iary[count++] = j;
183: }
184: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
186: PetscCall(PetscArrayzero(vary, a->nbs));
187: for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
188: for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
189: count = 0;
190: for (i = 0; i < ncols; i++) {
191: j = icol[i] / bs;
192: if ((vary[j]--) == bs) iary[count++] = j;
193: }
194: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
195: PetscCall(ISRestoreIndices(isrow, &irow));
196: PetscCall(ISRestoreIndices(iscol, &icol));
197: PetscCall(PetscFree2(vary, iary));
199: PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
200: PetscCall(ISDestroy(&is1));
201: PetscCall(ISDestroy(&is2));
202: PetscFunctionReturn(PETSC_SUCCESS);
203: }
205: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
206: {
207: Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data;
208: Mat_SubSppt *submatj = c->submatis1;
210: PetscFunctionBegin;
211: PetscCall((*submatj->destroy)(C));
212: PetscCall(MatDestroySubMatrix_Private(submatj));
213: PetscFunctionReturn(PETSC_SUCCESS);
214: }
216: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
217: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
218: {
219: PetscInt i;
220: Mat C;
221: Mat_SeqBAIJ *c;
222: Mat_SubSppt *submatj;
224: PetscFunctionBegin;
225: for (i = 0; i < n; i++) {
226: C = (*mat)[i];
227: c = (Mat_SeqBAIJ *)C->data;
228: submatj = c->submatis1;
229: if (submatj) {
230: if (--((PetscObject)C)->refct <= 0) {
231: PetscCall(PetscFree(C->factorprefix));
232: PetscCall((*submatj->destroy)(C));
233: PetscCall(MatDestroySubMatrix_Private(submatj));
234: PetscCall(PetscFree(C->defaultvectype));
235: PetscCall(PetscFree(C->defaultrandtype));
236: PetscCall(PetscLayoutDestroy(&C->rmap));
237: PetscCall(PetscLayoutDestroy(&C->cmap));
238: PetscCall(PetscHeaderDestroy(&C));
239: }
240: } else {
241: PetscCall(MatDestroy(&C));
242: }
243: }
245: /* Destroy Dummy submatrices created for reuse */
246: PetscCall(MatDestroySubMatrices_Dummy(n, mat));
248: PetscCall(PetscFree(*mat));
249: PetscFunctionReturn(PETSC_SUCCESS);
250: }
252: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
253: {
254: PetscInt i;
256: PetscFunctionBegin;
257: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
259: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
260: PetscFunctionReturn(PETSC_SUCCESS);
261: }
263: /* Should check that shapes of vectors and matrices match */
264: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
265: {
266: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
267: PetscScalar *z, sum;
268: const PetscScalar *x;
269: const MatScalar *v;
270: PetscInt mbs, i, n;
271: const PetscInt *idx, *ii, *ridx = NULL;
272: PetscBool usecprow = a->compressedrow.use;
274: PetscFunctionBegin;
275: PetscCall(VecGetArrayRead(xx, &x));
276: PetscCall(VecGetArrayWrite(zz, &z));
278: if (usecprow) {
279: mbs = a->compressedrow.nrows;
280: ii = a->compressedrow.i;
281: ridx = a->compressedrow.rindex;
282: PetscCall(PetscArrayzero(z, a->mbs));
283: } else {
284: mbs = a->mbs;
285: ii = a->i;
286: }
288: for (i = 0; i < mbs; i++) {
289: n = ii[1] - ii[0];
290: v = a->a + ii[0];
291: idx = a->j + ii[0];
292: ii++;
293: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
294: PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
295: sum = 0.0;
296: PetscSparseDensePlusDot(sum, x, v, idx, n);
297: if (usecprow) {
298: z[ridx[i]] = sum;
299: } else {
300: z[i] = sum;
301: }
302: }
303: PetscCall(VecRestoreArrayRead(xx, &x));
304: PetscCall(VecRestoreArrayWrite(zz, &z));
305: PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
306: PetscFunctionReturn(PETSC_SUCCESS);
307: }
309: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
310: {
311: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
312: PetscScalar *z = NULL, sum1, sum2, *zarray;
313: const PetscScalar *x, *xb;
314: PetscScalar x1, x2;
315: const MatScalar *v;
316: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
317: PetscBool usecprow = a->compressedrow.use;
319: PetscFunctionBegin;
320: PetscCall(VecGetArrayRead(xx, &x));
321: PetscCall(VecGetArrayWrite(zz, &zarray));
323: idx = a->j;
324: v = a->a;
325: if (usecprow) {
326: mbs = a->compressedrow.nrows;
327: ii = a->compressedrow.i;
328: ridx = a->compressedrow.rindex;
329: PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
330: } else {
331: mbs = a->mbs;
332: ii = a->i;
333: z = zarray;
334: }
336: for (i = 0; i < mbs; i++) {
337: n = ii[1] - ii[0];
338: ii++;
339: sum1 = 0.0;
340: sum2 = 0.0;
341: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
342: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
343: for (j = 0; j < n; j++) {
344: xb = x + 2 * (*idx++);
345: x1 = xb[0];
346: x2 = xb[1];
347: sum1 += v[0] * x1 + v[2] * x2;
348: sum2 += v[1] * x1 + v[3] * x2;
349: v += 4;
350: }
351: if (usecprow) z = zarray + 2 * ridx[i];
352: z[0] = sum1;
353: z[1] = sum2;
354: if (!usecprow) z += 2;
355: }
356: PetscCall(VecRestoreArrayRead(xx, &x));
357: PetscCall(VecRestoreArrayWrite(zz, &zarray));
358: PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
359: PetscFunctionReturn(PETSC_SUCCESS);
360: }
362: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
363: {
364: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
365: PetscScalar *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
366: const PetscScalar *x, *xb;
367: const MatScalar *v;
368: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
369: PetscBool usecprow = a->compressedrow.use;
371: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
372: #pragma disjoint(*v, *z, *xb)
373: #endif
375: PetscFunctionBegin;
376: PetscCall(VecGetArrayRead(xx, &x));
377: PetscCall(VecGetArrayWrite(zz, &zarray));
379: idx = a->j;
380: v = a->a;
381: if (usecprow) {
382: mbs = a->compressedrow.nrows;
383: ii = a->compressedrow.i;
384: ridx = a->compressedrow.rindex;
385: PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
386: } else {
387: mbs = a->mbs;
388: ii = a->i;
389: z = zarray;
390: }
392: for (i = 0; i < mbs; i++) {
393: n = ii[1] - ii[0];
394: ii++;
395: sum1 = 0.0;
396: sum2 = 0.0;
397: sum3 = 0.0;
398: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
399: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
400: for (j = 0; j < n; j++) {
401: xb = x + 3 * (*idx++);
402: x1 = xb[0];
403: x2 = xb[1];
404: x3 = xb[2];
406: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
407: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
408: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
409: v += 9;
410: }
411: if (usecprow) z = zarray + 3 * ridx[i];
412: z[0] = sum1;
413: z[1] = sum2;
414: z[2] = sum3;
415: if (!usecprow) z += 3;
416: }
417: PetscCall(VecRestoreArrayRead(xx, &x));
418: PetscCall(VecRestoreArrayWrite(zz, &zarray));
419: PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
420: PetscFunctionReturn(PETSC_SUCCESS);
421: }
423: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
424: {
425: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
426: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
427: const PetscScalar *x, *xb;
428: const MatScalar *v;
429: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
430: PetscBool usecprow = a->compressedrow.use;
432: PetscFunctionBegin;
433: PetscCall(VecGetArrayRead(xx, &x));
434: PetscCall(VecGetArrayWrite(zz, &zarray));
436: idx = a->j;
437: v = a->a;
438: if (usecprow) {
439: mbs = a->compressedrow.nrows;
440: ii = a->compressedrow.i;
441: ridx = a->compressedrow.rindex;
442: PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
443: } else {
444: mbs = a->mbs;
445: ii = a->i;
446: z = zarray;
447: }
449: for (i = 0; i < mbs; i++) {
450: n = ii[1] - ii[0];
451: ii++;
452: sum1 = 0.0;
453: sum2 = 0.0;
454: sum3 = 0.0;
455: sum4 = 0.0;
457: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
458: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
459: for (j = 0; j < n; j++) {
460: xb = x + 4 * (*idx++);
461: x1 = xb[0];
462: x2 = xb[1];
463: x3 = xb[2];
464: x4 = xb[3];
465: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
466: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
467: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
468: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
469: v += 16;
470: }
471: if (usecprow) z = zarray + 4 * ridx[i];
472: z[0] = sum1;
473: z[1] = sum2;
474: z[2] = sum3;
475: z[3] = sum4;
476: if (!usecprow) z += 4;
477: }
478: PetscCall(VecRestoreArrayRead(xx, &x));
479: PetscCall(VecRestoreArrayWrite(zz, &zarray));
480: PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
481: PetscFunctionReturn(PETSC_SUCCESS);
482: }
484: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
485: {
486: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
487: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
488: const PetscScalar *xb, *x;
489: const MatScalar *v;
490: const PetscInt *idx, *ii, *ridx = NULL;
491: PetscInt mbs, i, j, n;
492: PetscBool usecprow = a->compressedrow.use;
494: PetscFunctionBegin;
495: PetscCall(VecGetArrayRead(xx, &x));
496: PetscCall(VecGetArrayWrite(zz, &zarray));
498: idx = a->j;
499: v = a->a;
500: if (usecprow) {
501: mbs = a->compressedrow.nrows;
502: ii = a->compressedrow.i;
503: ridx = a->compressedrow.rindex;
504: PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
505: } else {
506: mbs = a->mbs;
507: ii = a->i;
508: z = zarray;
509: }
511: for (i = 0; i < mbs; i++) {
512: n = ii[1] - ii[0];
513: ii++;
514: sum1 = 0.0;
515: sum2 = 0.0;
516: sum3 = 0.0;
517: sum4 = 0.0;
518: sum5 = 0.0;
519: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
520: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
521: for (j = 0; j < n; j++) {
522: xb = x + 5 * (*idx++);
523: x1 = xb[0];
524: x2 = xb[1];
525: x3 = xb[2];
526: x4 = xb[3];
527: x5 = xb[4];
528: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
529: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
530: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
531: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
532: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
533: v += 25;
534: }
535: if (usecprow) z = zarray + 5 * ridx[i];
536: z[0] = sum1;
537: z[1] = sum2;
538: z[2] = sum3;
539: z[3] = sum4;
540: z[4] = sum5;
541: if (!usecprow) z += 5;
542: }
543: PetscCall(VecRestoreArrayRead(xx, &x));
544: PetscCall(VecRestoreArrayWrite(zz, &zarray));
545: PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
546: PetscFunctionReturn(PETSC_SUCCESS);
547: }
549: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
550: {
551: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
552: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
553: const PetscScalar *x, *xb;
554: PetscScalar x1, x2, x3, x4, x5, x6, *zarray;
555: const MatScalar *v;
556: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
557: PetscBool usecprow = a->compressedrow.use;
559: PetscFunctionBegin;
560: PetscCall(VecGetArrayRead(xx, &x));
561: PetscCall(VecGetArrayWrite(zz, &zarray));
563: idx = a->j;
564: v = a->a;
565: if (usecprow) {
566: mbs = a->compressedrow.nrows;
567: ii = a->compressedrow.i;
568: ridx = a->compressedrow.rindex;
569: PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
570: } else {
571: mbs = a->mbs;
572: ii = a->i;
573: z = zarray;
574: }
576: for (i = 0; i < mbs; i++) {
577: n = ii[1] - ii[0];
578: ii++;
579: sum1 = 0.0;
580: sum2 = 0.0;
581: sum3 = 0.0;
582: sum4 = 0.0;
583: sum5 = 0.0;
584: sum6 = 0.0;
586: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
587: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
588: for (j = 0; j < n; j++) {
589: xb = x + 6 * (*idx++);
590: x1 = xb[0];
591: x2 = xb[1];
592: x3 = xb[2];
593: x4 = xb[3];
594: x5 = xb[4];
595: x6 = xb[5];
596: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
597: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
598: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
599: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
600: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
601: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
602: v += 36;
603: }
604: if (usecprow) z = zarray + 6 * ridx[i];
605: z[0] = sum1;
606: z[1] = sum2;
607: z[2] = sum3;
608: z[3] = sum4;
609: z[4] = sum5;
610: z[5] = sum6;
611: if (!usecprow) z += 6;
612: }
614: PetscCall(VecRestoreArrayRead(xx, &x));
615: PetscCall(VecRestoreArrayWrite(zz, &zarray));
616: PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
617: PetscFunctionReturn(PETSC_SUCCESS);
618: }
620: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
621: {
622: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
623: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
624: const PetscScalar *x, *xb;
625: PetscScalar x1, x2, x3, x4, x5, x6, x7, *zarray;
626: const MatScalar *v;
627: PetscInt mbs, i, *idx, *ii, j, n, *ridx = NULL;
628: PetscBool usecprow = a->compressedrow.use;
630: PetscFunctionBegin;
631: PetscCall(VecGetArrayRead(xx, &x));
632: PetscCall(VecGetArrayWrite(zz, &zarray));
634: idx = a->j;
635: v = a->a;
636: if (usecprow) {
637: mbs = a->compressedrow.nrows;
638: ii = a->compressedrow.i;
639: ridx = a->compressedrow.rindex;
640: PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
641: } else {
642: mbs = a->mbs;
643: ii = a->i;
644: z = zarray;
645: }
647: for (i = 0; i < mbs; i++) {
648: n = ii[1] - ii[0];
649: ii++;
650: sum1 = 0.0;
651: sum2 = 0.0;
652: sum3 = 0.0;
653: sum4 = 0.0;
654: sum5 = 0.0;
655: sum6 = 0.0;
656: sum7 = 0.0;
658: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
659: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
660: for (j = 0; j < n; j++) {
661: xb = x + 7 * (*idx++);
662: x1 = xb[0];
663: x2 = xb[1];
664: x3 = xb[2];
665: x4 = xb[3];
666: x5 = xb[4];
667: x6 = xb[5];
668: x7 = xb[6];
669: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
670: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
671: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
672: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
673: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
674: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
675: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
676: v += 49;
677: }
678: if (usecprow) z = zarray + 7 * ridx[i];
679: z[0] = sum1;
680: z[1] = sum2;
681: z[2] = sum3;
682: z[3] = sum4;
683: z[4] = sum5;
684: z[5] = sum6;
685: z[6] = sum7;
686: if (!usecprow) z += 7;
687: }
689: PetscCall(VecRestoreArrayRead(xx, &x));
690: PetscCall(VecRestoreArrayWrite(zz, &zarray));
691: PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
692: PetscFunctionReturn(PETSC_SUCCESS);
693: }
695: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
696: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
697: {
698: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
699: PetscScalar *z = NULL, *work, *workt, *zarray;
700: const PetscScalar *x, *xb;
701: const MatScalar *v;
702: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
703: const PetscInt *idx, *ii, *ridx = NULL;
704: PetscInt k;
705: PetscBool usecprow = a->compressedrow.use;
707: __m256d a0, a1, a2, a3, a4, a5;
708: __m256d w0, w1, w2, w3;
709: __m256d z0, z1, z2;
710: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
712: PetscFunctionBegin;
713: PetscCall(VecGetArrayRead(xx, &x));
714: PetscCall(VecGetArrayWrite(zz, &zarray));
716: idx = a->j;
717: v = a->a;
718: if (usecprow) {
719: mbs = a->compressedrow.nrows;
720: ii = a->compressedrow.i;
721: ridx = a->compressedrow.rindex;
722: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
723: } else {
724: mbs = a->mbs;
725: ii = a->i;
726: z = zarray;
727: }
729: if (!a->mult_work) {
730: k = PetscMax(A->rmap->n, A->cmap->n);
731: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
732: }
734: work = a->mult_work;
735: for (i = 0; i < mbs; i++) {
736: n = ii[1] - ii[0];
737: ii++;
738: workt = work;
739: for (j = 0; j < n; j++) {
740: xb = x + bs * (*idx++);
741: for (k = 0; k < bs; k++) workt[k] = xb[k];
742: workt += bs;
743: }
744: if (usecprow) z = zarray + bs * ridx[i];
746: z0 = _mm256_setzero_pd();
747: z1 = _mm256_setzero_pd();
748: z2 = _mm256_setzero_pd();
750: for (j = 0; j < n; j++) {
751: /* first column of a */
752: w0 = _mm256_set1_pd(work[j * 9]);
753: a0 = _mm256_loadu_pd(&v[j * 81]);
754: z0 = _mm256_fmadd_pd(a0, w0, z0);
755: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
756: z1 = _mm256_fmadd_pd(a1, w0, z1);
757: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
758: z2 = _mm256_fmadd_pd(a2, w0, z2);
760: /* second column of a */
761: w1 = _mm256_set1_pd(work[j * 9 + 1]);
762: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
763: z0 = _mm256_fmadd_pd(a0, w1, z0);
764: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
765: z1 = _mm256_fmadd_pd(a1, w1, z1);
766: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
767: z2 = _mm256_fmadd_pd(a2, w1, z2);
769: /* third column of a */
770: w2 = _mm256_set1_pd(work[j * 9 + 2]);
771: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
772: z0 = _mm256_fmadd_pd(a3, w2, z0);
773: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
774: z1 = _mm256_fmadd_pd(a4, w2, z1);
775: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
776: z2 = _mm256_fmadd_pd(a5, w2, z2);
778: /* fourth column of a */
779: w3 = _mm256_set1_pd(work[j * 9 + 3]);
780: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
781: z0 = _mm256_fmadd_pd(a0, w3, z0);
782: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
783: z1 = _mm256_fmadd_pd(a1, w3, z1);
784: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
785: z2 = _mm256_fmadd_pd(a2, w3, z2);
787: /* fifth column of a */
788: w0 = _mm256_set1_pd(work[j * 9 + 4]);
789: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
790: z0 = _mm256_fmadd_pd(a3, w0, z0);
791: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
792: z1 = _mm256_fmadd_pd(a4, w0, z1);
793: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
794: z2 = _mm256_fmadd_pd(a5, w0, z2);
796: /* sixth column of a */
797: w1 = _mm256_set1_pd(work[j * 9 + 5]);
798: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
799: z0 = _mm256_fmadd_pd(a0, w1, z0);
800: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
801: z1 = _mm256_fmadd_pd(a1, w1, z1);
802: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
803: z2 = _mm256_fmadd_pd(a2, w1, z2);
805: /* seventh column of a */
806: w2 = _mm256_set1_pd(work[j * 9 + 6]);
807: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
808: z0 = _mm256_fmadd_pd(a0, w2, z0);
809: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
810: z1 = _mm256_fmadd_pd(a1, w2, z1);
811: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
812: z2 = _mm256_fmadd_pd(a2, w2, z2);
814: /* eighth column of a */
815: w3 = _mm256_set1_pd(work[j * 9 + 7]);
816: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
817: z0 = _mm256_fmadd_pd(a3, w3, z0);
818: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
819: z1 = _mm256_fmadd_pd(a4, w3, z1);
820: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
821: z2 = _mm256_fmadd_pd(a5, w3, z2);
823: /* ninth column of a */
824: w0 = _mm256_set1_pd(work[j * 9 + 8]);
825: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
826: z0 = _mm256_fmadd_pd(a0, w0, z0);
827: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
828: z1 = _mm256_fmadd_pd(a1, w0, z1);
829: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
830: z2 = _mm256_fmadd_pd(a2, w0, z2);
831: }
833: _mm256_storeu_pd(&z[0], z0);
834: _mm256_storeu_pd(&z[4], z1);
835: _mm256_maskstore_pd(&z[8], mask1, z2);
837: v += n * bs2;
838: if (!usecprow) z += bs;
839: }
840: PetscCall(VecRestoreArrayRead(xx, &x));
841: PetscCall(VecRestoreArrayWrite(zz, &zarray));
842: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
843: PetscFunctionReturn(PETSC_SUCCESS);
844: }
845: #endif
847: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
848: {
849: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
850: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
851: const PetscScalar *x, *xb;
852: PetscScalar *zarray, xv;
853: const MatScalar *v;
854: const PetscInt *ii, *ij = a->j, *idx;
855: PetscInt mbs, i, j, k, n, *ridx = NULL;
856: PetscBool usecprow = a->compressedrow.use;
858: PetscFunctionBegin;
859: PetscCall(VecGetArrayRead(xx, &x));
860: PetscCall(VecGetArrayWrite(zz, &zarray));
862: v = a->a;
863: if (usecprow) {
864: mbs = a->compressedrow.nrows;
865: ii = a->compressedrow.i;
866: ridx = a->compressedrow.rindex;
867: PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
868: } else {
869: mbs = a->mbs;
870: ii = a->i;
871: z = zarray;
872: }
874: for (i = 0; i < mbs; i++) {
875: n = ii[i + 1] - ii[i];
876: idx = ij + ii[i];
877: sum1 = 0.0;
878: sum2 = 0.0;
879: sum3 = 0.0;
880: sum4 = 0.0;
881: sum5 = 0.0;
882: sum6 = 0.0;
883: sum7 = 0.0;
884: sum8 = 0.0;
885: sum9 = 0.0;
886: sum10 = 0.0;
887: sum11 = 0.0;
889: for (j = 0; j < n; j++) {
890: xb = x + 11 * (idx[j]);
892: for (k = 0; k < 11; k++) {
893: xv = xb[k];
894: sum1 += v[0] * xv;
895: sum2 += v[1] * xv;
896: sum3 += v[2] * xv;
897: sum4 += v[3] * xv;
898: sum5 += v[4] * xv;
899: sum6 += v[5] * xv;
900: sum7 += v[6] * xv;
901: sum8 += v[7] * xv;
902: sum9 += v[8] * xv;
903: sum10 += v[9] * xv;
904: sum11 += v[10] * xv;
905: v += 11;
906: }
907: }
908: if (usecprow) z = zarray + 11 * ridx[i];
909: z[0] = sum1;
910: z[1] = sum2;
911: z[2] = sum3;
912: z[3] = sum4;
913: z[4] = sum5;
914: z[5] = sum6;
915: z[6] = sum7;
916: z[7] = sum8;
917: z[8] = sum9;
918: z[9] = sum10;
919: z[10] = sum11;
921: if (!usecprow) z += 11;
922: }
924: PetscCall(VecRestoreArrayRead(xx, &x));
925: PetscCall(VecRestoreArrayWrite(zz, &zarray));
926: PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
927: PetscFunctionReturn(PETSC_SUCCESS);
928: }
930: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
931: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
932: {
933: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
934: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
935: const PetscScalar *x, *xb;
936: PetscScalar *zarray, xv;
937: const MatScalar *v;
938: const PetscInt *ii, *ij = a->j, *idx;
939: PetscInt mbs, i, j, k, n, *ridx = NULL;
940: PetscBool usecprow = a->compressedrow.use;
942: PetscFunctionBegin;
943: PetscCall(VecGetArrayRead(xx, &x));
944: PetscCall(VecGetArrayWrite(zz, &zarray));
946: v = a->a;
947: if (usecprow) {
948: mbs = a->compressedrow.nrows;
949: ii = a->compressedrow.i;
950: ridx = a->compressedrow.rindex;
951: PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
952: } else {
953: mbs = a->mbs;
954: ii = a->i;
955: z = zarray;
956: }
958: for (i = 0; i < mbs; i++) {
959: n = ii[i + 1] - ii[i];
960: idx = ij + ii[i];
961: sum1 = 0.0;
962: sum2 = 0.0;
963: sum3 = 0.0;
964: sum4 = 0.0;
965: sum5 = 0.0;
966: sum6 = 0.0;
967: sum7 = 0.0;
968: sum8 = 0.0;
969: sum9 = 0.0;
970: sum10 = 0.0;
971: sum11 = 0.0;
972: sum12 = 0.0;
974: for (j = 0; j < n; j++) {
975: xb = x + 12 * (idx[j]);
977: for (k = 0; k < 12; k++) {
978: xv = xb[k];
979: sum1 += v[0] * xv;
980: sum2 += v[1] * xv;
981: sum3 += v[2] * xv;
982: sum4 += v[3] * xv;
983: sum5 += v[4] * xv;
984: sum6 += v[5] * xv;
985: sum7 += v[6] * xv;
986: sum8 += v[7] * xv;
987: sum9 += v[8] * xv;
988: sum10 += v[9] * xv;
989: sum11 += v[10] * xv;
990: sum12 += v[11] * xv;
991: v += 12;
992: }
993: }
994: if (usecprow) z = zarray + 12 * ridx[i];
995: z[0] = sum1;
996: z[1] = sum2;
997: z[2] = sum3;
998: z[3] = sum4;
999: z[4] = sum5;
1000: z[5] = sum6;
1001: z[6] = sum7;
1002: z[7] = sum8;
1003: z[8] = sum9;
1004: z[9] = sum10;
1005: z[10] = sum11;
1006: z[11] = sum12;
1007: if (!usecprow) z += 12;
1008: }
1009: PetscCall(VecRestoreArrayRead(xx, &x));
1010: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1011: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1012: PetscFunctionReturn(PETSC_SUCCESS);
1013: }
1015: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1016: {
1017: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1018: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1019: const PetscScalar *x, *xb;
1020: PetscScalar *zarray, *yarray, xv;
1021: const MatScalar *v;
1022: const PetscInt *ii, *ij = a->j, *idx;
1023: PetscInt mbs = a->mbs, i, j, k, n, *ridx = NULL;
1024: PetscBool usecprow = a->compressedrow.use;
1026: PetscFunctionBegin;
1027: PetscCall(VecGetArrayRead(xx, &x));
1028: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1030: v = a->a;
1031: if (usecprow) {
1032: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1033: mbs = a->compressedrow.nrows;
1034: ii = a->compressedrow.i;
1035: ridx = a->compressedrow.rindex;
1036: } else {
1037: ii = a->i;
1038: y = yarray;
1039: z = zarray;
1040: }
1042: for (i = 0; i < mbs; i++) {
1043: n = ii[i + 1] - ii[i];
1044: idx = ij + ii[i];
1046: if (usecprow) {
1047: y = yarray + 12 * ridx[i];
1048: z = zarray + 12 * ridx[i];
1049: }
1050: sum1 = y[0];
1051: sum2 = y[1];
1052: sum3 = y[2];
1053: sum4 = y[3];
1054: sum5 = y[4];
1055: sum6 = y[5];
1056: sum7 = y[6];
1057: sum8 = y[7];
1058: sum9 = y[8];
1059: sum10 = y[9];
1060: sum11 = y[10];
1061: sum12 = y[11];
1063: for (j = 0; j < n; j++) {
1064: xb = x + 12 * (idx[j]);
1066: for (k = 0; k < 12; k++) {
1067: xv = xb[k];
1068: sum1 += v[0] * xv;
1069: sum2 += v[1] * xv;
1070: sum3 += v[2] * xv;
1071: sum4 += v[3] * xv;
1072: sum5 += v[4] * xv;
1073: sum6 += v[5] * xv;
1074: sum7 += v[6] * xv;
1075: sum8 += v[7] * xv;
1076: sum9 += v[8] * xv;
1077: sum10 += v[9] * xv;
1078: sum11 += v[10] * xv;
1079: sum12 += v[11] * xv;
1080: v += 12;
1081: }
1082: }
1084: z[0] = sum1;
1085: z[1] = sum2;
1086: z[2] = sum3;
1087: z[3] = sum4;
1088: z[4] = sum5;
1089: z[5] = sum6;
1090: z[6] = sum7;
1091: z[7] = sum8;
1092: z[8] = sum9;
1093: z[9] = sum10;
1094: z[10] = sum11;
1095: z[11] = sum12;
1096: if (!usecprow) {
1097: y += 12;
1098: z += 12;
1099: }
1100: }
1101: PetscCall(VecRestoreArrayRead(xx, &x));
1102: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1103: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1104: PetscFunctionReturn(PETSC_SUCCESS);
1105: }
1107: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1108: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1109: {
1110: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1111: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1112: const PetscScalar *x, *xb;
1113: PetscScalar x1, x2, x3, x4, *zarray;
1114: const MatScalar *v;
1115: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1116: PetscInt mbs, i, j, n;
1117: PetscBool usecprow = a->compressedrow.use;
1119: PetscFunctionBegin;
1120: PetscCall(VecGetArrayRead(xx, &x));
1121: PetscCall(VecGetArrayWrite(zz, &zarray));
1123: v = a->a;
1124: if (usecprow) {
1125: mbs = a->compressedrow.nrows;
1126: ii = a->compressedrow.i;
1127: ridx = a->compressedrow.rindex;
1128: PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1129: } else {
1130: mbs = a->mbs;
1131: ii = a->i;
1132: z = zarray;
1133: }
1135: for (i = 0; i < mbs; i++) {
1136: n = ii[i + 1] - ii[i];
1137: idx = ij + ii[i];
1139: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1140: for (j = 0; j < n; j++) {
1141: xb = x + 12 * (idx[j]);
1142: x1 = xb[0];
1143: x2 = xb[1];
1144: x3 = xb[2];
1145: x4 = xb[3];
1147: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1148: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1149: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1150: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1151: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1152: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1153: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1154: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1155: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1156: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1157: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1158: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1159: v += 48;
1161: x1 = xb[4];
1162: x2 = xb[5];
1163: x3 = xb[6];
1164: x4 = xb[7];
1166: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1167: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1168: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1169: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1170: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1171: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1172: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1173: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1174: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1175: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1176: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1177: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1178: v += 48;
1180: x1 = xb[8];
1181: x2 = xb[9];
1182: x3 = xb[10];
1183: x4 = xb[11];
1184: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1185: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1186: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1187: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1188: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1189: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1190: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1191: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1192: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1193: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1194: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1195: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1196: v += 48;
1197: }
1198: if (usecprow) z = zarray + 12 * ridx[i];
1199: z[0] = sum1;
1200: z[1] = sum2;
1201: z[2] = sum3;
1202: z[3] = sum4;
1203: z[4] = sum5;
1204: z[5] = sum6;
1205: z[6] = sum7;
1206: z[7] = sum8;
1207: z[8] = sum9;
1208: z[9] = sum10;
1209: z[10] = sum11;
1210: z[11] = sum12;
1211: if (!usecprow) z += 12;
1212: }
1213: PetscCall(VecRestoreArrayRead(xx, &x));
1214: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1215: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1216: PetscFunctionReturn(PETSC_SUCCESS);
1217: }
1219: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1220: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1221: {
1222: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1223: PetscScalar *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1224: const PetscScalar *x, *xb;
1225: PetscScalar x1, x2, x3, x4, *zarray, *yarray;
1226: const MatScalar *v;
1227: const PetscInt *ii, *ij = a->j, *idx, *ridx = NULL;
1228: PetscInt mbs = a->mbs, i, j, n;
1229: PetscBool usecprow = a->compressedrow.use;
1231: PetscFunctionBegin;
1232: PetscCall(VecGetArrayRead(xx, &x));
1233: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
1235: v = a->a;
1236: if (usecprow) {
1237: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1238: mbs = a->compressedrow.nrows;
1239: ii = a->compressedrow.i;
1240: ridx = a->compressedrow.rindex;
1241: } else {
1242: ii = a->i;
1243: y = yarray;
1244: z = zarray;
1245: }
1247: for (i = 0; i < mbs; i++) {
1248: n = ii[i + 1] - ii[i];
1249: idx = ij + ii[i];
1251: if (usecprow) {
1252: y = yarray + 12 * ridx[i];
1253: z = zarray + 12 * ridx[i];
1254: }
1255: sum1 = y[0];
1256: sum2 = y[1];
1257: sum3 = y[2];
1258: sum4 = y[3];
1259: sum5 = y[4];
1260: sum6 = y[5];
1261: sum7 = y[6];
1262: sum8 = y[7];
1263: sum9 = y[8];
1264: sum10 = y[9];
1265: sum11 = y[10];
1266: sum12 = y[11];
1268: for (j = 0; j < n; j++) {
1269: xb = x + 12 * (idx[j]);
1270: x1 = xb[0];
1271: x2 = xb[1];
1272: x3 = xb[2];
1273: x4 = xb[3];
1275: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1276: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1277: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1278: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1279: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1280: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1281: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1282: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1283: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1284: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1285: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1286: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1287: v += 48;
1289: x1 = xb[4];
1290: x2 = xb[5];
1291: x3 = xb[6];
1292: x4 = xb[7];
1294: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1295: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1296: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1297: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1298: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1299: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1300: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1301: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1302: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1303: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1304: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1305: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1306: v += 48;
1308: x1 = xb[8];
1309: x2 = xb[9];
1310: x3 = xb[10];
1311: x4 = xb[11];
1312: sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1313: sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1314: sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1315: sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1316: sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1317: sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1318: sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1319: sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1320: sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1321: sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1322: sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1323: sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1324: v += 48;
1325: }
1326: z[0] = sum1;
1327: z[1] = sum2;
1328: z[2] = sum3;
1329: z[3] = sum4;
1330: z[4] = sum5;
1331: z[5] = sum6;
1332: z[6] = sum7;
1333: z[7] = sum8;
1334: z[8] = sum9;
1335: z[9] = sum10;
1336: z[10] = sum11;
1337: z[11] = sum12;
1338: if (!usecprow) {
1339: y += 12;
1340: z += 12;
1341: }
1342: }
1343: PetscCall(VecRestoreArrayRead(xx, &x));
1344: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1345: PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1346: PetscFunctionReturn(PETSC_SUCCESS);
1347: }
1349: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1350: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1351: {
1352: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1353: PetscScalar *z = NULL, *zarray;
1354: const PetscScalar *x, *work;
1355: const MatScalar *v = a->a;
1356: PetscInt mbs, i, j, n;
1357: const PetscInt *idx = a->j, *ii, *ridx = NULL;
1358: PetscBool usecprow = a->compressedrow.use;
1359: const PetscInt bs = 12, bs2 = 144;
1361: __m256d a0, a1, a2, a3, a4, a5;
1362: __m256d w0, w1, w2, w3;
1363: __m256d z0, z1, z2;
1365: PetscFunctionBegin;
1366: PetscCall(VecGetArrayRead(xx, &x));
1367: PetscCall(VecGetArrayWrite(zz, &zarray));
1369: if (usecprow) {
1370: mbs = a->compressedrow.nrows;
1371: ii = a->compressedrow.i;
1372: ridx = a->compressedrow.rindex;
1373: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1374: } else {
1375: mbs = a->mbs;
1376: ii = a->i;
1377: z = zarray;
1378: }
1380: for (i = 0; i < mbs; i++) {
1381: z0 = _mm256_setzero_pd();
1382: z1 = _mm256_setzero_pd();
1383: z2 = _mm256_setzero_pd();
1385: n = ii[1] - ii[0];
1386: ii++;
1387: for (j = 0; j < n; j++) {
1388: work = x + bs * (*idx++);
1390: /* first column of a */
1391: w0 = _mm256_set1_pd(work[0]);
1392: a0 = _mm256_loadu_pd(v + 0);
1393: z0 = _mm256_fmadd_pd(a0, w0, z0);
1394: a1 = _mm256_loadu_pd(v + 4);
1395: z1 = _mm256_fmadd_pd(a1, w0, z1);
1396: a2 = _mm256_loadu_pd(v + 8);
1397: z2 = _mm256_fmadd_pd(a2, w0, z2);
1399: /* second column of a */
1400: w1 = _mm256_set1_pd(work[1]);
1401: a3 = _mm256_loadu_pd(v + 12);
1402: z0 = _mm256_fmadd_pd(a3, w1, z0);
1403: a4 = _mm256_loadu_pd(v + 16);
1404: z1 = _mm256_fmadd_pd(a4, w1, z1);
1405: a5 = _mm256_loadu_pd(v + 20);
1406: z2 = _mm256_fmadd_pd(a5, w1, z2);
1408: /* third column of a */
1409: w2 = _mm256_set1_pd(work[2]);
1410: a0 = _mm256_loadu_pd(v + 24);
1411: z0 = _mm256_fmadd_pd(a0, w2, z0);
1412: a1 = _mm256_loadu_pd(v + 28);
1413: z1 = _mm256_fmadd_pd(a1, w2, z1);
1414: a2 = _mm256_loadu_pd(v + 32);
1415: z2 = _mm256_fmadd_pd(a2, w2, z2);
1417: /* fourth column of a */
1418: w3 = _mm256_set1_pd(work[3]);
1419: a3 = _mm256_loadu_pd(v + 36);
1420: z0 = _mm256_fmadd_pd(a3, w3, z0);
1421: a4 = _mm256_loadu_pd(v + 40);
1422: z1 = _mm256_fmadd_pd(a4, w3, z1);
1423: a5 = _mm256_loadu_pd(v + 44);
1424: z2 = _mm256_fmadd_pd(a5, w3, z2);
1426: /* fifth column of a */
1427: w0 = _mm256_set1_pd(work[4]);
1428: a0 = _mm256_loadu_pd(v + 48);
1429: z0 = _mm256_fmadd_pd(a0, w0, z0);
1430: a1 = _mm256_loadu_pd(v + 52);
1431: z1 = _mm256_fmadd_pd(a1, w0, z1);
1432: a2 = _mm256_loadu_pd(v + 56);
1433: z2 = _mm256_fmadd_pd(a2, w0, z2);
1435: /* sixth column of a */
1436: w1 = _mm256_set1_pd(work[5]);
1437: a3 = _mm256_loadu_pd(v + 60);
1438: z0 = _mm256_fmadd_pd(a3, w1, z0);
1439: a4 = _mm256_loadu_pd(v + 64);
1440: z1 = _mm256_fmadd_pd(a4, w1, z1);
1441: a5 = _mm256_loadu_pd(v + 68);
1442: z2 = _mm256_fmadd_pd(a5, w1, z2);
1444: /* seventh column of a */
1445: w2 = _mm256_set1_pd(work[6]);
1446: a0 = _mm256_loadu_pd(v + 72);
1447: z0 = _mm256_fmadd_pd(a0, w2, z0);
1448: a1 = _mm256_loadu_pd(v + 76);
1449: z1 = _mm256_fmadd_pd(a1, w2, z1);
1450: a2 = _mm256_loadu_pd(v + 80);
1451: z2 = _mm256_fmadd_pd(a2, w2, z2);
1453: /* eighth column of a */
1454: w3 = _mm256_set1_pd(work[7]);
1455: a3 = _mm256_loadu_pd(v + 84);
1456: z0 = _mm256_fmadd_pd(a3, w3, z0);
1457: a4 = _mm256_loadu_pd(v + 88);
1458: z1 = _mm256_fmadd_pd(a4, w3, z1);
1459: a5 = _mm256_loadu_pd(v + 92);
1460: z2 = _mm256_fmadd_pd(a5, w3, z2);
1462: /* ninth column of a */
1463: w0 = _mm256_set1_pd(work[8]);
1464: a0 = _mm256_loadu_pd(v + 96);
1465: z0 = _mm256_fmadd_pd(a0, w0, z0);
1466: a1 = _mm256_loadu_pd(v + 100);
1467: z1 = _mm256_fmadd_pd(a1, w0, z1);
1468: a2 = _mm256_loadu_pd(v + 104);
1469: z2 = _mm256_fmadd_pd(a2, w0, z2);
1471: /* tenth column of a */
1472: w1 = _mm256_set1_pd(work[9]);
1473: a3 = _mm256_loadu_pd(v + 108);
1474: z0 = _mm256_fmadd_pd(a3, w1, z0);
1475: a4 = _mm256_loadu_pd(v + 112);
1476: z1 = _mm256_fmadd_pd(a4, w1, z1);
1477: a5 = _mm256_loadu_pd(v + 116);
1478: z2 = _mm256_fmadd_pd(a5, w1, z2);
1480: /* eleventh column of a */
1481: w2 = _mm256_set1_pd(work[10]);
1482: a0 = _mm256_loadu_pd(v + 120);
1483: z0 = _mm256_fmadd_pd(a0, w2, z0);
1484: a1 = _mm256_loadu_pd(v + 124);
1485: z1 = _mm256_fmadd_pd(a1, w2, z1);
1486: a2 = _mm256_loadu_pd(v + 128);
1487: z2 = _mm256_fmadd_pd(a2, w2, z2);
1489: /* twelveth column of a */
1490: w3 = _mm256_set1_pd(work[11]);
1491: a3 = _mm256_loadu_pd(v + 132);
1492: z0 = _mm256_fmadd_pd(a3, w3, z0);
1493: a4 = _mm256_loadu_pd(v + 136);
1494: z1 = _mm256_fmadd_pd(a4, w3, z1);
1495: a5 = _mm256_loadu_pd(v + 140);
1496: z2 = _mm256_fmadd_pd(a5, w3, z2);
1498: v += bs2;
1499: }
1500: if (usecprow) z = zarray + bs * ridx[i];
1501: _mm256_storeu_pd(&z[0], z0);
1502: _mm256_storeu_pd(&z[4], z1);
1503: _mm256_storeu_pd(&z[8], z2);
1504: if (!usecprow) z += bs;
1505: }
1506: PetscCall(VecRestoreArrayRead(xx, &x));
1507: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1508: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1509: PetscFunctionReturn(PETSC_SUCCESS);
1510: }
1511: #endif
1513: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1514: /* Default MatMult for block size 15 */
1515: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1516: {
1517: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1518: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1519: const PetscScalar *x, *xb;
1520: PetscScalar *zarray, xv;
1521: const MatScalar *v;
1522: const PetscInt *ii, *ij = a->j, *idx;
1523: PetscInt mbs, i, j, k, n, *ridx = NULL;
1524: PetscBool usecprow = a->compressedrow.use;
1526: PetscFunctionBegin;
1527: PetscCall(VecGetArrayRead(xx, &x));
1528: PetscCall(VecGetArrayWrite(zz, &zarray));
1530: v = a->a;
1531: if (usecprow) {
1532: mbs = a->compressedrow.nrows;
1533: ii = a->compressedrow.i;
1534: ridx = a->compressedrow.rindex;
1535: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1536: } else {
1537: mbs = a->mbs;
1538: ii = a->i;
1539: z = zarray;
1540: }
1542: for (i = 0; i < mbs; i++) {
1543: n = ii[i + 1] - ii[i];
1544: idx = ij + ii[i];
1545: sum1 = 0.0;
1546: sum2 = 0.0;
1547: sum3 = 0.0;
1548: sum4 = 0.0;
1549: sum5 = 0.0;
1550: sum6 = 0.0;
1551: sum7 = 0.0;
1552: sum8 = 0.0;
1553: sum9 = 0.0;
1554: sum10 = 0.0;
1555: sum11 = 0.0;
1556: sum12 = 0.0;
1557: sum13 = 0.0;
1558: sum14 = 0.0;
1559: sum15 = 0.0;
1561: for (j = 0; j < n; j++) {
1562: xb = x + 15 * (idx[j]);
1564: for (k = 0; k < 15; k++) {
1565: xv = xb[k];
1566: sum1 += v[0] * xv;
1567: sum2 += v[1] * xv;
1568: sum3 += v[2] * xv;
1569: sum4 += v[3] * xv;
1570: sum5 += v[4] * xv;
1571: sum6 += v[5] * xv;
1572: sum7 += v[6] * xv;
1573: sum8 += v[7] * xv;
1574: sum9 += v[8] * xv;
1575: sum10 += v[9] * xv;
1576: sum11 += v[10] * xv;
1577: sum12 += v[11] * xv;
1578: sum13 += v[12] * xv;
1579: sum14 += v[13] * xv;
1580: sum15 += v[14] * xv;
1581: v += 15;
1582: }
1583: }
1584: if (usecprow) z = zarray + 15 * ridx[i];
1585: z[0] = sum1;
1586: z[1] = sum2;
1587: z[2] = sum3;
1588: z[3] = sum4;
1589: z[4] = sum5;
1590: z[5] = sum6;
1591: z[6] = sum7;
1592: z[7] = sum8;
1593: z[8] = sum9;
1594: z[9] = sum10;
1595: z[10] = sum11;
1596: z[11] = sum12;
1597: z[12] = sum13;
1598: z[13] = sum14;
1599: z[14] = sum15;
1601: if (!usecprow) z += 15;
1602: }
1604: PetscCall(VecRestoreArrayRead(xx, &x));
1605: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1606: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1607: PetscFunctionReturn(PETSC_SUCCESS);
1608: }
1610: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1611: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1612: {
1613: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1614: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1615: const PetscScalar *x, *xb;
1616: PetscScalar x1, x2, x3, x4, *zarray;
1617: const MatScalar *v;
1618: const PetscInt *ii, *ij = a->j, *idx;
1619: PetscInt mbs, i, j, n, *ridx = NULL;
1620: PetscBool usecprow = a->compressedrow.use;
1622: PetscFunctionBegin;
1623: PetscCall(VecGetArrayRead(xx, &x));
1624: PetscCall(VecGetArrayWrite(zz, &zarray));
1626: v = a->a;
1627: if (usecprow) {
1628: mbs = a->compressedrow.nrows;
1629: ii = a->compressedrow.i;
1630: ridx = a->compressedrow.rindex;
1631: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1632: } else {
1633: mbs = a->mbs;
1634: ii = a->i;
1635: z = zarray;
1636: }
1638: for (i = 0; i < mbs; i++) {
1639: n = ii[i + 1] - ii[i];
1640: idx = ij + ii[i];
1641: sum1 = 0.0;
1642: sum2 = 0.0;
1643: sum3 = 0.0;
1644: sum4 = 0.0;
1645: sum5 = 0.0;
1646: sum6 = 0.0;
1647: sum7 = 0.0;
1648: sum8 = 0.0;
1649: sum9 = 0.0;
1650: sum10 = 0.0;
1651: sum11 = 0.0;
1652: sum12 = 0.0;
1653: sum13 = 0.0;
1654: sum14 = 0.0;
1655: sum15 = 0.0;
1657: for (j = 0; j < n; j++) {
1658: xb = x + 15 * (idx[j]);
1659: x1 = xb[0];
1660: x2 = xb[1];
1661: x3 = xb[2];
1662: x4 = xb[3];
1664: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1665: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1666: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1667: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1668: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1669: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1670: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1671: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1672: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1673: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1674: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1675: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1676: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1677: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1678: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1680: v += 60;
1682: x1 = xb[4];
1683: x2 = xb[5];
1684: x3 = xb[6];
1685: x4 = xb[7];
1687: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1688: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1689: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1690: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1691: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1692: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1693: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1694: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1695: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1696: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1697: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1698: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1699: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1700: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1701: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1702: v += 60;
1704: x1 = xb[8];
1705: x2 = xb[9];
1706: x3 = xb[10];
1707: x4 = xb[11];
1708: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1709: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1710: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1711: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1712: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1713: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1714: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1715: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1716: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1717: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1718: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1719: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1720: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1721: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1722: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1723: v += 60;
1725: x1 = xb[12];
1726: x2 = xb[13];
1727: x3 = xb[14];
1728: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1729: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1730: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1731: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1732: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1733: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1734: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1735: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1736: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1737: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1738: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1739: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1740: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1741: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1742: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1743: v += 45;
1744: }
1745: if (usecprow) z = zarray + 15 * ridx[i];
1746: z[0] = sum1;
1747: z[1] = sum2;
1748: z[2] = sum3;
1749: z[3] = sum4;
1750: z[4] = sum5;
1751: z[5] = sum6;
1752: z[6] = sum7;
1753: z[7] = sum8;
1754: z[8] = sum9;
1755: z[9] = sum10;
1756: z[10] = sum11;
1757: z[11] = sum12;
1758: z[12] = sum13;
1759: z[13] = sum14;
1760: z[14] = sum15;
1762: if (!usecprow) z += 15;
1763: }
1765: PetscCall(VecRestoreArrayRead(xx, &x));
1766: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1767: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1768: PetscFunctionReturn(PETSC_SUCCESS);
1769: }
1771: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1772: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1773: {
1774: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1775: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1776: const PetscScalar *x, *xb;
1777: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1778: const MatScalar *v;
1779: const PetscInt *ii, *ij = a->j, *idx;
1780: PetscInt mbs, i, j, n, *ridx = NULL;
1781: PetscBool usecprow = a->compressedrow.use;
1783: PetscFunctionBegin;
1784: PetscCall(VecGetArrayRead(xx, &x));
1785: PetscCall(VecGetArrayWrite(zz, &zarray));
1787: v = a->a;
1788: if (usecprow) {
1789: mbs = a->compressedrow.nrows;
1790: ii = a->compressedrow.i;
1791: ridx = a->compressedrow.rindex;
1792: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1793: } else {
1794: mbs = a->mbs;
1795: ii = a->i;
1796: z = zarray;
1797: }
1799: for (i = 0; i < mbs; i++) {
1800: n = ii[i + 1] - ii[i];
1801: idx = ij + ii[i];
1802: sum1 = 0.0;
1803: sum2 = 0.0;
1804: sum3 = 0.0;
1805: sum4 = 0.0;
1806: sum5 = 0.0;
1807: sum6 = 0.0;
1808: sum7 = 0.0;
1809: sum8 = 0.0;
1810: sum9 = 0.0;
1811: sum10 = 0.0;
1812: sum11 = 0.0;
1813: sum12 = 0.0;
1814: sum13 = 0.0;
1815: sum14 = 0.0;
1816: sum15 = 0.0;
1818: for (j = 0; j < n; j++) {
1819: xb = x + 15 * (idx[j]);
1820: x1 = xb[0];
1821: x2 = xb[1];
1822: x3 = xb[2];
1823: x4 = xb[3];
1824: x5 = xb[4];
1825: x6 = xb[5];
1826: x7 = xb[6];
1827: x8 = xb[7];
1829: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1830: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1831: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1832: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1833: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1834: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1835: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1836: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1837: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1838: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1839: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1840: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1841: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1842: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1843: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1844: v += 120;
1846: x1 = xb[8];
1847: x2 = xb[9];
1848: x3 = xb[10];
1849: x4 = xb[11];
1850: x5 = xb[12];
1851: x6 = xb[13];
1852: x7 = xb[14];
1854: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1855: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1856: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1857: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1858: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1859: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1860: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1861: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1862: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1863: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1864: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1865: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1866: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1867: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1868: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1869: v += 105;
1870: }
1871: if (usecprow) z = zarray + 15 * ridx[i];
1872: z[0] = sum1;
1873: z[1] = sum2;
1874: z[2] = sum3;
1875: z[3] = sum4;
1876: z[4] = sum5;
1877: z[5] = sum6;
1878: z[6] = sum7;
1879: z[7] = sum8;
1880: z[8] = sum9;
1881: z[9] = sum10;
1882: z[10] = sum11;
1883: z[11] = sum12;
1884: z[12] = sum13;
1885: z[13] = sum14;
1886: z[14] = sum15;
1888: if (!usecprow) z += 15;
1889: }
1891: PetscCall(VecRestoreArrayRead(xx, &x));
1892: PetscCall(VecRestoreArrayWrite(zz, &zarray));
1893: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1894: PetscFunctionReturn(PETSC_SUCCESS);
1895: }
1897: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1898: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1899: {
1900: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1901: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1902: const PetscScalar *x, *xb;
1903: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1904: const MatScalar *v;
1905: const PetscInt *ii, *ij = a->j, *idx;
1906: PetscInt mbs, i, j, n, *ridx = NULL;
1907: PetscBool usecprow = a->compressedrow.use;
1909: PetscFunctionBegin;
1910: PetscCall(VecGetArrayRead(xx, &x));
1911: PetscCall(VecGetArrayWrite(zz, &zarray));
1913: v = a->a;
1914: if (usecprow) {
1915: mbs = a->compressedrow.nrows;
1916: ii = a->compressedrow.i;
1917: ridx = a->compressedrow.rindex;
1918: PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1919: } else {
1920: mbs = a->mbs;
1921: ii = a->i;
1922: z = zarray;
1923: }
1925: for (i = 0; i < mbs; i++) {
1926: n = ii[i + 1] - ii[i];
1927: idx = ij + ii[i];
1928: sum1 = 0.0;
1929: sum2 = 0.0;
1930: sum3 = 0.0;
1931: sum4 = 0.0;
1932: sum5 = 0.0;
1933: sum6 = 0.0;
1934: sum7 = 0.0;
1935: sum8 = 0.0;
1936: sum9 = 0.0;
1937: sum10 = 0.0;
1938: sum11 = 0.0;
1939: sum12 = 0.0;
1940: sum13 = 0.0;
1941: sum14 = 0.0;
1942: sum15 = 0.0;
1944: for (j = 0; j < n; j++) {
1945: xb = x + 15 * (idx[j]);
1946: x1 = xb[0];
1947: x2 = xb[1];
1948: x3 = xb[2];
1949: x4 = xb[3];
1950: x5 = xb[4];
1951: x6 = xb[5];
1952: x7 = xb[6];
1953: x8 = xb[7];
1954: x9 = xb[8];
1955: x10 = xb[9];
1956: x11 = xb[10];
1957: x12 = xb[11];
1958: x13 = xb[12];
1959: x14 = xb[13];
1960: x15 = xb[14];
1962: sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1963: sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1964: sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1965: sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1966: sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1967: sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1968: sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1969: sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1970: sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1971: sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1972: sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1973: sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1974: sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1975: sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1976: sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1977: v += 225;
1978: }
1979: if (usecprow) z = zarray + 15 * ridx[i];
1980: z[0] = sum1;
1981: z[1] = sum2;
1982: z[2] = sum3;
1983: z[3] = sum4;
1984: z[4] = sum5;
1985: z[5] = sum6;
1986: z[6] = sum7;
1987: z[7] = sum8;
1988: z[8] = sum9;
1989: z[9] = sum10;
1990: z[10] = sum11;
1991: z[11] = sum12;
1992: z[12] = sum13;
1993: z[13] = sum14;
1994: z[14] = sum15;
1996: if (!usecprow) z += 15;
1997: }
1999: PetscCall(VecRestoreArrayRead(xx, &x));
2000: PetscCall(VecRestoreArrayWrite(zz, &zarray));
2001: PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
2002: PetscFunctionReturn(PETSC_SUCCESS);
2003: }
2005: /*
2006: This will not work with MatScalar == float because it calls the BLAS
2007: */
2008: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
2009: {
2010: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2011: PetscScalar *z = NULL, *work, *workt, *zarray;
2012: const PetscScalar *x, *xb;
2013: const MatScalar *v;
2014: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2015: const PetscInt *idx, *ii, *ridx = NULL;
2016: PetscInt ncols, k;
2017: PetscBool usecprow = a->compressedrow.use;
2019: PetscFunctionBegin;
2020: PetscCall(VecGetArrayRead(xx, &x));
2021: PetscCall(VecGetArrayWrite(zz, &zarray));
2023: idx = a->j;
2024: v = a->a;
2025: if (usecprow) {
2026: mbs = a->compressedrow.nrows;
2027: ii = a->compressedrow.i;
2028: ridx = a->compressedrow.rindex;
2029: PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2030: } else {
2031: mbs = a->mbs;
2032: ii = a->i;
2033: z = zarray;
2034: }
2036: if (!a->mult_work) {
2037: k = PetscMax(A->rmap->n, A->cmap->n);
2038: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2039: }
2040: work = a->mult_work;
2041: for (i = 0; i < mbs; i++) {
2042: n = ii[1] - ii[0];
2043: ii++;
2044: ncols = n * bs;
2045: workt = work;
2046: for (j = 0; j < n; j++) {
2047: xb = x + bs * (*idx++);
2048: for (k = 0; k < bs; k++) workt[k] = xb[k];
2049: workt += bs;
2050: }
2051: if (usecprow) z = zarray + bs * ridx[i];
2052: PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2053: v += n * bs2;
2054: if (!usecprow) z += bs;
2055: }
2056: PetscCall(VecRestoreArrayRead(xx, &x));
2057: PetscCall(VecRestoreArrayWrite(zz, &zarray));
2058: PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2059: PetscFunctionReturn(PETSC_SUCCESS);
2060: }
2062: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2063: {
2064: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2065: const PetscScalar *x;
2066: PetscScalar *y, *z, sum;
2067: const MatScalar *v;
2068: PetscInt mbs = a->mbs, i, n, *ridx = NULL;
2069: const PetscInt *idx, *ii;
2070: PetscBool usecprow = a->compressedrow.use;
2072: PetscFunctionBegin;
2073: PetscCall(VecGetArrayRead(xx, &x));
2074: PetscCall(VecGetArrayPair(yy, zz, &y, &z));
2076: idx = a->j;
2077: v = a->a;
2078: if (usecprow) {
2079: if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2080: mbs = a->compressedrow.nrows;
2081: ii = a->compressedrow.i;
2082: ridx = a->compressedrow.rindex;
2083: } else {
2084: ii = a->i;
2085: }
2087: for (i = 0; i < mbs; i++) {
2088: n = ii[1] - ii[0];
2089: ii++;
2090: if (!usecprow) {
2091: sum = y[i];
2092: } else {
2093: sum = y[ridx[i]];
2094: }
2095: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2096: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2097: PetscSparseDensePlusDot(sum, x, v, idx, n);
2098: v += n;
2099: idx += n;
2100: if (usecprow) {
2101: z[ridx[i]] = sum;
2102: } else {
2103: z[i] = sum;
2104: }
2105: }
2106: PetscCall(VecRestoreArrayRead(xx, &x));
2107: PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2108: PetscCall(PetscLogFlops(2.0 * a->nz));
2109: PetscFunctionReturn(PETSC_SUCCESS);
2110: }
2112: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2113: {
2114: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2115: PetscScalar *y = NULL, *z = NULL, sum1, sum2;
2116: const PetscScalar *x, *xb;
2117: PetscScalar x1, x2, *yarray, *zarray;
2118: const MatScalar *v;
2119: PetscInt mbs = a->mbs, i, n, j;
2120: const PetscInt *idx, *ii, *ridx = NULL;
2121: PetscBool usecprow = a->compressedrow.use;
2123: PetscFunctionBegin;
2124: PetscCall(VecGetArrayRead(xx, &x));
2125: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2127: idx = a->j;
2128: v = a->a;
2129: if (usecprow) {
2130: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2131: mbs = a->compressedrow.nrows;
2132: ii = a->compressedrow.i;
2133: ridx = a->compressedrow.rindex;
2134: } else {
2135: ii = a->i;
2136: y = yarray;
2137: z = zarray;
2138: }
2140: for (i = 0; i < mbs; i++) {
2141: n = ii[1] - ii[0];
2142: ii++;
2143: if (usecprow) {
2144: z = zarray + 2 * ridx[i];
2145: y = yarray + 2 * ridx[i];
2146: }
2147: sum1 = y[0];
2148: sum2 = y[1];
2149: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2150: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2151: for (j = 0; j < n; j++) {
2152: xb = x + 2 * (*idx++);
2153: x1 = xb[0];
2154: x2 = xb[1];
2156: sum1 += v[0] * x1 + v[2] * x2;
2157: sum2 += v[1] * x1 + v[3] * x2;
2158: v += 4;
2159: }
2160: z[0] = sum1;
2161: z[1] = sum2;
2162: if (!usecprow) {
2163: z += 2;
2164: y += 2;
2165: }
2166: }
2167: PetscCall(VecRestoreArrayRead(xx, &x));
2168: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2169: PetscCall(PetscLogFlops(4.0 * a->nz));
2170: PetscFunctionReturn(PETSC_SUCCESS);
2171: }
2173: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2174: {
2175: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2176: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2177: const PetscScalar *x, *xb;
2178: const MatScalar *v;
2179: PetscInt mbs = a->mbs, i, j, n;
2180: const PetscInt *idx, *ii, *ridx = NULL;
2181: PetscBool usecprow = a->compressedrow.use;
2183: PetscFunctionBegin;
2184: PetscCall(VecGetArrayRead(xx, &x));
2185: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2187: idx = a->j;
2188: v = a->a;
2189: if (usecprow) {
2190: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2191: mbs = a->compressedrow.nrows;
2192: ii = a->compressedrow.i;
2193: ridx = a->compressedrow.rindex;
2194: } else {
2195: ii = a->i;
2196: y = yarray;
2197: z = zarray;
2198: }
2200: for (i = 0; i < mbs; i++) {
2201: n = ii[1] - ii[0];
2202: ii++;
2203: if (usecprow) {
2204: z = zarray + 3 * ridx[i];
2205: y = yarray + 3 * ridx[i];
2206: }
2207: sum1 = y[0];
2208: sum2 = y[1];
2209: sum3 = y[2];
2210: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2211: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2212: for (j = 0; j < n; j++) {
2213: xb = x + 3 * (*idx++);
2214: x1 = xb[0];
2215: x2 = xb[1];
2216: x3 = xb[2];
2217: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2218: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2219: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2220: v += 9;
2221: }
2222: z[0] = sum1;
2223: z[1] = sum2;
2224: z[2] = sum3;
2225: if (!usecprow) {
2226: z += 3;
2227: y += 3;
2228: }
2229: }
2230: PetscCall(VecRestoreArrayRead(xx, &x));
2231: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2232: PetscCall(PetscLogFlops(18.0 * a->nz));
2233: PetscFunctionReturn(PETSC_SUCCESS);
2234: }
2236: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2237: {
2238: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2239: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2240: const PetscScalar *x, *xb;
2241: const MatScalar *v;
2242: PetscInt mbs = a->mbs, i, j, n;
2243: const PetscInt *idx, *ii, *ridx = NULL;
2244: PetscBool usecprow = a->compressedrow.use;
2246: PetscFunctionBegin;
2247: PetscCall(VecGetArrayRead(xx, &x));
2248: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2250: idx = a->j;
2251: v = a->a;
2252: if (usecprow) {
2253: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2254: mbs = a->compressedrow.nrows;
2255: ii = a->compressedrow.i;
2256: ridx = a->compressedrow.rindex;
2257: } else {
2258: ii = a->i;
2259: y = yarray;
2260: z = zarray;
2261: }
2263: for (i = 0; i < mbs; i++) {
2264: n = ii[1] - ii[0];
2265: ii++;
2266: if (usecprow) {
2267: z = zarray + 4 * ridx[i];
2268: y = yarray + 4 * ridx[i];
2269: }
2270: sum1 = y[0];
2271: sum2 = y[1];
2272: sum3 = y[2];
2273: sum4 = y[3];
2274: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2275: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2276: for (j = 0; j < n; j++) {
2277: xb = x + 4 * (*idx++);
2278: x1 = xb[0];
2279: x2 = xb[1];
2280: x3 = xb[2];
2281: x4 = xb[3];
2282: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2283: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2284: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2285: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2286: v += 16;
2287: }
2288: z[0] = sum1;
2289: z[1] = sum2;
2290: z[2] = sum3;
2291: z[3] = sum4;
2292: if (!usecprow) {
2293: z += 4;
2294: y += 4;
2295: }
2296: }
2297: PetscCall(VecRestoreArrayRead(xx, &x));
2298: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2299: PetscCall(PetscLogFlops(32.0 * a->nz));
2300: PetscFunctionReturn(PETSC_SUCCESS);
2301: }
2303: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2304: {
2305: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2306: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2307: const PetscScalar *x, *xb;
2308: PetscScalar *yarray, *zarray;
2309: const MatScalar *v;
2310: PetscInt mbs = a->mbs, i, j, n;
2311: const PetscInt *idx, *ii, *ridx = NULL;
2312: PetscBool usecprow = a->compressedrow.use;
2314: PetscFunctionBegin;
2315: PetscCall(VecGetArrayRead(xx, &x));
2316: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2318: idx = a->j;
2319: v = a->a;
2320: if (usecprow) {
2321: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2322: mbs = a->compressedrow.nrows;
2323: ii = a->compressedrow.i;
2324: ridx = a->compressedrow.rindex;
2325: } else {
2326: ii = a->i;
2327: y = yarray;
2328: z = zarray;
2329: }
2331: for (i = 0; i < mbs; i++) {
2332: n = ii[1] - ii[0];
2333: ii++;
2334: if (usecprow) {
2335: z = zarray + 5 * ridx[i];
2336: y = yarray + 5 * ridx[i];
2337: }
2338: sum1 = y[0];
2339: sum2 = y[1];
2340: sum3 = y[2];
2341: sum4 = y[3];
2342: sum5 = y[4];
2343: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2344: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2345: for (j = 0; j < n; j++) {
2346: xb = x + 5 * (*idx++);
2347: x1 = xb[0];
2348: x2 = xb[1];
2349: x3 = xb[2];
2350: x4 = xb[3];
2351: x5 = xb[4];
2352: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2353: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2354: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2355: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2356: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2357: v += 25;
2358: }
2359: z[0] = sum1;
2360: z[1] = sum2;
2361: z[2] = sum3;
2362: z[3] = sum4;
2363: z[4] = sum5;
2364: if (!usecprow) {
2365: z += 5;
2366: y += 5;
2367: }
2368: }
2369: PetscCall(VecRestoreArrayRead(xx, &x));
2370: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2371: PetscCall(PetscLogFlops(50.0 * a->nz));
2372: PetscFunctionReturn(PETSC_SUCCESS);
2373: }
2375: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2376: {
2377: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2378: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2379: const PetscScalar *x, *xb;
2380: PetscScalar x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2381: const MatScalar *v;
2382: PetscInt mbs = a->mbs, i, j, n;
2383: const PetscInt *idx, *ii, *ridx = NULL;
2384: PetscBool usecprow = a->compressedrow.use;
2386: PetscFunctionBegin;
2387: PetscCall(VecGetArrayRead(xx, &x));
2388: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2390: idx = a->j;
2391: v = a->a;
2392: if (usecprow) {
2393: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2394: mbs = a->compressedrow.nrows;
2395: ii = a->compressedrow.i;
2396: ridx = a->compressedrow.rindex;
2397: } else {
2398: ii = a->i;
2399: y = yarray;
2400: z = zarray;
2401: }
2403: for (i = 0; i < mbs; i++) {
2404: n = ii[1] - ii[0];
2405: ii++;
2406: if (usecprow) {
2407: z = zarray + 6 * ridx[i];
2408: y = yarray + 6 * ridx[i];
2409: }
2410: sum1 = y[0];
2411: sum2 = y[1];
2412: sum3 = y[2];
2413: sum4 = y[3];
2414: sum5 = y[4];
2415: sum6 = y[5];
2416: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2417: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2418: for (j = 0; j < n; j++) {
2419: xb = x + 6 * (*idx++);
2420: x1 = xb[0];
2421: x2 = xb[1];
2422: x3 = xb[2];
2423: x4 = xb[3];
2424: x5 = xb[4];
2425: x6 = xb[5];
2426: sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2427: sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2428: sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2429: sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2430: sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2431: sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2432: v += 36;
2433: }
2434: z[0] = sum1;
2435: z[1] = sum2;
2436: z[2] = sum3;
2437: z[3] = sum4;
2438: z[4] = sum5;
2439: z[5] = sum6;
2440: if (!usecprow) {
2441: z += 6;
2442: y += 6;
2443: }
2444: }
2445: PetscCall(VecRestoreArrayRead(xx, &x));
2446: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2447: PetscCall(PetscLogFlops(72.0 * a->nz));
2448: PetscFunctionReturn(PETSC_SUCCESS);
2449: }
2451: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2452: {
2453: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2454: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2455: const PetscScalar *x, *xb;
2456: PetscScalar x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2457: const MatScalar *v;
2458: PetscInt mbs = a->mbs, i, j, n;
2459: const PetscInt *idx, *ii, *ridx = NULL;
2460: PetscBool usecprow = a->compressedrow.use;
2462: PetscFunctionBegin;
2463: PetscCall(VecGetArrayRead(xx, &x));
2464: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2466: idx = a->j;
2467: v = a->a;
2468: if (usecprow) {
2469: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2470: mbs = a->compressedrow.nrows;
2471: ii = a->compressedrow.i;
2472: ridx = a->compressedrow.rindex;
2473: } else {
2474: ii = a->i;
2475: y = yarray;
2476: z = zarray;
2477: }
2479: for (i = 0; i < mbs; i++) {
2480: n = ii[1] - ii[0];
2481: ii++;
2482: if (usecprow) {
2483: z = zarray + 7 * ridx[i];
2484: y = yarray + 7 * ridx[i];
2485: }
2486: sum1 = y[0];
2487: sum2 = y[1];
2488: sum3 = y[2];
2489: sum4 = y[3];
2490: sum5 = y[4];
2491: sum6 = y[5];
2492: sum7 = y[6];
2493: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2494: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2495: for (j = 0; j < n; j++) {
2496: xb = x + 7 * (*idx++);
2497: x1 = xb[0];
2498: x2 = xb[1];
2499: x3 = xb[2];
2500: x4 = xb[3];
2501: x5 = xb[4];
2502: x6 = xb[5];
2503: x7 = xb[6];
2504: sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2505: sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2506: sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2507: sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2508: sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2509: sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2510: sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2511: v += 49;
2512: }
2513: z[0] = sum1;
2514: z[1] = sum2;
2515: z[2] = sum3;
2516: z[3] = sum4;
2517: z[4] = sum5;
2518: z[5] = sum6;
2519: z[6] = sum7;
2520: if (!usecprow) {
2521: z += 7;
2522: y += 7;
2523: }
2524: }
2525: PetscCall(VecRestoreArrayRead(xx, &x));
2526: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2527: PetscCall(PetscLogFlops(98.0 * a->nz));
2528: PetscFunctionReturn(PETSC_SUCCESS);
2529: }
2531: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2532: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2533: {
2534: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2535: PetscScalar *z = NULL, *work, *workt, *zarray;
2536: const PetscScalar *x, *xb;
2537: const MatScalar *v;
2538: PetscInt mbs, i, j, n;
2539: PetscInt k;
2540: PetscBool usecprow = a->compressedrow.use;
2541: const PetscInt *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;
2543: __m256d a0, a1, a2, a3, a4, a5;
2544: __m256d w0, w1, w2, w3;
2545: __m256d z0, z1, z2;
2546: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);
2548: PetscFunctionBegin;
2549: PetscCall(VecCopy(yy, zz));
2550: PetscCall(VecGetArrayRead(xx, &x));
2551: PetscCall(VecGetArray(zz, &zarray));
2553: idx = a->j;
2554: v = a->a;
2555: if (usecprow) {
2556: mbs = a->compressedrow.nrows;
2557: ii = a->compressedrow.i;
2558: ridx = a->compressedrow.rindex;
2559: } else {
2560: mbs = a->mbs;
2561: ii = a->i;
2562: z = zarray;
2563: }
2565: if (!a->mult_work) {
2566: k = PetscMax(A->rmap->n, A->cmap->n);
2567: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2568: }
2570: work = a->mult_work;
2571: for (i = 0; i < mbs; i++) {
2572: n = ii[1] - ii[0];
2573: ii++;
2574: workt = work;
2575: for (j = 0; j < n; j++) {
2576: xb = x + bs * (*idx++);
2577: for (k = 0; k < bs; k++) workt[k] = xb[k];
2578: workt += bs;
2579: }
2580: if (usecprow) z = zarray + bs * ridx[i];
2582: z0 = _mm256_loadu_pd(&z[0]);
2583: z1 = _mm256_loadu_pd(&z[4]);
2584: z2 = _mm256_set1_pd(z[8]);
2586: for (j = 0; j < n; j++) {
2587: /* first column of a */
2588: w0 = _mm256_set1_pd(work[j * 9]);
2589: a0 = _mm256_loadu_pd(&v[j * 81]);
2590: z0 = _mm256_fmadd_pd(a0, w0, z0);
2591: a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2592: z1 = _mm256_fmadd_pd(a1, w0, z1);
2593: a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2594: z2 = _mm256_fmadd_pd(a2, w0, z2);
2596: /* second column of a */
2597: w1 = _mm256_set1_pd(work[j * 9 + 1]);
2598: a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2599: z0 = _mm256_fmadd_pd(a0, w1, z0);
2600: a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2601: z1 = _mm256_fmadd_pd(a1, w1, z1);
2602: a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2603: z2 = _mm256_fmadd_pd(a2, w1, z2);
2605: /* third column of a */
2606: w2 = _mm256_set1_pd(work[j * 9 + 2]);
2607: a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2608: z0 = _mm256_fmadd_pd(a3, w2, z0);
2609: a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2610: z1 = _mm256_fmadd_pd(a4, w2, z1);
2611: a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2612: z2 = _mm256_fmadd_pd(a5, w2, z2);
2614: /* fourth column of a */
2615: w3 = _mm256_set1_pd(work[j * 9 + 3]);
2616: a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2617: z0 = _mm256_fmadd_pd(a0, w3, z0);
2618: a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2619: z1 = _mm256_fmadd_pd(a1, w3, z1);
2620: a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2621: z2 = _mm256_fmadd_pd(a2, w3, z2);
2623: /* fifth column of a */
2624: w0 = _mm256_set1_pd(work[j * 9 + 4]);
2625: a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2626: z0 = _mm256_fmadd_pd(a3, w0, z0);
2627: a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2628: z1 = _mm256_fmadd_pd(a4, w0, z1);
2629: a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2630: z2 = _mm256_fmadd_pd(a5, w0, z2);
2632: /* sixth column of a */
2633: w1 = _mm256_set1_pd(work[j * 9 + 5]);
2634: a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2635: z0 = _mm256_fmadd_pd(a0, w1, z0);
2636: a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2637: z1 = _mm256_fmadd_pd(a1, w1, z1);
2638: a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2639: z2 = _mm256_fmadd_pd(a2, w1, z2);
2641: /* seventh column of a */
2642: w2 = _mm256_set1_pd(work[j * 9 + 6]);
2643: a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2644: z0 = _mm256_fmadd_pd(a0, w2, z0);
2645: a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2646: z1 = _mm256_fmadd_pd(a1, w2, z1);
2647: a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2648: z2 = _mm256_fmadd_pd(a2, w2, z2);
2650: /* eighth column of a */
2651: w3 = _mm256_set1_pd(work[j * 9 + 7]);
2652: a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2653: z0 = _mm256_fmadd_pd(a3, w3, z0);
2654: a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2655: z1 = _mm256_fmadd_pd(a4, w3, z1);
2656: a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2657: z2 = _mm256_fmadd_pd(a5, w3, z2);
2659: /* ninth column of a */
2660: w0 = _mm256_set1_pd(work[j * 9 + 8]);
2661: a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2662: z0 = _mm256_fmadd_pd(a0, w0, z0);
2663: a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2664: z1 = _mm256_fmadd_pd(a1, w0, z1);
2665: a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2666: z2 = _mm256_fmadd_pd(a2, w0, z2);
2667: }
2669: _mm256_storeu_pd(&z[0], z0);
2670: _mm256_storeu_pd(&z[4], z1);
2671: _mm256_maskstore_pd(&z[8], mask1, z2);
2673: v += n * bs2;
2674: if (!usecprow) z += bs;
2675: }
2676: PetscCall(VecRestoreArrayRead(xx, &x));
2677: PetscCall(VecRestoreArray(zz, &zarray));
2678: PetscCall(PetscLogFlops(162.0 * a->nz));
2679: PetscFunctionReturn(PETSC_SUCCESS);
2680: }
2681: #endif
2683: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2684: {
2685: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2686: PetscScalar *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2687: const PetscScalar *x, *xb;
2688: PetscScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2689: const MatScalar *v;
2690: PetscInt mbs = a->mbs, i, j, n;
2691: const PetscInt *idx, *ii, *ridx = NULL;
2692: PetscBool usecprow = a->compressedrow.use;
2694: PetscFunctionBegin;
2695: PetscCall(VecGetArrayRead(xx, &x));
2696: PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));
2698: idx = a->j;
2699: v = a->a;
2700: if (usecprow) {
2701: if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2702: mbs = a->compressedrow.nrows;
2703: ii = a->compressedrow.i;
2704: ridx = a->compressedrow.rindex;
2705: } else {
2706: ii = a->i;
2707: y = yarray;
2708: z = zarray;
2709: }
2711: for (i = 0; i < mbs; i++) {
2712: n = ii[1] - ii[0];
2713: ii++;
2714: if (usecprow) {
2715: z = zarray + 11 * ridx[i];
2716: y = yarray + 11 * ridx[i];
2717: }
2718: sum1 = y[0];
2719: sum2 = y[1];
2720: sum3 = y[2];
2721: sum4 = y[3];
2722: sum5 = y[4];
2723: sum6 = y[5];
2724: sum7 = y[6];
2725: sum8 = y[7];
2726: sum9 = y[8];
2727: sum10 = y[9];
2728: sum11 = y[10];
2729: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2730: PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2731: for (j = 0; j < n; j++) {
2732: xb = x + 11 * (*idx++);
2733: x1 = xb[0];
2734: x2 = xb[1];
2735: x3 = xb[2];
2736: x4 = xb[3];
2737: x5 = xb[4];
2738: x6 = xb[5];
2739: x7 = xb[6];
2740: x8 = xb[7];
2741: x9 = xb[8];
2742: x10 = xb[9];
2743: x11 = xb[10];
2744: sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2745: sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2746: sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2747: sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2748: sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2749: sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2750: sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2751: sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2752: sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2753: sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2754: sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2755: v += 121;
2756: }
2757: z[0] = sum1;
2758: z[1] = sum2;
2759: z[2] = sum3;
2760: z[3] = sum4;
2761: z[4] = sum5;
2762: z[5] = sum6;
2763: z[6] = sum7;
2764: z[7] = sum8;
2765: z[8] = sum9;
2766: z[9] = sum10;
2767: z[10] = sum11;
2768: if (!usecprow) {
2769: z += 11;
2770: y += 11;
2771: }
2772: }
2773: PetscCall(VecRestoreArrayRead(xx, &x));
2774: PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2775: PetscCall(PetscLogFlops(242.0 * a->nz));
2776: PetscFunctionReturn(PETSC_SUCCESS);
2777: }
2779: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2780: {
2781: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2782: PetscScalar *z = NULL, *work, *workt, *zarray;
2783: const PetscScalar *x, *xb;
2784: const MatScalar *v;
2785: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2786: PetscInt ncols, k;
2787: const PetscInt *ridx = NULL, *idx, *ii;
2788: PetscBool usecprow = a->compressedrow.use;
2790: PetscFunctionBegin;
2791: PetscCall(VecCopy(yy, zz));
2792: PetscCall(VecGetArrayRead(xx, &x));
2793: PetscCall(VecGetArray(zz, &zarray));
2795: idx = a->j;
2796: v = a->a;
2797: if (usecprow) {
2798: mbs = a->compressedrow.nrows;
2799: ii = a->compressedrow.i;
2800: ridx = a->compressedrow.rindex;
2801: } else {
2802: mbs = a->mbs;
2803: ii = a->i;
2804: z = zarray;
2805: }
2807: if (!a->mult_work) {
2808: k = PetscMax(A->rmap->n, A->cmap->n);
2809: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2810: }
2811: work = a->mult_work;
2812: for (i = 0; i < mbs; i++) {
2813: n = ii[1] - ii[0];
2814: ii++;
2815: ncols = n * bs;
2816: workt = work;
2817: for (j = 0; j < n; j++) {
2818: xb = x + bs * (*idx++);
2819: for (k = 0; k < bs; k++) workt[k] = xb[k];
2820: workt += bs;
2821: }
2822: if (usecprow) z = zarray + bs * ridx[i];
2823: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2824: v += n * bs2;
2825: if (!usecprow) z += bs;
2826: }
2827: PetscCall(VecRestoreArrayRead(xx, &x));
2828: PetscCall(VecRestoreArray(zz, &zarray));
2829: PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2830: PetscFunctionReturn(PETSC_SUCCESS);
2831: }
2833: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2834: {
2835: PetscScalar zero = 0.0;
2837: PetscFunctionBegin;
2838: PetscCall(VecSet(zz, zero));
2839: PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2840: PetscFunctionReturn(PETSC_SUCCESS);
2841: }
2843: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2844: {
2845: PetscScalar zero = 0.0;
2847: PetscFunctionBegin;
2848: PetscCall(VecSet(zz, zero));
2849: PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2850: PetscFunctionReturn(PETSC_SUCCESS);
2851: }
2853: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2854: {
2855: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2856: PetscScalar *z, x1, x2, x3, x4, x5;
2857: const PetscScalar *x, *xb = NULL;
2858: const MatScalar *v;
2859: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n;
2860: const PetscInt *idx, *ii, *ib, *ridx = NULL;
2861: Mat_CompressedRow cprow = a->compressedrow;
2862: PetscBool usecprow = cprow.use;
2864: PetscFunctionBegin;
2865: if (yy != zz) PetscCall(VecCopy(yy, zz));
2866: PetscCall(VecGetArrayRead(xx, &x));
2867: PetscCall(VecGetArray(zz, &z));
2869: idx = a->j;
2870: v = a->a;
2871: if (usecprow) {
2872: mbs = cprow.nrows;
2873: ii = cprow.i;
2874: ridx = cprow.rindex;
2875: } else {
2876: mbs = a->mbs;
2877: ii = a->i;
2878: xb = x;
2879: }
2881: switch (bs) {
2882: case 1:
2883: for (i = 0; i < mbs; i++) {
2884: if (usecprow) xb = x + ridx[i];
2885: x1 = xb[0];
2886: ib = idx + ii[0];
2887: n = ii[1] - ii[0];
2888: ii++;
2889: for (j = 0; j < n; j++) {
2890: rval = ib[j];
2891: z[rval] += PetscConj(*v) * x1;
2892: v++;
2893: }
2894: if (!usecprow) xb++;
2895: }
2896: break;
2897: case 2:
2898: for (i = 0; i < mbs; i++) {
2899: if (usecprow) xb = x + 2 * ridx[i];
2900: x1 = xb[0];
2901: x2 = xb[1];
2902: ib = idx + ii[0];
2903: n = ii[1] - ii[0];
2904: ii++;
2905: for (j = 0; j < n; j++) {
2906: rval = ib[j] * 2;
2907: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2908: z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2909: v += 4;
2910: }
2911: if (!usecprow) xb += 2;
2912: }
2913: break;
2914: case 3:
2915: for (i = 0; i < mbs; i++) {
2916: if (usecprow) xb = x + 3 * ridx[i];
2917: x1 = xb[0];
2918: x2 = xb[1];
2919: x3 = xb[2];
2920: ib = idx + ii[0];
2921: n = ii[1] - ii[0];
2922: ii++;
2923: for (j = 0; j < n; j++) {
2924: rval = ib[j] * 3;
2925: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2926: z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2927: z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2928: v += 9;
2929: }
2930: if (!usecprow) xb += 3;
2931: }
2932: break;
2933: case 4:
2934: for (i = 0; i < mbs; i++) {
2935: if (usecprow) xb = x + 4 * ridx[i];
2936: x1 = xb[0];
2937: x2 = xb[1];
2938: x3 = xb[2];
2939: x4 = xb[3];
2940: ib = idx + ii[0];
2941: n = ii[1] - ii[0];
2942: ii++;
2943: for (j = 0; j < n; j++) {
2944: rval = ib[j] * 4;
2945: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2946: z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2947: z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2948: z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2949: v += 16;
2950: }
2951: if (!usecprow) xb += 4;
2952: }
2953: break;
2954: case 5:
2955: for (i = 0; i < mbs; i++) {
2956: if (usecprow) xb = x + 5 * ridx[i];
2957: x1 = xb[0];
2958: x2 = xb[1];
2959: x3 = xb[2];
2960: x4 = xb[3];
2961: x5 = xb[4];
2962: ib = idx + ii[0];
2963: n = ii[1] - ii[0];
2964: ii++;
2965: for (j = 0; j < n; j++) {
2966: rval = ib[j] * 5;
2967: z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2968: z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2969: z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2970: z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2971: z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2972: v += 25;
2973: }
2974: if (!usecprow) xb += 5;
2975: }
2976: break;
2977: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2978: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2979: #if 0
2980: {
2981: PetscInt ncols,k,bs2=a->bs2;
2982: PetscScalar *work,*workt,zb;
2983: const PetscScalar *xtmp;
2984: if (!a->mult_work) {
2985: k = PetscMax(A->rmap->n,A->cmap->n);
2986: PetscCall(PetscMalloc1(k+1,&a->mult_work));
2987: }
2988: work = a->mult_work;
2989: xtmp = x;
2990: for (i=0; i<mbs; i++) {
2991: n = ii[1] - ii[0]; ii++;
2992: ncols = n*bs;
2993: PetscCall(PetscArrayzero(work,ncols));
2994: if (usecprow) xtmp = x + bs*ridx[i];
2995: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2996: v += n*bs2;
2997: if (!usecprow) xtmp += bs;
2998: workt = work;
2999: for (j=0; j<n; j++) {
3000: zb = z + bs*(*idx++);
3001: for (k=0; k<bs; k++) zb[k] += workt[k] ;
3002: workt += bs;
3003: }
3004: }
3005: }
3006: #endif
3007: }
3008: PetscCall(VecRestoreArrayRead(xx, &x));
3009: PetscCall(VecRestoreArray(zz, &z));
3010: PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3011: PetscFunctionReturn(PETSC_SUCCESS);
3012: }
3014: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
3015: {
3016: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3017: PetscScalar *zb, *z, x1, x2, x3, x4, x5;
3018: const PetscScalar *x, *xb = NULL;
3019: const MatScalar *v;
3020: PetscInt mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3021: const PetscInt *idx, *ii, *ib, *ridx = NULL;
3022: Mat_CompressedRow cprow = a->compressedrow;
3023: PetscBool usecprow = cprow.use;
3025: PetscFunctionBegin;
3026: if (yy != zz) PetscCall(VecCopy(yy, zz));
3027: PetscCall(VecGetArrayRead(xx, &x));
3028: PetscCall(VecGetArray(zz, &z));
3030: idx = a->j;
3031: v = a->a;
3032: if (usecprow) {
3033: mbs = cprow.nrows;
3034: ii = cprow.i;
3035: ridx = cprow.rindex;
3036: } else {
3037: mbs = a->mbs;
3038: ii = a->i;
3039: xb = x;
3040: }
3042: switch (bs) {
3043: case 1:
3044: for (i = 0; i < mbs; i++) {
3045: if (usecprow) xb = x + ridx[i];
3046: x1 = xb[0];
3047: ib = idx + ii[0];
3048: n = ii[1] - ii[0];
3049: ii++;
3050: for (j = 0; j < n; j++) {
3051: rval = ib[j];
3052: z[rval] += *v * x1;
3053: v++;
3054: }
3055: if (!usecprow) xb++;
3056: }
3057: break;
3058: case 2:
3059: for (i = 0; i < mbs; i++) {
3060: if (usecprow) xb = x + 2 * ridx[i];
3061: x1 = xb[0];
3062: x2 = xb[1];
3063: ib = idx + ii[0];
3064: n = ii[1] - ii[0];
3065: ii++;
3066: for (j = 0; j < n; j++) {
3067: rval = ib[j] * 2;
3068: z[rval++] += v[0] * x1 + v[1] * x2;
3069: z[rval++] += v[2] * x1 + v[3] * x2;
3070: v += 4;
3071: }
3072: if (!usecprow) xb += 2;
3073: }
3074: break;
3075: case 3:
3076: for (i = 0; i < mbs; i++) {
3077: if (usecprow) xb = x + 3 * ridx[i];
3078: x1 = xb[0];
3079: x2 = xb[1];
3080: x3 = xb[2];
3081: ib = idx + ii[0];
3082: n = ii[1] - ii[0];
3083: ii++;
3084: for (j = 0; j < n; j++) {
3085: rval = ib[j] * 3;
3086: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3087: z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3088: z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3089: v += 9;
3090: }
3091: if (!usecprow) xb += 3;
3092: }
3093: break;
3094: case 4:
3095: for (i = 0; i < mbs; i++) {
3096: if (usecprow) xb = x + 4 * ridx[i];
3097: x1 = xb[0];
3098: x2 = xb[1];
3099: x3 = xb[2];
3100: x4 = xb[3];
3101: ib = idx + ii[0];
3102: n = ii[1] - ii[0];
3103: ii++;
3104: for (j = 0; j < n; j++) {
3105: rval = ib[j] * 4;
3106: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3107: z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3108: z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3109: z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3110: v += 16;
3111: }
3112: if (!usecprow) xb += 4;
3113: }
3114: break;
3115: case 5:
3116: for (i = 0; i < mbs; i++) {
3117: if (usecprow) xb = x + 5 * ridx[i];
3118: x1 = xb[0];
3119: x2 = xb[1];
3120: x3 = xb[2];
3121: x4 = xb[3];
3122: x5 = xb[4];
3123: ib = idx + ii[0];
3124: n = ii[1] - ii[0];
3125: ii++;
3126: for (j = 0; j < n; j++) {
3127: rval = ib[j] * 5;
3128: z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3129: z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3130: z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3131: z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3132: z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3133: v += 25;
3134: }
3135: if (!usecprow) xb += 5;
3136: }
3137: break;
3138: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3139: PetscInt ncols, k;
3140: PetscScalar *work, *workt;
3141: const PetscScalar *xtmp;
3142: if (!a->mult_work) {
3143: k = PetscMax(A->rmap->n, A->cmap->n);
3144: PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3145: }
3146: work = a->mult_work;
3147: xtmp = x;
3148: for (i = 0; i < mbs; i++) {
3149: n = ii[1] - ii[0];
3150: ii++;
3151: ncols = n * bs;
3152: PetscCall(PetscArrayzero(work, ncols));
3153: if (usecprow) xtmp = x + bs * ridx[i];
3154: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3155: v += n * bs2;
3156: if (!usecprow) xtmp += bs;
3157: workt = work;
3158: for (j = 0; j < n; j++) {
3159: zb = z + bs * (*idx++);
3160: for (k = 0; k < bs; k++) zb[k] += workt[k];
3161: workt += bs;
3162: }
3163: }
3164: }
3165: }
3166: PetscCall(VecRestoreArrayRead(xx, &x));
3167: PetscCall(VecRestoreArray(zz, &z));
3168: PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3169: PetscFunctionReturn(PETSC_SUCCESS);
3170: }
3172: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3173: {
3174: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
3175: PetscInt totalnz = a->bs2 * a->nz;
3176: PetscScalar oalpha = alpha;
3177: PetscBLASInt one = 1, tnz;
3179: PetscFunctionBegin;
3180: PetscCall(PetscBLASIntCast(totalnz, &tnz));
3181: PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3182: PetscCall(PetscLogFlops(totalnz));
3183: PetscFunctionReturn(PETSC_SUCCESS);
3184: }
3186: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3187: {
3188: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3189: MatScalar *v = a->a;
3190: PetscReal sum = 0.0;
3191: PetscInt i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;
3193: PetscFunctionBegin;
3194: if (type == NORM_FROBENIUS) {
3195: #if defined(PETSC_USE_REAL___FP16)
3196: PetscBLASInt one = 1, cnt = bs2 * nz;
3197: PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3198: #else
3199: for (i = 0; i < bs2 * nz; i++) {
3200: sum += PetscRealPart(PetscConj(*v) * (*v));
3201: v++;
3202: }
3203: #endif
3204: *norm = PetscSqrtReal(sum);
3205: PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3206: } else if (type == NORM_1) { /* maximum column sum */
3207: PetscReal *tmp;
3208: PetscInt *bcol = a->j;
3209: PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3210: for (i = 0; i < nz; i++) {
3211: for (j = 0; j < bs; j++) {
3212: k1 = bs * (*bcol) + j; /* column index */
3213: for (k = 0; k < bs; k++) {
3214: tmp[k1] += PetscAbsScalar(*v);
3215: v++;
3216: }
3217: }
3218: bcol++;
3219: }
3220: *norm = 0.0;
3221: for (j = 0; j < A->cmap->n; j++) {
3222: if (tmp[j] > *norm) *norm = tmp[j];
3223: }
3224: PetscCall(PetscFree(tmp));
3225: PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3226: } else if (type == NORM_INFINITY) { /* maximum row sum */
3227: *norm = 0.0;
3228: for (k = 0; k < bs; k++) {
3229: for (j = 0; j < a->mbs; j++) {
3230: v = a->a + bs2 * a->i[j] + k;
3231: sum = 0.0;
3232: for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3233: for (k1 = 0; k1 < bs; k1++) {
3234: sum += PetscAbsScalar(*v);
3235: v += bs;
3236: }
3237: }
3238: if (sum > *norm) *norm = sum;
3239: }
3240: }
3241: PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3242: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3243: PetscFunctionReturn(PETSC_SUCCESS);
3244: }
3246: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3247: {
3248: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;
3250: PetscFunctionBegin;
3251: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
3252: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3253: *flg = PETSC_FALSE;
3254: PetscFunctionReturn(PETSC_SUCCESS);
3255: }
3257: /* if the a->i are the same */
3258: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3259: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3261: /* if a->j are the same */
3262: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3263: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
3265: /* if a->a are the same */
3266: PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3267: PetscFunctionReturn(PETSC_SUCCESS);
3268: }
3270: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3271: {
3272: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3273: PetscInt i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3274: PetscScalar *x, zero = 0.0;
3275: MatScalar *aa, *aa_j;
3277: PetscFunctionBegin;
3278: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3279: bs = A->rmap->bs;
3280: aa = a->a;
3281: ai = a->i;
3282: aj = a->j;
3283: ambs = a->mbs;
3284: bs2 = a->bs2;
3286: PetscCall(VecSet(v, zero));
3287: PetscCall(VecGetArray(v, &x));
3288: PetscCall(VecGetLocalSize(v, &n));
3289: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3290: for (i = 0; i < ambs; i++) {
3291: for (j = ai[i]; j < ai[i + 1]; j++) {
3292: if (aj[j] == i) {
3293: row = i * bs;
3294: aa_j = aa + j * bs2;
3295: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3296: break;
3297: }
3298: }
3299: }
3300: PetscCall(VecRestoreArray(v, &x));
3301: PetscFunctionReturn(PETSC_SUCCESS);
3302: }
3304: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3305: {
3306: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3307: const PetscScalar *l, *r, *li, *ri;
3308: PetscScalar x;
3309: MatScalar *aa, *v;
3310: PetscInt i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3311: const PetscInt *ai, *aj;
3313: PetscFunctionBegin;
3314: ai = a->i;
3315: aj = a->j;
3316: aa = a->a;
3317: m = A->rmap->n;
3318: n = A->cmap->n;
3319: bs = A->rmap->bs;
3320: mbs = a->mbs;
3321: bs2 = a->bs2;
3322: if (ll) {
3323: PetscCall(VecGetArrayRead(ll, &l));
3324: PetscCall(VecGetLocalSize(ll, &lm));
3325: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3326: for (i = 0; i < mbs; i++) { /* for each block row */
3327: M = ai[i + 1] - ai[i];
3328: li = l + i * bs;
3329: v = aa + bs2 * ai[i];
3330: for (j = 0; j < M; j++) { /* for each block */
3331: for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3332: }
3333: }
3334: PetscCall(VecRestoreArrayRead(ll, &l));
3335: PetscCall(PetscLogFlops(a->nz));
3336: }
3338: if (rr) {
3339: PetscCall(VecGetArrayRead(rr, &r));
3340: PetscCall(VecGetLocalSize(rr, &rn));
3341: PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3342: for (i = 0; i < mbs; i++) { /* for each block row */
3343: iai = ai[i];
3344: M = ai[i + 1] - iai;
3345: v = aa + bs2 * iai;
3346: for (j = 0; j < M; j++) { /* for each block */
3347: ri = r + bs * aj[iai + j];
3348: for (k = 0; k < bs; k++) {
3349: x = ri[k];
3350: for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3351: v += bs;
3352: }
3353: }
3354: }
3355: PetscCall(VecRestoreArrayRead(rr, &r));
3356: PetscCall(PetscLogFlops(a->nz));
3357: }
3358: PetscFunctionReturn(PETSC_SUCCESS);
3359: }
3361: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3362: {
3363: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3365: PetscFunctionBegin;
3366: info->block_size = a->bs2;
3367: info->nz_allocated = a->bs2 * a->maxnz;
3368: info->nz_used = a->bs2 * a->nz;
3369: info->nz_unneeded = info->nz_allocated - info->nz_used;
3370: info->assemblies = A->num_ass;
3371: info->mallocs = A->info.mallocs;
3372: info->memory = 0; /* REVIEW ME */
3373: if (A->factortype) {
3374: info->fill_ratio_given = A->info.fill_ratio_given;
3375: info->fill_ratio_needed = A->info.fill_ratio_needed;
3376: info->factor_mallocs = A->info.factor_mallocs;
3377: } else {
3378: info->fill_ratio_given = 0;
3379: info->fill_ratio_needed = 0;
3380: info->factor_mallocs = 0;
3381: }
3382: PetscFunctionReturn(PETSC_SUCCESS);
3383: }
3385: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3386: {
3387: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3389: PetscFunctionBegin;
3390: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3391: PetscFunctionReturn(PETSC_SUCCESS);
3392: }
3394: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3395: {
3396: PetscFunctionBegin;
3397: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3398: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3399: PetscFunctionReturn(PETSC_SUCCESS);
3400: }
3402: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3403: {
3404: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3405: PetscScalar *z = NULL, sum1;
3406: const PetscScalar *xb;
3407: PetscScalar x1;
3408: const MatScalar *v, *vv;
3409: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3410: PetscBool usecprow = a->compressedrow.use;
3412: PetscFunctionBegin;
3413: idx = a->j;
3414: v = a->a;
3415: if (usecprow) {
3416: mbs = a->compressedrow.nrows;
3417: ii = a->compressedrow.i;
3418: ridx = a->compressedrow.rindex;
3419: } else {
3420: mbs = a->mbs;
3421: ii = a->i;
3422: z = c;
3423: }
3425: for (i = 0; i < mbs; i++) {
3426: n = ii[1] - ii[0];
3427: ii++;
3428: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3429: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3430: if (usecprow) z = c + ridx[i];
3431: jj = idx;
3432: vv = v;
3433: for (k = 0; k < cn; k++) {
3434: idx = jj;
3435: v = vv;
3436: sum1 = 0.0;
3437: for (j = 0; j < n; j++) {
3438: xb = b + (*idx++);
3439: x1 = xb[0 + k * bm];
3440: sum1 += v[0] * x1;
3441: v += 1;
3442: }
3443: z[0 + k * cm] = sum1;
3444: }
3445: if (!usecprow) z += 1;
3446: }
3447: PetscFunctionReturn(PETSC_SUCCESS);
3448: }
3450: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3451: {
3452: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3453: PetscScalar *z = NULL, sum1, sum2;
3454: const PetscScalar *xb;
3455: PetscScalar x1, x2;
3456: const MatScalar *v, *vv;
3457: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3458: PetscBool usecprow = a->compressedrow.use;
3460: PetscFunctionBegin;
3461: idx = a->j;
3462: v = a->a;
3463: if (usecprow) {
3464: mbs = a->compressedrow.nrows;
3465: ii = a->compressedrow.i;
3466: ridx = a->compressedrow.rindex;
3467: } else {
3468: mbs = a->mbs;
3469: ii = a->i;
3470: z = c;
3471: }
3473: for (i = 0; i < mbs; i++) {
3474: n = ii[1] - ii[0];
3475: ii++;
3476: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3477: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3478: if (usecprow) z = c + 2 * ridx[i];
3479: jj = idx;
3480: vv = v;
3481: for (k = 0; k < cn; k++) {
3482: idx = jj;
3483: v = vv;
3484: sum1 = 0.0;
3485: sum2 = 0.0;
3486: for (j = 0; j < n; j++) {
3487: xb = b + 2 * (*idx++);
3488: x1 = xb[0 + k * bm];
3489: x2 = xb[1 + k * bm];
3490: sum1 += v[0] * x1 + v[2] * x2;
3491: sum2 += v[1] * x1 + v[3] * x2;
3492: v += 4;
3493: }
3494: z[0 + k * cm] = sum1;
3495: z[1 + k * cm] = sum2;
3496: }
3497: if (!usecprow) z += 2;
3498: }
3499: PetscFunctionReturn(PETSC_SUCCESS);
3500: }
3502: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3503: {
3504: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3505: PetscScalar *z = NULL, sum1, sum2, sum3;
3506: const PetscScalar *xb;
3507: PetscScalar x1, x2, x3;
3508: const MatScalar *v, *vv;
3509: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3510: PetscBool usecprow = a->compressedrow.use;
3512: PetscFunctionBegin;
3513: idx = a->j;
3514: v = a->a;
3515: if (usecprow) {
3516: mbs = a->compressedrow.nrows;
3517: ii = a->compressedrow.i;
3518: ridx = a->compressedrow.rindex;
3519: } else {
3520: mbs = a->mbs;
3521: ii = a->i;
3522: z = c;
3523: }
3525: for (i = 0; i < mbs; i++) {
3526: n = ii[1] - ii[0];
3527: ii++;
3528: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3529: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3530: if (usecprow) z = c + 3 * ridx[i];
3531: jj = idx;
3532: vv = v;
3533: for (k = 0; k < cn; k++) {
3534: idx = jj;
3535: v = vv;
3536: sum1 = 0.0;
3537: sum2 = 0.0;
3538: sum3 = 0.0;
3539: for (j = 0; j < n; j++) {
3540: xb = b + 3 * (*idx++);
3541: x1 = xb[0 + k * bm];
3542: x2 = xb[1 + k * bm];
3543: x3 = xb[2 + k * bm];
3544: sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3545: sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3546: sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3547: v += 9;
3548: }
3549: z[0 + k * cm] = sum1;
3550: z[1 + k * cm] = sum2;
3551: z[2 + k * cm] = sum3;
3552: }
3553: if (!usecprow) z += 3;
3554: }
3555: PetscFunctionReturn(PETSC_SUCCESS);
3556: }
3558: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3559: {
3560: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3561: PetscScalar *z = NULL, sum1, sum2, sum3, sum4;
3562: const PetscScalar *xb;
3563: PetscScalar x1, x2, x3, x4;
3564: const MatScalar *v, *vv;
3565: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3566: PetscBool usecprow = a->compressedrow.use;
3568: PetscFunctionBegin;
3569: idx = a->j;
3570: v = a->a;
3571: if (usecprow) {
3572: mbs = a->compressedrow.nrows;
3573: ii = a->compressedrow.i;
3574: ridx = a->compressedrow.rindex;
3575: } else {
3576: mbs = a->mbs;
3577: ii = a->i;
3578: z = c;
3579: }
3581: for (i = 0; i < mbs; i++) {
3582: n = ii[1] - ii[0];
3583: ii++;
3584: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3585: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3586: if (usecprow) z = c + 4 * ridx[i];
3587: jj = idx;
3588: vv = v;
3589: for (k = 0; k < cn; k++) {
3590: idx = jj;
3591: v = vv;
3592: sum1 = 0.0;
3593: sum2 = 0.0;
3594: sum3 = 0.0;
3595: sum4 = 0.0;
3596: for (j = 0; j < n; j++) {
3597: xb = b + 4 * (*idx++);
3598: x1 = xb[0 + k * bm];
3599: x2 = xb[1 + k * bm];
3600: x3 = xb[2 + k * bm];
3601: x4 = xb[3 + k * bm];
3602: sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3603: sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3604: sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3605: sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3606: v += 16;
3607: }
3608: z[0 + k * cm] = sum1;
3609: z[1 + k * cm] = sum2;
3610: z[2 + k * cm] = sum3;
3611: z[3 + k * cm] = sum4;
3612: }
3613: if (!usecprow) z += 4;
3614: }
3615: PetscFunctionReturn(PETSC_SUCCESS);
3616: }
3618: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3619: {
3620: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3621: PetscScalar *z = NULL, sum1, sum2, sum3, sum4, sum5;
3622: const PetscScalar *xb;
3623: PetscScalar x1, x2, x3, x4, x5;
3624: const MatScalar *v, *vv;
3625: PetscInt mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3626: PetscBool usecprow = a->compressedrow.use;
3628: PetscFunctionBegin;
3629: idx = a->j;
3630: v = a->a;
3631: if (usecprow) {
3632: mbs = a->compressedrow.nrows;
3633: ii = a->compressedrow.i;
3634: ridx = a->compressedrow.rindex;
3635: } else {
3636: mbs = a->mbs;
3637: ii = a->i;
3638: z = c;
3639: }
3641: for (i = 0; i < mbs; i++) {
3642: n = ii[1] - ii[0];
3643: ii++;
3644: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3645: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3646: if (usecprow) z = c + 5 * ridx[i];
3647: jj = idx;
3648: vv = v;
3649: for (k = 0; k < cn; k++) {
3650: idx = jj;
3651: v = vv;
3652: sum1 = 0.0;
3653: sum2 = 0.0;
3654: sum3 = 0.0;
3655: sum4 = 0.0;
3656: sum5 = 0.0;
3657: for (j = 0; j < n; j++) {
3658: xb = b + 5 * (*idx++);
3659: x1 = xb[0 + k * bm];
3660: x2 = xb[1 + k * bm];
3661: x3 = xb[2 + k * bm];
3662: x4 = xb[3 + k * bm];
3663: x5 = xb[4 + k * bm];
3664: sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3665: sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3666: sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3667: sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3668: sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3669: v += 25;
3670: }
3671: z[0 + k * cm] = sum1;
3672: z[1 + k * cm] = sum2;
3673: z[2 + k * cm] = sum3;
3674: z[3 + k * cm] = sum4;
3675: z[4 + k * cm] = sum5;
3676: }
3677: if (!usecprow) z += 5;
3678: }
3679: PetscFunctionReturn(PETSC_SUCCESS);
3680: }
3682: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3683: {
3684: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3685: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
3686: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
3687: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3688: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3689: PetscBLASInt bbs, bcn, bbm, bcm;
3690: PetscScalar *z = NULL;
3691: PetscScalar *c, *b;
3692: const MatScalar *v;
3693: const PetscInt *idx, *ii, *ridx = NULL;
3694: PetscScalar _DZero = 0.0, _DOne = 1.0;
3695: PetscBool usecprow = a->compressedrow.use;
3697: PetscFunctionBegin;
3698: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3699: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3700: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3701: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3702: b = bd->v;
3703: if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3704: PetscCall(MatDenseGetArray(C, &c));
3705: switch (bs) {
3706: case 1:
3707: PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3708: break;
3709: case 2:
3710: PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3711: break;
3712: case 3:
3713: PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3714: break;
3715: case 4:
3716: PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3717: break;
3718: case 5:
3719: PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3720: break;
3721: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3722: PetscCall(PetscBLASIntCast(bs, &bbs));
3723: PetscCall(PetscBLASIntCast(cn, &bcn));
3724: PetscCall(PetscBLASIntCast(bm, &bbm));
3725: PetscCall(PetscBLASIntCast(cm, &bcm));
3726: idx = a->j;
3727: v = a->a;
3728: if (usecprow) {
3729: mbs = a->compressedrow.nrows;
3730: ii = a->compressedrow.i;
3731: ridx = a->compressedrow.rindex;
3732: } else {
3733: mbs = a->mbs;
3734: ii = a->i;
3735: z = c;
3736: }
3737: for (i = 0; i < mbs; i++) {
3738: n = ii[1] - ii[0];
3739: ii++;
3740: if (usecprow) z = c + bs * ridx[i];
3741: if (n) {
3742: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3743: v += bs2;
3744: }
3745: for (j = 1; j < n; j++) {
3746: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3747: v += bs2;
3748: }
3749: if (!usecprow) z += bs;
3750: }
3751: }
3752: PetscCall(MatDenseRestoreArray(C, &c));
3753: PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3754: PetscFunctionReturn(PETSC_SUCCESS);
3755: }