Actual source code: baij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>

  7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  8:   #include <immintrin.h>
  9: #endif

 11: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 12: {
 13:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
 14:   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
 15:   const PetscInt *idx;
 16:   PetscInt        start, end, *ai, *aj, bs;
 17:   PetscBT         table;

 19:   PetscFunctionBegin;
 20:   m  = a->mbs;
 21:   ai = a->i;
 22:   aj = a->j;
 23:   bs = A->rmap->bs;

 25:   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");

 27:   PetscCall(PetscBTCreate(m, &table));
 28:   PetscCall(PetscMalloc1(m + 1, &nidx));

 30:   for (i = 0; i < is_max; i++) {
 31:     /* Initialise the two local arrays */
 32:     isz = 0;
 33:     PetscCall(PetscBTMemzero(m, table));

 35:     /* Extract the indices, assume there can be duplicate entries */
 36:     PetscCall(ISGetIndices(is[i], &idx));
 37:     PetscCall(ISGetLocalSize(is[i], &n));

 39:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 40:     for (j = 0; j < n; ++j) {
 41:       ival = idx[j] / bs; /* convert the indices into block indices */
 42:       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
 43:       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
 44:     }
 45:     PetscCall(ISRestoreIndices(is[i], &idx));
 46:     PetscCall(ISDestroy(&is[i]));

 48:     k = 0;
 49:     for (j = 0; j < ov; j++) { /* for each overlap*/
 50:       n = isz;
 51:       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
 52:         row   = nidx[k];
 53:         start = ai[row];
 54:         end   = ai[row + 1];
 55:         for (l = start; l < end; l++) {
 56:           val = aj[l];
 57:           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
 58:         }
 59:       }
 60:     }
 61:     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
 62:   }
 63:   PetscCall(PetscBTDestroy(&table));
 64:   PetscCall(PetscFree(nidx));
 65:   PetscFunctionReturn(PETSC_SUCCESS);
 66: }

 68: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
 69: {
 70:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
 71:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
 72:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
 73:   const PetscInt *irow, *icol;
 74:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
 75:   PetscInt       *aj = a->j, *ai = a->i;
 76:   MatScalar      *mat_a;
 77:   Mat             C;
 78:   PetscBool       flag;

 80:   PetscFunctionBegin;
 81:   PetscCall(ISGetIndices(isrow, &irow));
 82:   PetscCall(ISGetIndices(iscol, &icol));
 83:   PetscCall(ISGetLocalSize(isrow, &nrows));
 84:   PetscCall(ISGetLocalSize(iscol, &ncols));

 86:   PetscCall(PetscCalloc1(1 + oldcols, &smap));
 87:   ssmap = smap;
 88:   PetscCall(PetscMalloc1(1 + nrows, &lens));
 89:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
 90:   /* determine lens of each row */
 91:   for (i = 0; i < nrows; i++) {
 92:     kstart  = ai[irow[i]];
 93:     kend    = kstart + a->ilen[irow[i]];
 94:     lens[i] = 0;
 95:     for (k = kstart; k < kend; k++) {
 96:       if (ssmap[aj[k]]) lens[i]++;
 97:     }
 98:   }
 99:   /* Create and fill new matrix */
100:   if (scall == MAT_REUSE_MATRIX) {
101:     c = (Mat_SeqBAIJ *)((*B)->data);

103:     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
104:     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
105:     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
106:     PetscCall(PetscArrayzero(c->ilen, c->mbs));
107:     C = *B;
108:   } else {
109:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
110:     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
111:     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
112:     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
113:   }
114:   c = (Mat_SeqBAIJ *)(C->data);
115:   for (i = 0; i < nrows; i++) {
116:     row      = irow[i];
117:     kstart   = ai[row];
118:     kend     = kstart + a->ilen[row];
119:     mat_i    = c->i[i];
120:     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
121:     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
122:     mat_ilen = c->ilen + i;
123:     for (k = kstart; k < kend; k++) {
124:       if ((tcol = ssmap[a->j[k]])) {
125:         *mat_j++ = tcol - 1;
126:         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
127:         mat_a += bs2;
128:         (*mat_ilen)++;
129:       }
130:     }
131:   }
132:   /* sort */
133:   if (c->j && c->a) {
134:     MatScalar *work;
135:     PetscCall(PetscMalloc1(bs2, &work));
136:     for (i = 0; i < nrows; i++) {
137:       PetscInt ilen;
138:       mat_i = c->i[i];
139:       mat_j = c->j + mat_i;
140:       mat_a = c->a + mat_i * bs2;
141:       ilen  = c->ilen[i];
142:       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
143:     }
144:     PetscCall(PetscFree(work));
145:   }

147:   /* Free work space */
148:   PetscCall(ISRestoreIndices(iscol, &icol));
149:   PetscCall(PetscFree(smap));
150:   PetscCall(PetscFree(lens));
151:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
152:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

154:   PetscCall(ISRestoreIndices(isrow, &irow));
155:   *B = C;
156:   PetscFunctionReturn(PETSC_SUCCESS);
157: }

159: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
160: {
161:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
162:   IS              is1, is2;
163:   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs, j;
164:   const PetscInt *irow, *icol;

166:   PetscFunctionBegin;
167:   PetscCall(ISGetIndices(isrow, &irow));
168:   PetscCall(ISGetIndices(iscol, &icol));
169:   PetscCall(ISGetLocalSize(isrow, &nrows));
170:   PetscCall(ISGetLocalSize(iscol, &ncols));

172:   /* Verify if the indices correspond to each element in a block
173:    and form the IS with compressed IS */
174:   maxmnbs = PetscMax(a->mbs, a->nbs);
175:   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
176:   PetscCall(PetscArrayzero(vary, a->mbs));
177:   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
178:   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
179:   count = 0;
180:   for (i = 0; i < nrows; i++) {
181:     j = irow[i] / bs;
182:     if ((vary[j]--) == bs) iary[count++] = j;
183:   }
184:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));

186:   PetscCall(PetscArrayzero(vary, a->nbs));
187:   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
188:   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
189:   count = 0;
190:   for (i = 0; i < ncols; i++) {
191:     j = icol[i] / bs;
192:     if ((vary[j]--) == bs) iary[count++] = j;
193:   }
194:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
195:   PetscCall(ISRestoreIndices(isrow, &irow));
196:   PetscCall(ISRestoreIndices(iscol, &icol));
197:   PetscCall(PetscFree2(vary, iary));

199:   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
200:   PetscCall(ISDestroy(&is1));
201:   PetscCall(ISDestroy(&is2));
202:   PetscFunctionReturn(PETSC_SUCCESS);
203: }

205: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
206: {
207:   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
208:   Mat_SubSppt *submatj = c->submatis1;

210:   PetscFunctionBegin;
211:   PetscCall((*submatj->destroy)(C));
212:   PetscCall(MatDestroySubMatrix_Private(submatj));
213:   PetscFunctionReturn(PETSC_SUCCESS);
214: }

216: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
217: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
218: {
219:   PetscInt     i;
220:   Mat          C;
221:   Mat_SeqBAIJ *c;
222:   Mat_SubSppt *submatj;

224:   PetscFunctionBegin;
225:   for (i = 0; i < n; i++) {
226:     C       = (*mat)[i];
227:     c       = (Mat_SeqBAIJ *)C->data;
228:     submatj = c->submatis1;
229:     if (submatj) {
230:       if (--((PetscObject)C)->refct <= 0) {
231:         PetscCall(PetscFree(C->factorprefix));
232:         PetscCall((*submatj->destroy)(C));
233:         PetscCall(MatDestroySubMatrix_Private(submatj));
234:         PetscCall(PetscFree(C->defaultvectype));
235:         PetscCall(PetscFree(C->defaultrandtype));
236:         PetscCall(PetscLayoutDestroy(&C->rmap));
237:         PetscCall(PetscLayoutDestroy(&C->cmap));
238:         PetscCall(PetscHeaderDestroy(&C));
239:       }
240:     } else {
241:       PetscCall(MatDestroy(&C));
242:     }
243:   }

245:   /* Destroy Dummy submatrices created for reuse */
246:   PetscCall(MatDestroySubMatrices_Dummy(n, mat));

248:   PetscCall(PetscFree(*mat));
249:   PetscFunctionReturn(PETSC_SUCCESS);
250: }

252: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
253: {
254:   PetscInt i;

256:   PetscFunctionBegin;
257:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));

259:   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
260:   PetscFunctionReturn(PETSC_SUCCESS);
261: }

263: /* Should check that shapes of vectors and matrices match */
264: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
265: {
266:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
267:   PetscScalar       *z, sum;
268:   const PetscScalar *x;
269:   const MatScalar   *v;
270:   PetscInt           mbs, i, n;
271:   const PetscInt    *idx, *ii, *ridx = NULL;
272:   PetscBool          usecprow = a->compressedrow.use;

274:   PetscFunctionBegin;
275:   PetscCall(VecGetArrayRead(xx, &x));
276:   PetscCall(VecGetArrayWrite(zz, &z));

278:   if (usecprow) {
279:     mbs  = a->compressedrow.nrows;
280:     ii   = a->compressedrow.i;
281:     ridx = a->compressedrow.rindex;
282:     PetscCall(PetscArrayzero(z, a->mbs));
283:   } else {
284:     mbs = a->mbs;
285:     ii  = a->i;
286:   }

288:   for (i = 0; i < mbs; i++) {
289:     n   = ii[1] - ii[0];
290:     v   = a->a + ii[0];
291:     idx = a->j + ii[0];
292:     ii++;
293:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
294:     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
295:     sum = 0.0;
296:     PetscSparseDensePlusDot(sum, x, v, idx, n);
297:     if (usecprow) {
298:       z[ridx[i]] = sum;
299:     } else {
300:       z[i] = sum;
301:     }
302:   }
303:   PetscCall(VecRestoreArrayRead(xx, &x));
304:   PetscCall(VecRestoreArrayWrite(zz, &z));
305:   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
306:   PetscFunctionReturn(PETSC_SUCCESS);
307: }

309: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
310: {
311:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
312:   PetscScalar       *z = NULL, sum1, sum2, *zarray;
313:   const PetscScalar *x, *xb;
314:   PetscScalar        x1, x2;
315:   const MatScalar   *v;
316:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
317:   PetscBool          usecprow = a->compressedrow.use;

319:   PetscFunctionBegin;
320:   PetscCall(VecGetArrayRead(xx, &x));
321:   PetscCall(VecGetArrayWrite(zz, &zarray));

323:   idx = a->j;
324:   v   = a->a;
325:   if (usecprow) {
326:     mbs  = a->compressedrow.nrows;
327:     ii   = a->compressedrow.i;
328:     ridx = a->compressedrow.rindex;
329:     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
330:   } else {
331:     mbs = a->mbs;
332:     ii  = a->i;
333:     z   = zarray;
334:   }

336:   for (i = 0; i < mbs; i++) {
337:     n = ii[1] - ii[0];
338:     ii++;
339:     sum1 = 0.0;
340:     sum2 = 0.0;
341:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
342:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
343:     for (j = 0; j < n; j++) {
344:       xb = x + 2 * (*idx++);
345:       x1 = xb[0];
346:       x2 = xb[1];
347:       sum1 += v[0] * x1 + v[2] * x2;
348:       sum2 += v[1] * x1 + v[3] * x2;
349:       v += 4;
350:     }
351:     if (usecprow) z = zarray + 2 * ridx[i];
352:     z[0] = sum1;
353:     z[1] = sum2;
354:     if (!usecprow) z += 2;
355:   }
356:   PetscCall(VecRestoreArrayRead(xx, &x));
357:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
358:   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
359:   PetscFunctionReturn(PETSC_SUCCESS);
360: }

362: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
363: {
364:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
365:   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
366:   const PetscScalar *x, *xb;
367:   const MatScalar   *v;
368:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
369:   PetscBool          usecprow = a->compressedrow.use;

371: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
372:   #pragma disjoint(*v, *z, *xb)
373: #endif

375:   PetscFunctionBegin;
376:   PetscCall(VecGetArrayRead(xx, &x));
377:   PetscCall(VecGetArrayWrite(zz, &zarray));

379:   idx = a->j;
380:   v   = a->a;
381:   if (usecprow) {
382:     mbs  = a->compressedrow.nrows;
383:     ii   = a->compressedrow.i;
384:     ridx = a->compressedrow.rindex;
385:     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
386:   } else {
387:     mbs = a->mbs;
388:     ii  = a->i;
389:     z   = zarray;
390:   }

392:   for (i = 0; i < mbs; i++) {
393:     n = ii[1] - ii[0];
394:     ii++;
395:     sum1 = 0.0;
396:     sum2 = 0.0;
397:     sum3 = 0.0;
398:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
399:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
400:     for (j = 0; j < n; j++) {
401:       xb = x + 3 * (*idx++);
402:       x1 = xb[0];
403:       x2 = xb[1];
404:       x3 = xb[2];

406:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
407:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
408:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
409:       v += 9;
410:     }
411:     if (usecprow) z = zarray + 3 * ridx[i];
412:     z[0] = sum1;
413:     z[1] = sum2;
414:     z[2] = sum3;
415:     if (!usecprow) z += 3;
416:   }
417:   PetscCall(VecRestoreArrayRead(xx, &x));
418:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
419:   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
420:   PetscFunctionReturn(PETSC_SUCCESS);
421: }

423: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
424: {
425:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
426:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
427:   const PetscScalar *x, *xb;
428:   const MatScalar   *v;
429:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
430:   PetscBool          usecprow = a->compressedrow.use;

432:   PetscFunctionBegin;
433:   PetscCall(VecGetArrayRead(xx, &x));
434:   PetscCall(VecGetArrayWrite(zz, &zarray));

436:   idx = a->j;
437:   v   = a->a;
438:   if (usecprow) {
439:     mbs  = a->compressedrow.nrows;
440:     ii   = a->compressedrow.i;
441:     ridx = a->compressedrow.rindex;
442:     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
443:   } else {
444:     mbs = a->mbs;
445:     ii  = a->i;
446:     z   = zarray;
447:   }

449:   for (i = 0; i < mbs; i++) {
450:     n = ii[1] - ii[0];
451:     ii++;
452:     sum1 = 0.0;
453:     sum2 = 0.0;
454:     sum3 = 0.0;
455:     sum4 = 0.0;

457:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
458:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
459:     for (j = 0; j < n; j++) {
460:       xb = x + 4 * (*idx++);
461:       x1 = xb[0];
462:       x2 = xb[1];
463:       x3 = xb[2];
464:       x4 = xb[3];
465:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
466:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
467:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
468:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
469:       v += 16;
470:     }
471:     if (usecprow) z = zarray + 4 * ridx[i];
472:     z[0] = sum1;
473:     z[1] = sum2;
474:     z[2] = sum3;
475:     z[3] = sum4;
476:     if (!usecprow) z += 4;
477:   }
478:   PetscCall(VecRestoreArrayRead(xx, &x));
479:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
480:   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
481:   PetscFunctionReturn(PETSC_SUCCESS);
482: }

484: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
485: {
486:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
487:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
488:   const PetscScalar *xb, *x;
489:   const MatScalar   *v;
490:   const PetscInt    *idx, *ii, *ridx = NULL;
491:   PetscInt           mbs, i, j, n;
492:   PetscBool          usecprow = a->compressedrow.use;

494:   PetscFunctionBegin;
495:   PetscCall(VecGetArrayRead(xx, &x));
496:   PetscCall(VecGetArrayWrite(zz, &zarray));

498:   idx = a->j;
499:   v   = a->a;
500:   if (usecprow) {
501:     mbs  = a->compressedrow.nrows;
502:     ii   = a->compressedrow.i;
503:     ridx = a->compressedrow.rindex;
504:     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
505:   } else {
506:     mbs = a->mbs;
507:     ii  = a->i;
508:     z   = zarray;
509:   }

511:   for (i = 0; i < mbs; i++) {
512:     n = ii[1] - ii[0];
513:     ii++;
514:     sum1 = 0.0;
515:     sum2 = 0.0;
516:     sum3 = 0.0;
517:     sum4 = 0.0;
518:     sum5 = 0.0;
519:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
520:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
521:     for (j = 0; j < n; j++) {
522:       xb = x + 5 * (*idx++);
523:       x1 = xb[0];
524:       x2 = xb[1];
525:       x3 = xb[2];
526:       x4 = xb[3];
527:       x5 = xb[4];
528:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
529:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
530:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
531:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
532:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
533:       v += 25;
534:     }
535:     if (usecprow) z = zarray + 5 * ridx[i];
536:     z[0] = sum1;
537:     z[1] = sum2;
538:     z[2] = sum3;
539:     z[3] = sum4;
540:     z[4] = sum5;
541:     if (!usecprow) z += 5;
542:   }
543:   PetscCall(VecRestoreArrayRead(xx, &x));
544:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
545:   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
546:   PetscFunctionReturn(PETSC_SUCCESS);
547: }

549: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
550: {
551:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
552:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
553:   const PetscScalar *x, *xb;
554:   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
555:   const MatScalar   *v;
556:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
557:   PetscBool          usecprow = a->compressedrow.use;

559:   PetscFunctionBegin;
560:   PetscCall(VecGetArrayRead(xx, &x));
561:   PetscCall(VecGetArrayWrite(zz, &zarray));

563:   idx = a->j;
564:   v   = a->a;
565:   if (usecprow) {
566:     mbs  = a->compressedrow.nrows;
567:     ii   = a->compressedrow.i;
568:     ridx = a->compressedrow.rindex;
569:     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
570:   } else {
571:     mbs = a->mbs;
572:     ii  = a->i;
573:     z   = zarray;
574:   }

576:   for (i = 0; i < mbs; i++) {
577:     n = ii[1] - ii[0];
578:     ii++;
579:     sum1 = 0.0;
580:     sum2 = 0.0;
581:     sum3 = 0.0;
582:     sum4 = 0.0;
583:     sum5 = 0.0;
584:     sum6 = 0.0;

586:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
587:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
588:     for (j = 0; j < n; j++) {
589:       xb = x + 6 * (*idx++);
590:       x1 = xb[0];
591:       x2 = xb[1];
592:       x3 = xb[2];
593:       x4 = xb[3];
594:       x5 = xb[4];
595:       x6 = xb[5];
596:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
597:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
598:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
599:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
600:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
601:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
602:       v += 36;
603:     }
604:     if (usecprow) z = zarray + 6 * ridx[i];
605:     z[0] = sum1;
606:     z[1] = sum2;
607:     z[2] = sum3;
608:     z[3] = sum4;
609:     z[4] = sum5;
610:     z[5] = sum6;
611:     if (!usecprow) z += 6;
612:   }

614:   PetscCall(VecRestoreArrayRead(xx, &x));
615:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
616:   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
617:   PetscFunctionReturn(PETSC_SUCCESS);
618: }

620: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
621: {
622:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
623:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
624:   const PetscScalar *x, *xb;
625:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
626:   const MatScalar   *v;
627:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
628:   PetscBool          usecprow = a->compressedrow.use;

630:   PetscFunctionBegin;
631:   PetscCall(VecGetArrayRead(xx, &x));
632:   PetscCall(VecGetArrayWrite(zz, &zarray));

634:   idx = a->j;
635:   v   = a->a;
636:   if (usecprow) {
637:     mbs  = a->compressedrow.nrows;
638:     ii   = a->compressedrow.i;
639:     ridx = a->compressedrow.rindex;
640:     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
641:   } else {
642:     mbs = a->mbs;
643:     ii  = a->i;
644:     z   = zarray;
645:   }

647:   for (i = 0; i < mbs; i++) {
648:     n = ii[1] - ii[0];
649:     ii++;
650:     sum1 = 0.0;
651:     sum2 = 0.0;
652:     sum3 = 0.0;
653:     sum4 = 0.0;
654:     sum5 = 0.0;
655:     sum6 = 0.0;
656:     sum7 = 0.0;

658:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
659:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
660:     for (j = 0; j < n; j++) {
661:       xb = x + 7 * (*idx++);
662:       x1 = xb[0];
663:       x2 = xb[1];
664:       x3 = xb[2];
665:       x4 = xb[3];
666:       x5 = xb[4];
667:       x6 = xb[5];
668:       x7 = xb[6];
669:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
670:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
671:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
672:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
673:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
674:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
675:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
676:       v += 49;
677:     }
678:     if (usecprow) z = zarray + 7 * ridx[i];
679:     z[0] = sum1;
680:     z[1] = sum2;
681:     z[2] = sum3;
682:     z[3] = sum4;
683:     z[4] = sum5;
684:     z[5] = sum6;
685:     z[6] = sum7;
686:     if (!usecprow) z += 7;
687:   }

689:   PetscCall(VecRestoreArrayRead(xx, &x));
690:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
691:   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
692:   PetscFunctionReturn(PETSC_SUCCESS);
693: }

695: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
696: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
697: {
698:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
699:   PetscScalar       *z = NULL, *work, *workt, *zarray;
700:   const PetscScalar *x, *xb;
701:   const MatScalar   *v;
702:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
703:   const PetscInt    *idx, *ii, *ridx = NULL;
704:   PetscInt           k;
705:   PetscBool          usecprow = a->compressedrow.use;

707:   __m256d a0, a1, a2, a3, a4, a5;
708:   __m256d w0, w1, w2, w3;
709:   __m256d z0, z1, z2;
710:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

712:   PetscFunctionBegin;
713:   PetscCall(VecGetArrayRead(xx, &x));
714:   PetscCall(VecGetArrayWrite(zz, &zarray));

716:   idx = a->j;
717:   v   = a->a;
718:   if (usecprow) {
719:     mbs  = a->compressedrow.nrows;
720:     ii   = a->compressedrow.i;
721:     ridx = a->compressedrow.rindex;
722:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
723:   } else {
724:     mbs = a->mbs;
725:     ii  = a->i;
726:     z   = zarray;
727:   }

729:   if (!a->mult_work) {
730:     k = PetscMax(A->rmap->n, A->cmap->n);
731:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
732:   }

734:   work = a->mult_work;
735:   for (i = 0; i < mbs; i++) {
736:     n = ii[1] - ii[0];
737:     ii++;
738:     workt = work;
739:     for (j = 0; j < n; j++) {
740:       xb = x + bs * (*idx++);
741:       for (k = 0; k < bs; k++) workt[k] = xb[k];
742:       workt += bs;
743:     }
744:     if (usecprow) z = zarray + bs * ridx[i];

746:     z0 = _mm256_setzero_pd();
747:     z1 = _mm256_setzero_pd();
748:     z2 = _mm256_setzero_pd();

750:     for (j = 0; j < n; j++) {
751:       /* first column of a */
752:       w0 = _mm256_set1_pd(work[j * 9]);
753:       a0 = _mm256_loadu_pd(&v[j * 81]);
754:       z0 = _mm256_fmadd_pd(a0, w0, z0);
755:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
756:       z1 = _mm256_fmadd_pd(a1, w0, z1);
757:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
758:       z2 = _mm256_fmadd_pd(a2, w0, z2);

760:       /* second column of a */
761:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
762:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
763:       z0 = _mm256_fmadd_pd(a0, w1, z0);
764:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
765:       z1 = _mm256_fmadd_pd(a1, w1, z1);
766:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
767:       z2 = _mm256_fmadd_pd(a2, w1, z2);

769:       /* third column of a */
770:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
771:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
772:       z0 = _mm256_fmadd_pd(a3, w2, z0);
773:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
774:       z1 = _mm256_fmadd_pd(a4, w2, z1);
775:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
776:       z2 = _mm256_fmadd_pd(a5, w2, z2);

778:       /* fourth column of a */
779:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
780:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
781:       z0 = _mm256_fmadd_pd(a0, w3, z0);
782:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
783:       z1 = _mm256_fmadd_pd(a1, w3, z1);
784:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
785:       z2 = _mm256_fmadd_pd(a2, w3, z2);

787:       /* fifth column of a */
788:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
789:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
790:       z0 = _mm256_fmadd_pd(a3, w0, z0);
791:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
792:       z1 = _mm256_fmadd_pd(a4, w0, z1);
793:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
794:       z2 = _mm256_fmadd_pd(a5, w0, z2);

796:       /* sixth column of a */
797:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
798:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
799:       z0 = _mm256_fmadd_pd(a0, w1, z0);
800:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
801:       z1 = _mm256_fmadd_pd(a1, w1, z1);
802:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
803:       z2 = _mm256_fmadd_pd(a2, w1, z2);

805:       /* seventh column of a */
806:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
807:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
808:       z0 = _mm256_fmadd_pd(a0, w2, z0);
809:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
810:       z1 = _mm256_fmadd_pd(a1, w2, z1);
811:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
812:       z2 = _mm256_fmadd_pd(a2, w2, z2);

814:       /* eighth column of a */
815:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
816:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
817:       z0 = _mm256_fmadd_pd(a3, w3, z0);
818:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
819:       z1 = _mm256_fmadd_pd(a4, w3, z1);
820:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
821:       z2 = _mm256_fmadd_pd(a5, w3, z2);

823:       /* ninth column of a */
824:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
825:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
826:       z0 = _mm256_fmadd_pd(a0, w0, z0);
827:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
828:       z1 = _mm256_fmadd_pd(a1, w0, z1);
829:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
830:       z2 = _mm256_fmadd_pd(a2, w0, z2);
831:     }

833:     _mm256_storeu_pd(&z[0], z0);
834:     _mm256_storeu_pd(&z[4], z1);
835:     _mm256_maskstore_pd(&z[8], mask1, z2);

837:     v += n * bs2;
838:     if (!usecprow) z += bs;
839:   }
840:   PetscCall(VecRestoreArrayRead(xx, &x));
841:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
842:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
843:   PetscFunctionReturn(PETSC_SUCCESS);
844: }
845: #endif

847: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
848: {
849:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
850:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
851:   const PetscScalar *x, *xb;
852:   PetscScalar       *zarray, xv;
853:   const MatScalar   *v;
854:   const PetscInt    *ii, *ij = a->j, *idx;
855:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
856:   PetscBool          usecprow = a->compressedrow.use;

858:   PetscFunctionBegin;
859:   PetscCall(VecGetArrayRead(xx, &x));
860:   PetscCall(VecGetArrayWrite(zz, &zarray));

862:   v = a->a;
863:   if (usecprow) {
864:     mbs  = a->compressedrow.nrows;
865:     ii   = a->compressedrow.i;
866:     ridx = a->compressedrow.rindex;
867:     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
868:   } else {
869:     mbs = a->mbs;
870:     ii  = a->i;
871:     z   = zarray;
872:   }

874:   for (i = 0; i < mbs; i++) {
875:     n     = ii[i + 1] - ii[i];
876:     idx   = ij + ii[i];
877:     sum1  = 0.0;
878:     sum2  = 0.0;
879:     sum3  = 0.0;
880:     sum4  = 0.0;
881:     sum5  = 0.0;
882:     sum6  = 0.0;
883:     sum7  = 0.0;
884:     sum8  = 0.0;
885:     sum9  = 0.0;
886:     sum10 = 0.0;
887:     sum11 = 0.0;

889:     for (j = 0; j < n; j++) {
890:       xb = x + 11 * (idx[j]);

892:       for (k = 0; k < 11; k++) {
893:         xv = xb[k];
894:         sum1 += v[0] * xv;
895:         sum2 += v[1] * xv;
896:         sum3 += v[2] * xv;
897:         sum4 += v[3] * xv;
898:         sum5 += v[4] * xv;
899:         sum6 += v[5] * xv;
900:         sum7 += v[6] * xv;
901:         sum8 += v[7] * xv;
902:         sum9 += v[8] * xv;
903:         sum10 += v[9] * xv;
904:         sum11 += v[10] * xv;
905:         v += 11;
906:       }
907:     }
908:     if (usecprow) z = zarray + 11 * ridx[i];
909:     z[0]  = sum1;
910:     z[1]  = sum2;
911:     z[2]  = sum3;
912:     z[3]  = sum4;
913:     z[4]  = sum5;
914:     z[5]  = sum6;
915:     z[6]  = sum7;
916:     z[7]  = sum8;
917:     z[8]  = sum9;
918:     z[9]  = sum10;
919:     z[10] = sum11;

921:     if (!usecprow) z += 11;
922:   }

924:   PetscCall(VecRestoreArrayRead(xx, &x));
925:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
926:   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
927:   PetscFunctionReturn(PETSC_SUCCESS);
928: }

930: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
931: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
932: {
933:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
934:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
935:   const PetscScalar *x, *xb;
936:   PetscScalar       *zarray, xv;
937:   const MatScalar   *v;
938:   const PetscInt    *ii, *ij = a->j, *idx;
939:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
940:   PetscBool          usecprow = a->compressedrow.use;

942:   PetscFunctionBegin;
943:   PetscCall(VecGetArrayRead(xx, &x));
944:   PetscCall(VecGetArrayWrite(zz, &zarray));

946:   v = a->a;
947:   if (usecprow) {
948:     mbs  = a->compressedrow.nrows;
949:     ii   = a->compressedrow.i;
950:     ridx = a->compressedrow.rindex;
951:     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
952:   } else {
953:     mbs = a->mbs;
954:     ii  = a->i;
955:     z   = zarray;
956:   }

958:   for (i = 0; i < mbs; i++) {
959:     n     = ii[i + 1] - ii[i];
960:     idx   = ij + ii[i];
961:     sum1  = 0.0;
962:     sum2  = 0.0;
963:     sum3  = 0.0;
964:     sum4  = 0.0;
965:     sum5  = 0.0;
966:     sum6  = 0.0;
967:     sum7  = 0.0;
968:     sum8  = 0.0;
969:     sum9  = 0.0;
970:     sum10 = 0.0;
971:     sum11 = 0.0;
972:     sum12 = 0.0;

974:     for (j = 0; j < n; j++) {
975:       xb = x + 12 * (idx[j]);

977:       for (k = 0; k < 12; k++) {
978:         xv = xb[k];
979:         sum1 += v[0] * xv;
980:         sum2 += v[1] * xv;
981:         sum3 += v[2] * xv;
982:         sum4 += v[3] * xv;
983:         sum5 += v[4] * xv;
984:         sum6 += v[5] * xv;
985:         sum7 += v[6] * xv;
986:         sum8 += v[7] * xv;
987:         sum9 += v[8] * xv;
988:         sum10 += v[9] * xv;
989:         sum11 += v[10] * xv;
990:         sum12 += v[11] * xv;
991:         v += 12;
992:       }
993:     }
994:     if (usecprow) z = zarray + 12 * ridx[i];
995:     z[0]  = sum1;
996:     z[1]  = sum2;
997:     z[2]  = sum3;
998:     z[3]  = sum4;
999:     z[4]  = sum5;
1000:     z[5]  = sum6;
1001:     z[6]  = sum7;
1002:     z[7]  = sum8;
1003:     z[8]  = sum9;
1004:     z[9]  = sum10;
1005:     z[10] = sum11;
1006:     z[11] = sum12;
1007:     if (!usecprow) z += 12;
1008:   }
1009:   PetscCall(VecRestoreArrayRead(xx, &x));
1010:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1011:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1012:   PetscFunctionReturn(PETSC_SUCCESS);
1013: }

1015: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
1016: {
1017:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1018:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1019:   const PetscScalar *x, *xb;
1020:   PetscScalar       *zarray, *yarray, xv;
1021:   const MatScalar   *v;
1022:   const PetscInt    *ii, *ij = a->j, *idx;
1023:   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
1024:   PetscBool          usecprow = a->compressedrow.use;

1026:   PetscFunctionBegin;
1027:   PetscCall(VecGetArrayRead(xx, &x));
1028:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

1030:   v = a->a;
1031:   if (usecprow) {
1032:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1033:     mbs  = a->compressedrow.nrows;
1034:     ii   = a->compressedrow.i;
1035:     ridx = a->compressedrow.rindex;
1036:   } else {
1037:     ii = a->i;
1038:     y  = yarray;
1039:     z  = zarray;
1040:   }

1042:   for (i = 0; i < mbs; i++) {
1043:     n   = ii[i + 1] - ii[i];
1044:     idx = ij + ii[i];

1046:     if (usecprow) {
1047:       y = yarray + 12 * ridx[i];
1048:       z = zarray + 12 * ridx[i];
1049:     }
1050:     sum1  = y[0];
1051:     sum2  = y[1];
1052:     sum3  = y[2];
1053:     sum4  = y[3];
1054:     sum5  = y[4];
1055:     sum6  = y[5];
1056:     sum7  = y[6];
1057:     sum8  = y[7];
1058:     sum9  = y[8];
1059:     sum10 = y[9];
1060:     sum11 = y[10];
1061:     sum12 = y[11];

1063:     for (j = 0; j < n; j++) {
1064:       xb = x + 12 * (idx[j]);

1066:       for (k = 0; k < 12; k++) {
1067:         xv = xb[k];
1068:         sum1 += v[0] * xv;
1069:         sum2 += v[1] * xv;
1070:         sum3 += v[2] * xv;
1071:         sum4 += v[3] * xv;
1072:         sum5 += v[4] * xv;
1073:         sum6 += v[5] * xv;
1074:         sum7 += v[6] * xv;
1075:         sum8 += v[7] * xv;
1076:         sum9 += v[8] * xv;
1077:         sum10 += v[9] * xv;
1078:         sum11 += v[10] * xv;
1079:         sum12 += v[11] * xv;
1080:         v += 12;
1081:       }
1082:     }

1084:     z[0]  = sum1;
1085:     z[1]  = sum2;
1086:     z[2]  = sum3;
1087:     z[3]  = sum4;
1088:     z[4]  = sum5;
1089:     z[5]  = sum6;
1090:     z[6]  = sum7;
1091:     z[7]  = sum8;
1092:     z[8]  = sum9;
1093:     z[9]  = sum10;
1094:     z[10] = sum11;
1095:     z[11] = sum12;
1096:     if (!usecprow) {
1097:       y += 12;
1098:       z += 12;
1099:     }
1100:   }
1101:   PetscCall(VecRestoreArrayRead(xx, &x));
1102:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1103:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1104:   PetscFunctionReturn(PETSC_SUCCESS);
1105: }

1107: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1108: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1109: {
1110:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1111:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1112:   const PetscScalar *x, *xb;
1113:   PetscScalar        x1, x2, x3, x4, *zarray;
1114:   const MatScalar   *v;
1115:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1116:   PetscInt           mbs, i, j, n;
1117:   PetscBool          usecprow = a->compressedrow.use;

1119:   PetscFunctionBegin;
1120:   PetscCall(VecGetArrayRead(xx, &x));
1121:   PetscCall(VecGetArrayWrite(zz, &zarray));

1123:   v = a->a;
1124:   if (usecprow) {
1125:     mbs  = a->compressedrow.nrows;
1126:     ii   = a->compressedrow.i;
1127:     ridx = a->compressedrow.rindex;
1128:     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1129:   } else {
1130:     mbs = a->mbs;
1131:     ii  = a->i;
1132:     z   = zarray;
1133:   }

1135:   for (i = 0; i < mbs; i++) {
1136:     n   = ii[i + 1] - ii[i];
1137:     idx = ij + ii[i];

1139:     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1140:     for (j = 0; j < n; j++) {
1141:       xb = x + 12 * (idx[j]);
1142:       x1 = xb[0];
1143:       x2 = xb[1];
1144:       x3 = xb[2];
1145:       x4 = xb[3];

1147:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1148:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1149:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1150:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1151:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1152:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1153:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1154:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1155:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1156:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1157:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1158:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1159:       v += 48;

1161:       x1 = xb[4];
1162:       x2 = xb[5];
1163:       x3 = xb[6];
1164:       x4 = xb[7];

1166:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1167:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1168:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1169:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1170:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1171:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1172:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1173:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1174:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1175:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1176:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1177:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1178:       v += 48;

1180:       x1 = xb[8];
1181:       x2 = xb[9];
1182:       x3 = xb[10];
1183:       x4 = xb[11];
1184:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1185:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1186:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1187:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1188:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1189:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1190:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1191:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1192:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1193:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1194:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1195:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1196:       v += 48;
1197:     }
1198:     if (usecprow) z = zarray + 12 * ridx[i];
1199:     z[0]  = sum1;
1200:     z[1]  = sum2;
1201:     z[2]  = sum3;
1202:     z[3]  = sum4;
1203:     z[4]  = sum5;
1204:     z[5]  = sum6;
1205:     z[6]  = sum7;
1206:     z[7]  = sum8;
1207:     z[8]  = sum9;
1208:     z[9]  = sum10;
1209:     z[10] = sum11;
1210:     z[11] = sum12;
1211:     if (!usecprow) z += 12;
1212:   }
1213:   PetscCall(VecRestoreArrayRead(xx, &x));
1214:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1215:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1216:   PetscFunctionReturn(PETSC_SUCCESS);
1217: }

1219: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1220: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1221: {
1222:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1223:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1224:   const PetscScalar *x, *xb;
1225:   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1226:   const MatScalar   *v;
1227:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1228:   PetscInt           mbs      = a->mbs, i, j, n;
1229:   PetscBool          usecprow = a->compressedrow.use;

1231:   PetscFunctionBegin;
1232:   PetscCall(VecGetArrayRead(xx, &x));
1233:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

1235:   v = a->a;
1236:   if (usecprow) {
1237:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1238:     mbs  = a->compressedrow.nrows;
1239:     ii   = a->compressedrow.i;
1240:     ridx = a->compressedrow.rindex;
1241:   } else {
1242:     ii = a->i;
1243:     y  = yarray;
1244:     z  = zarray;
1245:   }

1247:   for (i = 0; i < mbs; i++) {
1248:     n   = ii[i + 1] - ii[i];
1249:     idx = ij + ii[i];

1251:     if (usecprow) {
1252:       y = yarray + 12 * ridx[i];
1253:       z = zarray + 12 * ridx[i];
1254:     }
1255:     sum1  = y[0];
1256:     sum2  = y[1];
1257:     sum3  = y[2];
1258:     sum4  = y[3];
1259:     sum5  = y[4];
1260:     sum6  = y[5];
1261:     sum7  = y[6];
1262:     sum8  = y[7];
1263:     sum9  = y[8];
1264:     sum10 = y[9];
1265:     sum11 = y[10];
1266:     sum12 = y[11];

1268:     for (j = 0; j < n; j++) {
1269:       xb = x + 12 * (idx[j]);
1270:       x1 = xb[0];
1271:       x2 = xb[1];
1272:       x3 = xb[2];
1273:       x4 = xb[3];

1275:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1276:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1277:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1278:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1279:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1280:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1281:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1282:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1283:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1284:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1285:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1286:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1287:       v += 48;

1289:       x1 = xb[4];
1290:       x2 = xb[5];
1291:       x3 = xb[6];
1292:       x4 = xb[7];

1294:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1295:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1296:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1297:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1298:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1299:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1300:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1301:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1302:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1303:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1304:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1305:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1306:       v += 48;

1308:       x1 = xb[8];
1309:       x2 = xb[9];
1310:       x3 = xb[10];
1311:       x4 = xb[11];
1312:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1313:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1314:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1315:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1316:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1317:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1318:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1319:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1320:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1321:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1322:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1323:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1324:       v += 48;
1325:     }
1326:     z[0]  = sum1;
1327:     z[1]  = sum2;
1328:     z[2]  = sum3;
1329:     z[3]  = sum4;
1330:     z[4]  = sum5;
1331:     z[5]  = sum6;
1332:     z[6]  = sum7;
1333:     z[7]  = sum8;
1334:     z[8]  = sum9;
1335:     z[9]  = sum10;
1336:     z[10] = sum11;
1337:     z[11] = sum12;
1338:     if (!usecprow) {
1339:       y += 12;
1340:       z += 12;
1341:     }
1342:   }
1343:   PetscCall(VecRestoreArrayRead(xx, &x));
1344:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1345:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1346:   PetscFunctionReturn(PETSC_SUCCESS);
1347: }

1349: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1350: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1351: {
1352:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1353:   PetscScalar       *z = NULL, *zarray;
1354:   const PetscScalar *x, *work;
1355:   const MatScalar   *v = a->a;
1356:   PetscInt           mbs, i, j, n;
1357:   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1358:   PetscBool          usecprow = a->compressedrow.use;
1359:   const PetscInt     bs = 12, bs2 = 144;

1361:   __m256d a0, a1, a2, a3, a4, a5;
1362:   __m256d w0, w1, w2, w3;
1363:   __m256d z0, z1, z2;

1365:   PetscFunctionBegin;
1366:   PetscCall(VecGetArrayRead(xx, &x));
1367:   PetscCall(VecGetArrayWrite(zz, &zarray));

1369:   if (usecprow) {
1370:     mbs  = a->compressedrow.nrows;
1371:     ii   = a->compressedrow.i;
1372:     ridx = a->compressedrow.rindex;
1373:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1374:   } else {
1375:     mbs = a->mbs;
1376:     ii  = a->i;
1377:     z   = zarray;
1378:   }

1380:   for (i = 0; i < mbs; i++) {
1381:     z0 = _mm256_setzero_pd();
1382:     z1 = _mm256_setzero_pd();
1383:     z2 = _mm256_setzero_pd();

1385:     n = ii[1] - ii[0];
1386:     ii++;
1387:     for (j = 0; j < n; j++) {
1388:       work = x + bs * (*idx++);

1390:       /* first column of a */
1391:       w0 = _mm256_set1_pd(work[0]);
1392:       a0 = _mm256_loadu_pd(v + 0);
1393:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1394:       a1 = _mm256_loadu_pd(v + 4);
1395:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1396:       a2 = _mm256_loadu_pd(v + 8);
1397:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1399:       /* second column of a */
1400:       w1 = _mm256_set1_pd(work[1]);
1401:       a3 = _mm256_loadu_pd(v + 12);
1402:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1403:       a4 = _mm256_loadu_pd(v + 16);
1404:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1405:       a5 = _mm256_loadu_pd(v + 20);
1406:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1408:       /* third column of a */
1409:       w2 = _mm256_set1_pd(work[2]);
1410:       a0 = _mm256_loadu_pd(v + 24);
1411:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1412:       a1 = _mm256_loadu_pd(v + 28);
1413:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1414:       a2 = _mm256_loadu_pd(v + 32);
1415:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1417:       /* fourth column of a */
1418:       w3 = _mm256_set1_pd(work[3]);
1419:       a3 = _mm256_loadu_pd(v + 36);
1420:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1421:       a4 = _mm256_loadu_pd(v + 40);
1422:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1423:       a5 = _mm256_loadu_pd(v + 44);
1424:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1426:       /* fifth column of a */
1427:       w0 = _mm256_set1_pd(work[4]);
1428:       a0 = _mm256_loadu_pd(v + 48);
1429:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1430:       a1 = _mm256_loadu_pd(v + 52);
1431:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1432:       a2 = _mm256_loadu_pd(v + 56);
1433:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1435:       /* sixth column of a */
1436:       w1 = _mm256_set1_pd(work[5]);
1437:       a3 = _mm256_loadu_pd(v + 60);
1438:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1439:       a4 = _mm256_loadu_pd(v + 64);
1440:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1441:       a5 = _mm256_loadu_pd(v + 68);
1442:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1444:       /* seventh column of a */
1445:       w2 = _mm256_set1_pd(work[6]);
1446:       a0 = _mm256_loadu_pd(v + 72);
1447:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1448:       a1 = _mm256_loadu_pd(v + 76);
1449:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1450:       a2 = _mm256_loadu_pd(v + 80);
1451:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1453:       /* eighth column of a */
1454:       w3 = _mm256_set1_pd(work[7]);
1455:       a3 = _mm256_loadu_pd(v + 84);
1456:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1457:       a4 = _mm256_loadu_pd(v + 88);
1458:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1459:       a5 = _mm256_loadu_pd(v + 92);
1460:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1462:       /* ninth column of a */
1463:       w0 = _mm256_set1_pd(work[8]);
1464:       a0 = _mm256_loadu_pd(v + 96);
1465:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1466:       a1 = _mm256_loadu_pd(v + 100);
1467:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1468:       a2 = _mm256_loadu_pd(v + 104);
1469:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1471:       /* tenth column of a */
1472:       w1 = _mm256_set1_pd(work[9]);
1473:       a3 = _mm256_loadu_pd(v + 108);
1474:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1475:       a4 = _mm256_loadu_pd(v + 112);
1476:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1477:       a5 = _mm256_loadu_pd(v + 116);
1478:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1480:       /* eleventh column of a */
1481:       w2 = _mm256_set1_pd(work[10]);
1482:       a0 = _mm256_loadu_pd(v + 120);
1483:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1484:       a1 = _mm256_loadu_pd(v + 124);
1485:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1486:       a2 = _mm256_loadu_pd(v + 128);
1487:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1489:       /* twelveth column of a */
1490:       w3 = _mm256_set1_pd(work[11]);
1491:       a3 = _mm256_loadu_pd(v + 132);
1492:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1493:       a4 = _mm256_loadu_pd(v + 136);
1494:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1495:       a5 = _mm256_loadu_pd(v + 140);
1496:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1498:       v += bs2;
1499:     }
1500:     if (usecprow) z = zarray + bs * ridx[i];
1501:     _mm256_storeu_pd(&z[0], z0);
1502:     _mm256_storeu_pd(&z[4], z1);
1503:     _mm256_storeu_pd(&z[8], z2);
1504:     if (!usecprow) z += bs;
1505:   }
1506:   PetscCall(VecRestoreArrayRead(xx, &x));
1507:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1508:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1509:   PetscFunctionReturn(PETSC_SUCCESS);
1510: }
1511: #endif

1513: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1514: /* Default MatMult for block size 15 */
1515: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1516: {
1517:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1518:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1519:   const PetscScalar *x, *xb;
1520:   PetscScalar       *zarray, xv;
1521:   const MatScalar   *v;
1522:   const PetscInt    *ii, *ij = a->j, *idx;
1523:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1524:   PetscBool          usecprow = a->compressedrow.use;

1526:   PetscFunctionBegin;
1527:   PetscCall(VecGetArrayRead(xx, &x));
1528:   PetscCall(VecGetArrayWrite(zz, &zarray));

1530:   v = a->a;
1531:   if (usecprow) {
1532:     mbs  = a->compressedrow.nrows;
1533:     ii   = a->compressedrow.i;
1534:     ridx = a->compressedrow.rindex;
1535:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1536:   } else {
1537:     mbs = a->mbs;
1538:     ii  = a->i;
1539:     z   = zarray;
1540:   }

1542:   for (i = 0; i < mbs; i++) {
1543:     n     = ii[i + 1] - ii[i];
1544:     idx   = ij + ii[i];
1545:     sum1  = 0.0;
1546:     sum2  = 0.0;
1547:     sum3  = 0.0;
1548:     sum4  = 0.0;
1549:     sum5  = 0.0;
1550:     sum6  = 0.0;
1551:     sum7  = 0.0;
1552:     sum8  = 0.0;
1553:     sum9  = 0.0;
1554:     sum10 = 0.0;
1555:     sum11 = 0.0;
1556:     sum12 = 0.0;
1557:     sum13 = 0.0;
1558:     sum14 = 0.0;
1559:     sum15 = 0.0;

1561:     for (j = 0; j < n; j++) {
1562:       xb = x + 15 * (idx[j]);

1564:       for (k = 0; k < 15; k++) {
1565:         xv = xb[k];
1566:         sum1 += v[0] * xv;
1567:         sum2 += v[1] * xv;
1568:         sum3 += v[2] * xv;
1569:         sum4 += v[3] * xv;
1570:         sum5 += v[4] * xv;
1571:         sum6 += v[5] * xv;
1572:         sum7 += v[6] * xv;
1573:         sum8 += v[7] * xv;
1574:         sum9 += v[8] * xv;
1575:         sum10 += v[9] * xv;
1576:         sum11 += v[10] * xv;
1577:         sum12 += v[11] * xv;
1578:         sum13 += v[12] * xv;
1579:         sum14 += v[13] * xv;
1580:         sum15 += v[14] * xv;
1581:         v += 15;
1582:       }
1583:     }
1584:     if (usecprow) z = zarray + 15 * ridx[i];
1585:     z[0]  = sum1;
1586:     z[1]  = sum2;
1587:     z[2]  = sum3;
1588:     z[3]  = sum4;
1589:     z[4]  = sum5;
1590:     z[5]  = sum6;
1591:     z[6]  = sum7;
1592:     z[7]  = sum8;
1593:     z[8]  = sum9;
1594:     z[9]  = sum10;
1595:     z[10] = sum11;
1596:     z[11] = sum12;
1597:     z[12] = sum13;
1598:     z[13] = sum14;
1599:     z[14] = sum15;

1601:     if (!usecprow) z += 15;
1602:   }

1604:   PetscCall(VecRestoreArrayRead(xx, &x));
1605:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1606:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1607:   PetscFunctionReturn(PETSC_SUCCESS);
1608: }

1610: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1611: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1612: {
1613:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1614:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1615:   const PetscScalar *x, *xb;
1616:   PetscScalar        x1, x2, x3, x4, *zarray;
1617:   const MatScalar   *v;
1618:   const PetscInt    *ii, *ij = a->j, *idx;
1619:   PetscInt           mbs, i, j, n, *ridx = NULL;
1620:   PetscBool          usecprow = a->compressedrow.use;

1622:   PetscFunctionBegin;
1623:   PetscCall(VecGetArrayRead(xx, &x));
1624:   PetscCall(VecGetArrayWrite(zz, &zarray));

1626:   v = a->a;
1627:   if (usecprow) {
1628:     mbs  = a->compressedrow.nrows;
1629:     ii   = a->compressedrow.i;
1630:     ridx = a->compressedrow.rindex;
1631:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1632:   } else {
1633:     mbs = a->mbs;
1634:     ii  = a->i;
1635:     z   = zarray;
1636:   }

1638:   for (i = 0; i < mbs; i++) {
1639:     n     = ii[i + 1] - ii[i];
1640:     idx   = ij + ii[i];
1641:     sum1  = 0.0;
1642:     sum2  = 0.0;
1643:     sum3  = 0.0;
1644:     sum4  = 0.0;
1645:     sum5  = 0.0;
1646:     sum6  = 0.0;
1647:     sum7  = 0.0;
1648:     sum8  = 0.0;
1649:     sum9  = 0.0;
1650:     sum10 = 0.0;
1651:     sum11 = 0.0;
1652:     sum12 = 0.0;
1653:     sum13 = 0.0;
1654:     sum14 = 0.0;
1655:     sum15 = 0.0;

1657:     for (j = 0; j < n; j++) {
1658:       xb = x + 15 * (idx[j]);
1659:       x1 = xb[0];
1660:       x2 = xb[1];
1661:       x3 = xb[2];
1662:       x4 = xb[3];

1664:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1665:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1666:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1667:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1668:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1669:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1670:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1671:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1672:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1673:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1674:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1675:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1676:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1677:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1678:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;

1680:       v += 60;

1682:       x1 = xb[4];
1683:       x2 = xb[5];
1684:       x3 = xb[6];
1685:       x4 = xb[7];

1687:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1688:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1689:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1690:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1691:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1692:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1693:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1694:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1695:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1696:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1697:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1698:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1699:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1700:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1701:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1702:       v += 60;

1704:       x1 = xb[8];
1705:       x2 = xb[9];
1706:       x3 = xb[10];
1707:       x4 = xb[11];
1708:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1709:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1710:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1711:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1712:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1713:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1714:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1715:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1716:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1717:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1718:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1719:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1720:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1721:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1722:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1723:       v += 60;

1725:       x1 = xb[12];
1726:       x2 = xb[13];
1727:       x3 = xb[14];
1728:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1729:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1730:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1731:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1732:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1733:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1734:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1735:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1736:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1737:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1738:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1739:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1740:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1741:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1742:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1743:       v += 45;
1744:     }
1745:     if (usecprow) z = zarray + 15 * ridx[i];
1746:     z[0]  = sum1;
1747:     z[1]  = sum2;
1748:     z[2]  = sum3;
1749:     z[3]  = sum4;
1750:     z[4]  = sum5;
1751:     z[5]  = sum6;
1752:     z[6]  = sum7;
1753:     z[7]  = sum8;
1754:     z[8]  = sum9;
1755:     z[9]  = sum10;
1756:     z[10] = sum11;
1757:     z[11] = sum12;
1758:     z[12] = sum13;
1759:     z[13] = sum14;
1760:     z[14] = sum15;

1762:     if (!usecprow) z += 15;
1763:   }

1765:   PetscCall(VecRestoreArrayRead(xx, &x));
1766:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1767:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1768:   PetscFunctionReturn(PETSC_SUCCESS);
1769: }

1771: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1772: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1773: {
1774:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1775:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1776:   const PetscScalar *x, *xb;
1777:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1778:   const MatScalar   *v;
1779:   const PetscInt    *ii, *ij = a->j, *idx;
1780:   PetscInt           mbs, i, j, n, *ridx = NULL;
1781:   PetscBool          usecprow = a->compressedrow.use;

1783:   PetscFunctionBegin;
1784:   PetscCall(VecGetArrayRead(xx, &x));
1785:   PetscCall(VecGetArrayWrite(zz, &zarray));

1787:   v = a->a;
1788:   if (usecprow) {
1789:     mbs  = a->compressedrow.nrows;
1790:     ii   = a->compressedrow.i;
1791:     ridx = a->compressedrow.rindex;
1792:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1793:   } else {
1794:     mbs = a->mbs;
1795:     ii  = a->i;
1796:     z   = zarray;
1797:   }

1799:   for (i = 0; i < mbs; i++) {
1800:     n     = ii[i + 1] - ii[i];
1801:     idx   = ij + ii[i];
1802:     sum1  = 0.0;
1803:     sum2  = 0.0;
1804:     sum3  = 0.0;
1805:     sum4  = 0.0;
1806:     sum5  = 0.0;
1807:     sum6  = 0.0;
1808:     sum7  = 0.0;
1809:     sum8  = 0.0;
1810:     sum9  = 0.0;
1811:     sum10 = 0.0;
1812:     sum11 = 0.0;
1813:     sum12 = 0.0;
1814:     sum13 = 0.0;
1815:     sum14 = 0.0;
1816:     sum15 = 0.0;

1818:     for (j = 0; j < n; j++) {
1819:       xb = x + 15 * (idx[j]);
1820:       x1 = xb[0];
1821:       x2 = xb[1];
1822:       x3 = xb[2];
1823:       x4 = xb[3];
1824:       x5 = xb[4];
1825:       x6 = xb[5];
1826:       x7 = xb[6];
1827:       x8 = xb[7];

1829:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1830:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1831:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1832:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1833:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1834:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1835:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1836:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1837:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1838:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1839:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1840:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1841:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1842:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1843:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1844:       v += 120;

1846:       x1 = xb[8];
1847:       x2 = xb[9];
1848:       x3 = xb[10];
1849:       x4 = xb[11];
1850:       x5 = xb[12];
1851:       x6 = xb[13];
1852:       x7 = xb[14];

1854:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1855:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1856:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1857:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1858:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1859:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1860:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1861:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1862:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1863:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1864:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1865:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1866:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1867:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1868:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1869:       v += 105;
1870:     }
1871:     if (usecprow) z = zarray + 15 * ridx[i];
1872:     z[0]  = sum1;
1873:     z[1]  = sum2;
1874:     z[2]  = sum3;
1875:     z[3]  = sum4;
1876:     z[4]  = sum5;
1877:     z[5]  = sum6;
1878:     z[6]  = sum7;
1879:     z[7]  = sum8;
1880:     z[8]  = sum9;
1881:     z[9]  = sum10;
1882:     z[10] = sum11;
1883:     z[11] = sum12;
1884:     z[12] = sum13;
1885:     z[13] = sum14;
1886:     z[14] = sum15;

1888:     if (!usecprow) z += 15;
1889:   }

1891:   PetscCall(VecRestoreArrayRead(xx, &x));
1892:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1893:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1894:   PetscFunctionReturn(PETSC_SUCCESS);
1895: }

1897: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1898: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1899: {
1900:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1901:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1902:   const PetscScalar *x, *xb;
1903:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1904:   const MatScalar   *v;
1905:   const PetscInt    *ii, *ij = a->j, *idx;
1906:   PetscInt           mbs, i, j, n, *ridx = NULL;
1907:   PetscBool          usecprow = a->compressedrow.use;

1909:   PetscFunctionBegin;
1910:   PetscCall(VecGetArrayRead(xx, &x));
1911:   PetscCall(VecGetArrayWrite(zz, &zarray));

1913:   v = a->a;
1914:   if (usecprow) {
1915:     mbs  = a->compressedrow.nrows;
1916:     ii   = a->compressedrow.i;
1917:     ridx = a->compressedrow.rindex;
1918:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1919:   } else {
1920:     mbs = a->mbs;
1921:     ii  = a->i;
1922:     z   = zarray;
1923:   }

1925:   for (i = 0; i < mbs; i++) {
1926:     n     = ii[i + 1] - ii[i];
1927:     idx   = ij + ii[i];
1928:     sum1  = 0.0;
1929:     sum2  = 0.0;
1930:     sum3  = 0.0;
1931:     sum4  = 0.0;
1932:     sum5  = 0.0;
1933:     sum6  = 0.0;
1934:     sum7  = 0.0;
1935:     sum8  = 0.0;
1936:     sum9  = 0.0;
1937:     sum10 = 0.0;
1938:     sum11 = 0.0;
1939:     sum12 = 0.0;
1940:     sum13 = 0.0;
1941:     sum14 = 0.0;
1942:     sum15 = 0.0;

1944:     for (j = 0; j < n; j++) {
1945:       xb  = x + 15 * (idx[j]);
1946:       x1  = xb[0];
1947:       x2  = xb[1];
1948:       x3  = xb[2];
1949:       x4  = xb[3];
1950:       x5  = xb[4];
1951:       x6  = xb[5];
1952:       x7  = xb[6];
1953:       x8  = xb[7];
1954:       x9  = xb[8];
1955:       x10 = xb[9];
1956:       x11 = xb[10];
1957:       x12 = xb[11];
1958:       x13 = xb[12];
1959:       x14 = xb[13];
1960:       x15 = xb[14];

1962:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1963:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1964:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1965:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1966:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1967:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1968:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1969:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1970:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1971:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1972:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1973:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1974:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1975:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1976:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1977:       v += 225;
1978:     }
1979:     if (usecprow) z = zarray + 15 * ridx[i];
1980:     z[0]  = sum1;
1981:     z[1]  = sum2;
1982:     z[2]  = sum3;
1983:     z[3]  = sum4;
1984:     z[4]  = sum5;
1985:     z[5]  = sum6;
1986:     z[6]  = sum7;
1987:     z[7]  = sum8;
1988:     z[8]  = sum9;
1989:     z[9]  = sum10;
1990:     z[10] = sum11;
1991:     z[11] = sum12;
1992:     z[12] = sum13;
1993:     z[13] = sum14;
1994:     z[14] = sum15;

1996:     if (!usecprow) z += 15;
1997:   }

1999:   PetscCall(VecRestoreArrayRead(xx, &x));
2000:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2001:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
2002:   PetscFunctionReturn(PETSC_SUCCESS);
2003: }

2005: /*
2006:     This will not work with MatScalar == float because it calls the BLAS
2007: */
2008: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
2009: {
2010:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2011:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2012:   const PetscScalar *x, *xb;
2013:   const MatScalar   *v;
2014:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2015:   const PetscInt    *idx, *ii, *ridx = NULL;
2016:   PetscInt           ncols, k;
2017:   PetscBool          usecprow = a->compressedrow.use;

2019:   PetscFunctionBegin;
2020:   PetscCall(VecGetArrayRead(xx, &x));
2021:   PetscCall(VecGetArrayWrite(zz, &zarray));

2023:   idx = a->j;
2024:   v   = a->a;
2025:   if (usecprow) {
2026:     mbs  = a->compressedrow.nrows;
2027:     ii   = a->compressedrow.i;
2028:     ridx = a->compressedrow.rindex;
2029:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2030:   } else {
2031:     mbs = a->mbs;
2032:     ii  = a->i;
2033:     z   = zarray;
2034:   }

2036:   if (!a->mult_work) {
2037:     k = PetscMax(A->rmap->n, A->cmap->n);
2038:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2039:   }
2040:   work = a->mult_work;
2041:   for (i = 0; i < mbs; i++) {
2042:     n = ii[1] - ii[0];
2043:     ii++;
2044:     ncols = n * bs;
2045:     workt = work;
2046:     for (j = 0; j < n; j++) {
2047:       xb = x + bs * (*idx++);
2048:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2049:       workt += bs;
2050:     }
2051:     if (usecprow) z = zarray + bs * ridx[i];
2052:     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2053:     v += n * bs2;
2054:     if (!usecprow) z += bs;
2055:   }
2056:   PetscCall(VecRestoreArrayRead(xx, &x));
2057:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2058:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2059:   PetscFunctionReturn(PETSC_SUCCESS);
2060: }

2062: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2063: {
2064:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2065:   const PetscScalar *x;
2066:   PetscScalar       *y, *z, sum;
2067:   const MatScalar   *v;
2068:   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2069:   const PetscInt    *idx, *ii;
2070:   PetscBool          usecprow = a->compressedrow.use;

2072:   PetscFunctionBegin;
2073:   PetscCall(VecGetArrayRead(xx, &x));
2074:   PetscCall(VecGetArrayPair(yy, zz, &y, &z));

2076:   idx = a->j;
2077:   v   = a->a;
2078:   if (usecprow) {
2079:     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2080:     mbs  = a->compressedrow.nrows;
2081:     ii   = a->compressedrow.i;
2082:     ridx = a->compressedrow.rindex;
2083:   } else {
2084:     ii = a->i;
2085:   }

2087:   for (i = 0; i < mbs; i++) {
2088:     n = ii[1] - ii[0];
2089:     ii++;
2090:     if (!usecprow) {
2091:       sum = y[i];
2092:     } else {
2093:       sum = y[ridx[i]];
2094:     }
2095:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2096:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2097:     PetscSparseDensePlusDot(sum, x, v, idx, n);
2098:     v += n;
2099:     idx += n;
2100:     if (usecprow) {
2101:       z[ridx[i]] = sum;
2102:     } else {
2103:       z[i] = sum;
2104:     }
2105:   }
2106:   PetscCall(VecRestoreArrayRead(xx, &x));
2107:   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2108:   PetscCall(PetscLogFlops(2.0 * a->nz));
2109:   PetscFunctionReturn(PETSC_SUCCESS);
2110: }

2112: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2113: {
2114:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2115:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2116:   const PetscScalar *x, *xb;
2117:   PetscScalar        x1, x2, *yarray, *zarray;
2118:   const MatScalar   *v;
2119:   PetscInt           mbs = a->mbs, i, n, j;
2120:   const PetscInt    *idx, *ii, *ridx = NULL;
2121:   PetscBool          usecprow = a->compressedrow.use;

2123:   PetscFunctionBegin;
2124:   PetscCall(VecGetArrayRead(xx, &x));
2125:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2127:   idx = a->j;
2128:   v   = a->a;
2129:   if (usecprow) {
2130:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2131:     mbs  = a->compressedrow.nrows;
2132:     ii   = a->compressedrow.i;
2133:     ridx = a->compressedrow.rindex;
2134:   } else {
2135:     ii = a->i;
2136:     y  = yarray;
2137:     z  = zarray;
2138:   }

2140:   for (i = 0; i < mbs; i++) {
2141:     n = ii[1] - ii[0];
2142:     ii++;
2143:     if (usecprow) {
2144:       z = zarray + 2 * ridx[i];
2145:       y = yarray + 2 * ridx[i];
2146:     }
2147:     sum1 = y[0];
2148:     sum2 = y[1];
2149:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2150:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2151:     for (j = 0; j < n; j++) {
2152:       xb = x + 2 * (*idx++);
2153:       x1 = xb[0];
2154:       x2 = xb[1];

2156:       sum1 += v[0] * x1 + v[2] * x2;
2157:       sum2 += v[1] * x1 + v[3] * x2;
2158:       v += 4;
2159:     }
2160:     z[0] = sum1;
2161:     z[1] = sum2;
2162:     if (!usecprow) {
2163:       z += 2;
2164:       y += 2;
2165:     }
2166:   }
2167:   PetscCall(VecRestoreArrayRead(xx, &x));
2168:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2169:   PetscCall(PetscLogFlops(4.0 * a->nz));
2170:   PetscFunctionReturn(PETSC_SUCCESS);
2171: }

2173: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2174: {
2175:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2176:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2177:   const PetscScalar *x, *xb;
2178:   const MatScalar   *v;
2179:   PetscInt           mbs = a->mbs, i, j, n;
2180:   const PetscInt    *idx, *ii, *ridx = NULL;
2181:   PetscBool          usecprow = a->compressedrow.use;

2183:   PetscFunctionBegin;
2184:   PetscCall(VecGetArrayRead(xx, &x));
2185:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2187:   idx = a->j;
2188:   v   = a->a;
2189:   if (usecprow) {
2190:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2191:     mbs  = a->compressedrow.nrows;
2192:     ii   = a->compressedrow.i;
2193:     ridx = a->compressedrow.rindex;
2194:   } else {
2195:     ii = a->i;
2196:     y  = yarray;
2197:     z  = zarray;
2198:   }

2200:   for (i = 0; i < mbs; i++) {
2201:     n = ii[1] - ii[0];
2202:     ii++;
2203:     if (usecprow) {
2204:       z = zarray + 3 * ridx[i];
2205:       y = yarray + 3 * ridx[i];
2206:     }
2207:     sum1 = y[0];
2208:     sum2 = y[1];
2209:     sum3 = y[2];
2210:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2211:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2212:     for (j = 0; j < n; j++) {
2213:       xb = x + 3 * (*idx++);
2214:       x1 = xb[0];
2215:       x2 = xb[1];
2216:       x3 = xb[2];
2217:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2218:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2219:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2220:       v += 9;
2221:     }
2222:     z[0] = sum1;
2223:     z[1] = sum2;
2224:     z[2] = sum3;
2225:     if (!usecprow) {
2226:       z += 3;
2227:       y += 3;
2228:     }
2229:   }
2230:   PetscCall(VecRestoreArrayRead(xx, &x));
2231:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2232:   PetscCall(PetscLogFlops(18.0 * a->nz));
2233:   PetscFunctionReturn(PETSC_SUCCESS);
2234: }

2236: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2237: {
2238:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2239:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2240:   const PetscScalar *x, *xb;
2241:   const MatScalar   *v;
2242:   PetscInt           mbs = a->mbs, i, j, n;
2243:   const PetscInt    *idx, *ii, *ridx = NULL;
2244:   PetscBool          usecprow = a->compressedrow.use;

2246:   PetscFunctionBegin;
2247:   PetscCall(VecGetArrayRead(xx, &x));
2248:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2250:   idx = a->j;
2251:   v   = a->a;
2252:   if (usecprow) {
2253:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2254:     mbs  = a->compressedrow.nrows;
2255:     ii   = a->compressedrow.i;
2256:     ridx = a->compressedrow.rindex;
2257:   } else {
2258:     ii = a->i;
2259:     y  = yarray;
2260:     z  = zarray;
2261:   }

2263:   for (i = 0; i < mbs; i++) {
2264:     n = ii[1] - ii[0];
2265:     ii++;
2266:     if (usecprow) {
2267:       z = zarray + 4 * ridx[i];
2268:       y = yarray + 4 * ridx[i];
2269:     }
2270:     sum1 = y[0];
2271:     sum2 = y[1];
2272:     sum3 = y[2];
2273:     sum4 = y[3];
2274:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2275:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2276:     for (j = 0; j < n; j++) {
2277:       xb = x + 4 * (*idx++);
2278:       x1 = xb[0];
2279:       x2 = xb[1];
2280:       x3 = xb[2];
2281:       x4 = xb[3];
2282:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2283:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2284:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2285:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2286:       v += 16;
2287:     }
2288:     z[0] = sum1;
2289:     z[1] = sum2;
2290:     z[2] = sum3;
2291:     z[3] = sum4;
2292:     if (!usecprow) {
2293:       z += 4;
2294:       y += 4;
2295:     }
2296:   }
2297:   PetscCall(VecRestoreArrayRead(xx, &x));
2298:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2299:   PetscCall(PetscLogFlops(32.0 * a->nz));
2300:   PetscFunctionReturn(PETSC_SUCCESS);
2301: }

2303: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2304: {
2305:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2306:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2307:   const PetscScalar *x, *xb;
2308:   PetscScalar       *yarray, *zarray;
2309:   const MatScalar   *v;
2310:   PetscInt           mbs = a->mbs, i, j, n;
2311:   const PetscInt    *idx, *ii, *ridx = NULL;
2312:   PetscBool          usecprow = a->compressedrow.use;

2314:   PetscFunctionBegin;
2315:   PetscCall(VecGetArrayRead(xx, &x));
2316:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2318:   idx = a->j;
2319:   v   = a->a;
2320:   if (usecprow) {
2321:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2322:     mbs  = a->compressedrow.nrows;
2323:     ii   = a->compressedrow.i;
2324:     ridx = a->compressedrow.rindex;
2325:   } else {
2326:     ii = a->i;
2327:     y  = yarray;
2328:     z  = zarray;
2329:   }

2331:   for (i = 0; i < mbs; i++) {
2332:     n = ii[1] - ii[0];
2333:     ii++;
2334:     if (usecprow) {
2335:       z = zarray + 5 * ridx[i];
2336:       y = yarray + 5 * ridx[i];
2337:     }
2338:     sum1 = y[0];
2339:     sum2 = y[1];
2340:     sum3 = y[2];
2341:     sum4 = y[3];
2342:     sum5 = y[4];
2343:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2344:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2345:     for (j = 0; j < n; j++) {
2346:       xb = x + 5 * (*idx++);
2347:       x1 = xb[0];
2348:       x2 = xb[1];
2349:       x3 = xb[2];
2350:       x4 = xb[3];
2351:       x5 = xb[4];
2352:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2353:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2354:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2355:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2356:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2357:       v += 25;
2358:     }
2359:     z[0] = sum1;
2360:     z[1] = sum2;
2361:     z[2] = sum3;
2362:     z[3] = sum4;
2363:     z[4] = sum5;
2364:     if (!usecprow) {
2365:       z += 5;
2366:       y += 5;
2367:     }
2368:   }
2369:   PetscCall(VecRestoreArrayRead(xx, &x));
2370:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2371:   PetscCall(PetscLogFlops(50.0 * a->nz));
2372:   PetscFunctionReturn(PETSC_SUCCESS);
2373: }

2375: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2376: {
2377:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2378:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2379:   const PetscScalar *x, *xb;
2380:   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2381:   const MatScalar   *v;
2382:   PetscInt           mbs = a->mbs, i, j, n;
2383:   const PetscInt    *idx, *ii, *ridx = NULL;
2384:   PetscBool          usecprow = a->compressedrow.use;

2386:   PetscFunctionBegin;
2387:   PetscCall(VecGetArrayRead(xx, &x));
2388:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2390:   idx = a->j;
2391:   v   = a->a;
2392:   if (usecprow) {
2393:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2394:     mbs  = a->compressedrow.nrows;
2395:     ii   = a->compressedrow.i;
2396:     ridx = a->compressedrow.rindex;
2397:   } else {
2398:     ii = a->i;
2399:     y  = yarray;
2400:     z  = zarray;
2401:   }

2403:   for (i = 0; i < mbs; i++) {
2404:     n = ii[1] - ii[0];
2405:     ii++;
2406:     if (usecprow) {
2407:       z = zarray + 6 * ridx[i];
2408:       y = yarray + 6 * ridx[i];
2409:     }
2410:     sum1 = y[0];
2411:     sum2 = y[1];
2412:     sum3 = y[2];
2413:     sum4 = y[3];
2414:     sum5 = y[4];
2415:     sum6 = y[5];
2416:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2417:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2418:     for (j = 0; j < n; j++) {
2419:       xb = x + 6 * (*idx++);
2420:       x1 = xb[0];
2421:       x2 = xb[1];
2422:       x3 = xb[2];
2423:       x4 = xb[3];
2424:       x5 = xb[4];
2425:       x6 = xb[5];
2426:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2427:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2428:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2429:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2430:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2431:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2432:       v += 36;
2433:     }
2434:     z[0] = sum1;
2435:     z[1] = sum2;
2436:     z[2] = sum3;
2437:     z[3] = sum4;
2438:     z[4] = sum5;
2439:     z[5] = sum6;
2440:     if (!usecprow) {
2441:       z += 6;
2442:       y += 6;
2443:     }
2444:   }
2445:   PetscCall(VecRestoreArrayRead(xx, &x));
2446:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2447:   PetscCall(PetscLogFlops(72.0 * a->nz));
2448:   PetscFunctionReturn(PETSC_SUCCESS);
2449: }

2451: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2452: {
2453:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2454:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2455:   const PetscScalar *x, *xb;
2456:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2457:   const MatScalar   *v;
2458:   PetscInt           mbs = a->mbs, i, j, n;
2459:   const PetscInt    *idx, *ii, *ridx = NULL;
2460:   PetscBool          usecprow = a->compressedrow.use;

2462:   PetscFunctionBegin;
2463:   PetscCall(VecGetArrayRead(xx, &x));
2464:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2466:   idx = a->j;
2467:   v   = a->a;
2468:   if (usecprow) {
2469:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2470:     mbs  = a->compressedrow.nrows;
2471:     ii   = a->compressedrow.i;
2472:     ridx = a->compressedrow.rindex;
2473:   } else {
2474:     ii = a->i;
2475:     y  = yarray;
2476:     z  = zarray;
2477:   }

2479:   for (i = 0; i < mbs; i++) {
2480:     n = ii[1] - ii[0];
2481:     ii++;
2482:     if (usecprow) {
2483:       z = zarray + 7 * ridx[i];
2484:       y = yarray + 7 * ridx[i];
2485:     }
2486:     sum1 = y[0];
2487:     sum2 = y[1];
2488:     sum3 = y[2];
2489:     sum4 = y[3];
2490:     sum5 = y[4];
2491:     sum6 = y[5];
2492:     sum7 = y[6];
2493:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2494:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2495:     for (j = 0; j < n; j++) {
2496:       xb = x + 7 * (*idx++);
2497:       x1 = xb[0];
2498:       x2 = xb[1];
2499:       x3 = xb[2];
2500:       x4 = xb[3];
2501:       x5 = xb[4];
2502:       x6 = xb[5];
2503:       x7 = xb[6];
2504:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2505:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2506:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2507:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2508:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2509:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2510:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2511:       v += 49;
2512:     }
2513:     z[0] = sum1;
2514:     z[1] = sum2;
2515:     z[2] = sum3;
2516:     z[3] = sum4;
2517:     z[4] = sum5;
2518:     z[5] = sum6;
2519:     z[6] = sum7;
2520:     if (!usecprow) {
2521:       z += 7;
2522:       y += 7;
2523:     }
2524:   }
2525:   PetscCall(VecRestoreArrayRead(xx, &x));
2526:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2527:   PetscCall(PetscLogFlops(98.0 * a->nz));
2528:   PetscFunctionReturn(PETSC_SUCCESS);
2529: }

2531: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2532: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2533: {
2534:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2535:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2536:   const PetscScalar *x, *xb;
2537:   const MatScalar   *v;
2538:   PetscInt           mbs, i, j, n;
2539:   PetscInt           k;
2540:   PetscBool          usecprow = a->compressedrow.use;
2541:   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;

2543:   __m256d a0, a1, a2, a3, a4, a5;
2544:   __m256d w0, w1, w2, w3;
2545:   __m256d z0, z1, z2;
2546:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

2548:   PetscFunctionBegin;
2549:   PetscCall(VecCopy(yy, zz));
2550:   PetscCall(VecGetArrayRead(xx, &x));
2551:   PetscCall(VecGetArray(zz, &zarray));

2553:   idx = a->j;
2554:   v   = a->a;
2555:   if (usecprow) {
2556:     mbs  = a->compressedrow.nrows;
2557:     ii   = a->compressedrow.i;
2558:     ridx = a->compressedrow.rindex;
2559:   } else {
2560:     mbs = a->mbs;
2561:     ii  = a->i;
2562:     z   = zarray;
2563:   }

2565:   if (!a->mult_work) {
2566:     k = PetscMax(A->rmap->n, A->cmap->n);
2567:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2568:   }

2570:   work = a->mult_work;
2571:   for (i = 0; i < mbs; i++) {
2572:     n = ii[1] - ii[0];
2573:     ii++;
2574:     workt = work;
2575:     for (j = 0; j < n; j++) {
2576:       xb = x + bs * (*idx++);
2577:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2578:       workt += bs;
2579:     }
2580:     if (usecprow) z = zarray + bs * ridx[i];

2582:     z0 = _mm256_loadu_pd(&z[0]);
2583:     z1 = _mm256_loadu_pd(&z[4]);
2584:     z2 = _mm256_set1_pd(z[8]);

2586:     for (j = 0; j < n; j++) {
2587:       /* first column of a */
2588:       w0 = _mm256_set1_pd(work[j * 9]);
2589:       a0 = _mm256_loadu_pd(&v[j * 81]);
2590:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2591:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2592:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2593:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2594:       z2 = _mm256_fmadd_pd(a2, w0, z2);

2596:       /* second column of a */
2597:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2598:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2599:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2600:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2601:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2602:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2603:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2605:       /* third column of a */
2606:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2607:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2608:       z0 = _mm256_fmadd_pd(a3, w2, z0);
2609:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2610:       z1 = _mm256_fmadd_pd(a4, w2, z1);
2611:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2612:       z2 = _mm256_fmadd_pd(a5, w2, z2);

2614:       /* fourth column of a */
2615:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2616:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2617:       z0 = _mm256_fmadd_pd(a0, w3, z0);
2618:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2619:       z1 = _mm256_fmadd_pd(a1, w3, z1);
2620:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2621:       z2 = _mm256_fmadd_pd(a2, w3, z2);

2623:       /* fifth column of a */
2624:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2625:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2626:       z0 = _mm256_fmadd_pd(a3, w0, z0);
2627:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2628:       z1 = _mm256_fmadd_pd(a4, w0, z1);
2629:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2630:       z2 = _mm256_fmadd_pd(a5, w0, z2);

2632:       /* sixth column of a */
2633:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2634:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2635:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2636:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2637:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2638:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2639:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2641:       /* seventh column of a */
2642:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2643:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2644:       z0 = _mm256_fmadd_pd(a0, w2, z0);
2645:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2646:       z1 = _mm256_fmadd_pd(a1, w2, z1);
2647:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2648:       z2 = _mm256_fmadd_pd(a2, w2, z2);

2650:       /* eighth column of a */
2651:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2652:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2653:       z0 = _mm256_fmadd_pd(a3, w3, z0);
2654:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2655:       z1 = _mm256_fmadd_pd(a4, w3, z1);
2656:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2657:       z2 = _mm256_fmadd_pd(a5, w3, z2);

2659:       /* ninth column of a */
2660:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2661:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2662:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2663:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2664:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2665:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2666:       z2 = _mm256_fmadd_pd(a2, w0, z2);
2667:     }

2669:     _mm256_storeu_pd(&z[0], z0);
2670:     _mm256_storeu_pd(&z[4], z1);
2671:     _mm256_maskstore_pd(&z[8], mask1, z2);

2673:     v += n * bs2;
2674:     if (!usecprow) z += bs;
2675:   }
2676:   PetscCall(VecRestoreArrayRead(xx, &x));
2677:   PetscCall(VecRestoreArray(zz, &zarray));
2678:   PetscCall(PetscLogFlops(162.0 * a->nz));
2679:   PetscFunctionReturn(PETSC_SUCCESS);
2680: }
2681: #endif

2683: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2684: {
2685:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2686:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2687:   const PetscScalar *x, *xb;
2688:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2689:   const MatScalar   *v;
2690:   PetscInt           mbs = a->mbs, i, j, n;
2691:   const PetscInt    *idx, *ii, *ridx = NULL;
2692:   PetscBool          usecprow = a->compressedrow.use;

2694:   PetscFunctionBegin;
2695:   PetscCall(VecGetArrayRead(xx, &x));
2696:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2698:   idx = a->j;
2699:   v   = a->a;
2700:   if (usecprow) {
2701:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2702:     mbs  = a->compressedrow.nrows;
2703:     ii   = a->compressedrow.i;
2704:     ridx = a->compressedrow.rindex;
2705:   } else {
2706:     ii = a->i;
2707:     y  = yarray;
2708:     z  = zarray;
2709:   }

2711:   for (i = 0; i < mbs; i++) {
2712:     n = ii[1] - ii[0];
2713:     ii++;
2714:     if (usecprow) {
2715:       z = zarray + 11 * ridx[i];
2716:       y = yarray + 11 * ridx[i];
2717:     }
2718:     sum1  = y[0];
2719:     sum2  = y[1];
2720:     sum3  = y[2];
2721:     sum4  = y[3];
2722:     sum5  = y[4];
2723:     sum6  = y[5];
2724:     sum7  = y[6];
2725:     sum8  = y[7];
2726:     sum9  = y[8];
2727:     sum10 = y[9];
2728:     sum11 = y[10];
2729:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2730:     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2731:     for (j = 0; j < n; j++) {
2732:       xb  = x + 11 * (*idx++);
2733:       x1  = xb[0];
2734:       x2  = xb[1];
2735:       x3  = xb[2];
2736:       x4  = xb[3];
2737:       x5  = xb[4];
2738:       x6  = xb[5];
2739:       x7  = xb[6];
2740:       x8  = xb[7];
2741:       x9  = xb[8];
2742:       x10 = xb[9];
2743:       x11 = xb[10];
2744:       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2745:       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2746:       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2747:       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2748:       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2749:       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2750:       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2751:       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2752:       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2753:       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2754:       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2755:       v += 121;
2756:     }
2757:     z[0]  = sum1;
2758:     z[1]  = sum2;
2759:     z[2]  = sum3;
2760:     z[3]  = sum4;
2761:     z[4]  = sum5;
2762:     z[5]  = sum6;
2763:     z[6]  = sum7;
2764:     z[7]  = sum8;
2765:     z[8]  = sum9;
2766:     z[9]  = sum10;
2767:     z[10] = sum11;
2768:     if (!usecprow) {
2769:       z += 11;
2770:       y += 11;
2771:     }
2772:   }
2773:   PetscCall(VecRestoreArrayRead(xx, &x));
2774:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2775:   PetscCall(PetscLogFlops(242.0 * a->nz));
2776:   PetscFunctionReturn(PETSC_SUCCESS);
2777: }

2779: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2780: {
2781:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2782:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2783:   const PetscScalar *x, *xb;
2784:   const MatScalar   *v;
2785:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2786:   PetscInt           ncols, k;
2787:   const PetscInt    *ridx     = NULL, *idx, *ii;
2788:   PetscBool          usecprow = a->compressedrow.use;

2790:   PetscFunctionBegin;
2791:   PetscCall(VecCopy(yy, zz));
2792:   PetscCall(VecGetArrayRead(xx, &x));
2793:   PetscCall(VecGetArray(zz, &zarray));

2795:   idx = a->j;
2796:   v   = a->a;
2797:   if (usecprow) {
2798:     mbs  = a->compressedrow.nrows;
2799:     ii   = a->compressedrow.i;
2800:     ridx = a->compressedrow.rindex;
2801:   } else {
2802:     mbs = a->mbs;
2803:     ii  = a->i;
2804:     z   = zarray;
2805:   }

2807:   if (!a->mult_work) {
2808:     k = PetscMax(A->rmap->n, A->cmap->n);
2809:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2810:   }
2811:   work = a->mult_work;
2812:   for (i = 0; i < mbs; i++) {
2813:     n = ii[1] - ii[0];
2814:     ii++;
2815:     ncols = n * bs;
2816:     workt = work;
2817:     for (j = 0; j < n; j++) {
2818:       xb = x + bs * (*idx++);
2819:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2820:       workt += bs;
2821:     }
2822:     if (usecprow) z = zarray + bs * ridx[i];
2823:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2824:     v += n * bs2;
2825:     if (!usecprow) z += bs;
2826:   }
2827:   PetscCall(VecRestoreArrayRead(xx, &x));
2828:   PetscCall(VecRestoreArray(zz, &zarray));
2829:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2830:   PetscFunctionReturn(PETSC_SUCCESS);
2831: }

2833: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2834: {
2835:   PetscScalar zero = 0.0;

2837:   PetscFunctionBegin;
2838:   PetscCall(VecSet(zz, zero));
2839:   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2840:   PetscFunctionReturn(PETSC_SUCCESS);
2841: }

2843: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2844: {
2845:   PetscScalar zero = 0.0;

2847:   PetscFunctionBegin;
2848:   PetscCall(VecSet(zz, zero));
2849:   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2850:   PetscFunctionReturn(PETSC_SUCCESS);
2851: }

2853: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2854: {
2855:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2856:   PetscScalar       *z, x1, x2, x3, x4, x5;
2857:   const PetscScalar *x, *xb = NULL;
2858:   const MatScalar   *v;
2859:   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2860:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2861:   Mat_CompressedRow  cprow    = a->compressedrow;
2862:   PetscBool          usecprow = cprow.use;

2864:   PetscFunctionBegin;
2865:   if (yy != zz) PetscCall(VecCopy(yy, zz));
2866:   PetscCall(VecGetArrayRead(xx, &x));
2867:   PetscCall(VecGetArray(zz, &z));

2869:   idx = a->j;
2870:   v   = a->a;
2871:   if (usecprow) {
2872:     mbs  = cprow.nrows;
2873:     ii   = cprow.i;
2874:     ridx = cprow.rindex;
2875:   } else {
2876:     mbs = a->mbs;
2877:     ii  = a->i;
2878:     xb  = x;
2879:   }

2881:   switch (bs) {
2882:   case 1:
2883:     for (i = 0; i < mbs; i++) {
2884:       if (usecprow) xb = x + ridx[i];
2885:       x1 = xb[0];
2886:       ib = idx + ii[0];
2887:       n  = ii[1] - ii[0];
2888:       ii++;
2889:       for (j = 0; j < n; j++) {
2890:         rval = ib[j];
2891:         z[rval] += PetscConj(*v) * x1;
2892:         v++;
2893:       }
2894:       if (!usecprow) xb++;
2895:     }
2896:     break;
2897:   case 2:
2898:     for (i = 0; i < mbs; i++) {
2899:       if (usecprow) xb = x + 2 * ridx[i];
2900:       x1 = xb[0];
2901:       x2 = xb[1];
2902:       ib = idx + ii[0];
2903:       n  = ii[1] - ii[0];
2904:       ii++;
2905:       for (j = 0; j < n; j++) {
2906:         rval = ib[j] * 2;
2907:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2908:         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2909:         v += 4;
2910:       }
2911:       if (!usecprow) xb += 2;
2912:     }
2913:     break;
2914:   case 3:
2915:     for (i = 0; i < mbs; i++) {
2916:       if (usecprow) xb = x + 3 * ridx[i];
2917:       x1 = xb[0];
2918:       x2 = xb[1];
2919:       x3 = xb[2];
2920:       ib = idx + ii[0];
2921:       n  = ii[1] - ii[0];
2922:       ii++;
2923:       for (j = 0; j < n; j++) {
2924:         rval = ib[j] * 3;
2925:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2926:         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2927:         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2928:         v += 9;
2929:       }
2930:       if (!usecprow) xb += 3;
2931:     }
2932:     break;
2933:   case 4:
2934:     for (i = 0; i < mbs; i++) {
2935:       if (usecprow) xb = x + 4 * ridx[i];
2936:       x1 = xb[0];
2937:       x2 = xb[1];
2938:       x3 = xb[2];
2939:       x4 = xb[3];
2940:       ib = idx + ii[0];
2941:       n  = ii[1] - ii[0];
2942:       ii++;
2943:       for (j = 0; j < n; j++) {
2944:         rval = ib[j] * 4;
2945:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2946:         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2947:         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2948:         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2949:         v += 16;
2950:       }
2951:       if (!usecprow) xb += 4;
2952:     }
2953:     break;
2954:   case 5:
2955:     for (i = 0; i < mbs; i++) {
2956:       if (usecprow) xb = x + 5 * ridx[i];
2957:       x1 = xb[0];
2958:       x2 = xb[1];
2959:       x3 = xb[2];
2960:       x4 = xb[3];
2961:       x5 = xb[4];
2962:       ib = idx + ii[0];
2963:       n  = ii[1] - ii[0];
2964:       ii++;
2965:       for (j = 0; j < n; j++) {
2966:         rval = ib[j] * 5;
2967:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2968:         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2969:         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2970:         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2971:         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2972:         v += 25;
2973:       }
2974:       if (!usecprow) xb += 5;
2975:     }
2976:     break;
2977:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2978:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2979: #if 0
2980:     {
2981:       PetscInt          ncols,k,bs2=a->bs2;
2982:       PetscScalar       *work,*workt,zb;
2983:       const PetscScalar *xtmp;
2984:       if (!a->mult_work) {
2985:         k    = PetscMax(A->rmap->n,A->cmap->n);
2986:         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2987:       }
2988:       work = a->mult_work;
2989:       xtmp = x;
2990:       for (i=0; i<mbs; i++) {
2991:         n     = ii[1] - ii[0]; ii++;
2992:         ncols = n*bs;
2993:         PetscCall(PetscArrayzero(work,ncols));
2994:         if (usecprow) xtmp = x + bs*ridx[i];
2995:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2996:         v += n*bs2;
2997:         if (!usecprow) xtmp += bs;
2998:         workt = work;
2999:         for (j=0; j<n; j++) {
3000:           zb = z + bs*(*idx++);
3001:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
3002:           workt += bs;
3003:         }
3004:       }
3005:     }
3006: #endif
3007:   }
3008:   PetscCall(VecRestoreArrayRead(xx, &x));
3009:   PetscCall(VecRestoreArray(zz, &z));
3010:   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3011:   PetscFunctionReturn(PETSC_SUCCESS);
3012: }

3014: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
3015: {
3016:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3017:   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
3018:   const PetscScalar *x, *xb = NULL;
3019:   const MatScalar   *v;
3020:   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3021:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
3022:   Mat_CompressedRow  cprow    = a->compressedrow;
3023:   PetscBool          usecprow = cprow.use;

3025:   PetscFunctionBegin;
3026:   if (yy != zz) PetscCall(VecCopy(yy, zz));
3027:   PetscCall(VecGetArrayRead(xx, &x));
3028:   PetscCall(VecGetArray(zz, &z));

3030:   idx = a->j;
3031:   v   = a->a;
3032:   if (usecprow) {
3033:     mbs  = cprow.nrows;
3034:     ii   = cprow.i;
3035:     ridx = cprow.rindex;
3036:   } else {
3037:     mbs = a->mbs;
3038:     ii  = a->i;
3039:     xb  = x;
3040:   }

3042:   switch (bs) {
3043:   case 1:
3044:     for (i = 0; i < mbs; i++) {
3045:       if (usecprow) xb = x + ridx[i];
3046:       x1 = xb[0];
3047:       ib = idx + ii[0];
3048:       n  = ii[1] - ii[0];
3049:       ii++;
3050:       for (j = 0; j < n; j++) {
3051:         rval = ib[j];
3052:         z[rval] += *v * x1;
3053:         v++;
3054:       }
3055:       if (!usecprow) xb++;
3056:     }
3057:     break;
3058:   case 2:
3059:     for (i = 0; i < mbs; i++) {
3060:       if (usecprow) xb = x + 2 * ridx[i];
3061:       x1 = xb[0];
3062:       x2 = xb[1];
3063:       ib = idx + ii[0];
3064:       n  = ii[1] - ii[0];
3065:       ii++;
3066:       for (j = 0; j < n; j++) {
3067:         rval = ib[j] * 2;
3068:         z[rval++] += v[0] * x1 + v[1] * x2;
3069:         z[rval++] += v[2] * x1 + v[3] * x2;
3070:         v += 4;
3071:       }
3072:       if (!usecprow) xb += 2;
3073:     }
3074:     break;
3075:   case 3:
3076:     for (i = 0; i < mbs; i++) {
3077:       if (usecprow) xb = x + 3 * ridx[i];
3078:       x1 = xb[0];
3079:       x2 = xb[1];
3080:       x3 = xb[2];
3081:       ib = idx + ii[0];
3082:       n  = ii[1] - ii[0];
3083:       ii++;
3084:       for (j = 0; j < n; j++) {
3085:         rval = ib[j] * 3;
3086:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3087:         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3088:         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3089:         v += 9;
3090:       }
3091:       if (!usecprow) xb += 3;
3092:     }
3093:     break;
3094:   case 4:
3095:     for (i = 0; i < mbs; i++) {
3096:       if (usecprow) xb = x + 4 * ridx[i];
3097:       x1 = xb[0];
3098:       x2 = xb[1];
3099:       x3 = xb[2];
3100:       x4 = xb[3];
3101:       ib = idx + ii[0];
3102:       n  = ii[1] - ii[0];
3103:       ii++;
3104:       for (j = 0; j < n; j++) {
3105:         rval = ib[j] * 4;
3106:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3107:         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3108:         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3109:         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3110:         v += 16;
3111:       }
3112:       if (!usecprow) xb += 4;
3113:     }
3114:     break;
3115:   case 5:
3116:     for (i = 0; i < mbs; i++) {
3117:       if (usecprow) xb = x + 5 * ridx[i];
3118:       x1 = xb[0];
3119:       x2 = xb[1];
3120:       x3 = xb[2];
3121:       x4 = xb[3];
3122:       x5 = xb[4];
3123:       ib = idx + ii[0];
3124:       n  = ii[1] - ii[0];
3125:       ii++;
3126:       for (j = 0; j < n; j++) {
3127:         rval = ib[j] * 5;
3128:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3129:         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3130:         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3131:         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3132:         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3133:         v += 25;
3134:       }
3135:       if (!usecprow) xb += 5;
3136:     }
3137:     break;
3138:   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3139:     PetscInt           ncols, k;
3140:     PetscScalar       *work, *workt;
3141:     const PetscScalar *xtmp;
3142:     if (!a->mult_work) {
3143:       k = PetscMax(A->rmap->n, A->cmap->n);
3144:       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3145:     }
3146:     work = a->mult_work;
3147:     xtmp = x;
3148:     for (i = 0; i < mbs; i++) {
3149:       n = ii[1] - ii[0];
3150:       ii++;
3151:       ncols = n * bs;
3152:       PetscCall(PetscArrayzero(work, ncols));
3153:       if (usecprow) xtmp = x + bs * ridx[i];
3154:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3155:       v += n * bs2;
3156:       if (!usecprow) xtmp += bs;
3157:       workt = work;
3158:       for (j = 0; j < n; j++) {
3159:         zb = z + bs * (*idx++);
3160:         for (k = 0; k < bs; k++) zb[k] += workt[k];
3161:         workt += bs;
3162:       }
3163:     }
3164:   }
3165:   }
3166:   PetscCall(VecRestoreArrayRead(xx, &x));
3167:   PetscCall(VecRestoreArray(zz, &z));
3168:   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3169:   PetscFunctionReturn(PETSC_SUCCESS);
3170: }

3172: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3173: {
3174:   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3175:   PetscInt     totalnz = a->bs2 * a->nz;
3176:   PetscScalar  oalpha  = alpha;
3177:   PetscBLASInt one     = 1, tnz;

3179:   PetscFunctionBegin;
3180:   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3181:   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3182:   PetscCall(PetscLogFlops(totalnz));
3183:   PetscFunctionReturn(PETSC_SUCCESS);
3184: }

3186: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3187: {
3188:   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3189:   MatScalar   *v   = a->a;
3190:   PetscReal    sum = 0.0;
3191:   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;

3193:   PetscFunctionBegin;
3194:   if (type == NORM_FROBENIUS) {
3195: #if defined(PETSC_USE_REAL___FP16)
3196:     PetscBLASInt one = 1, cnt = bs2 * nz;
3197:     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3198: #else
3199:     for (i = 0; i < bs2 * nz; i++) {
3200:       sum += PetscRealPart(PetscConj(*v) * (*v));
3201:       v++;
3202:     }
3203: #endif
3204:     *norm = PetscSqrtReal(sum);
3205:     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3206:   } else if (type == NORM_1) { /* maximum column sum */
3207:     PetscReal *tmp;
3208:     PetscInt  *bcol = a->j;
3209:     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3210:     for (i = 0; i < nz; i++) {
3211:       for (j = 0; j < bs; j++) {
3212:         k1 = bs * (*bcol) + j; /* column index */
3213:         for (k = 0; k < bs; k++) {
3214:           tmp[k1] += PetscAbsScalar(*v);
3215:           v++;
3216:         }
3217:       }
3218:       bcol++;
3219:     }
3220:     *norm = 0.0;
3221:     for (j = 0; j < A->cmap->n; j++) {
3222:       if (tmp[j] > *norm) *norm = tmp[j];
3223:     }
3224:     PetscCall(PetscFree(tmp));
3225:     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3226:   } else if (type == NORM_INFINITY) { /* maximum row sum */
3227:     *norm = 0.0;
3228:     for (k = 0; k < bs; k++) {
3229:       for (j = 0; j < a->mbs; j++) {
3230:         v   = a->a + bs2 * a->i[j] + k;
3231:         sum = 0.0;
3232:         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3233:           for (k1 = 0; k1 < bs; k1++) {
3234:             sum += PetscAbsScalar(*v);
3235:             v += bs;
3236:           }
3237:         }
3238:         if (sum > *norm) *norm = sum;
3239:       }
3240:     }
3241:     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3242:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3243:   PetscFunctionReturn(PETSC_SUCCESS);
3244: }

3246: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3247: {
3248:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;

3250:   PetscFunctionBegin;
3251:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3252:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3253:     *flg = PETSC_FALSE;
3254:     PetscFunctionReturn(PETSC_SUCCESS);
3255:   }

3257:   /* if the a->i are the same */
3258:   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3259:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

3261:   /* if a->j are the same */
3262:   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3263:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

3265:   /* if a->a are the same */
3266:   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3267:   PetscFunctionReturn(PETSC_SUCCESS);
3268: }

3270: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3271: {
3272:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3273:   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3274:   PetscScalar *x, zero = 0.0;
3275:   MatScalar   *aa, *aa_j;

3277:   PetscFunctionBegin;
3278:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3279:   bs   = A->rmap->bs;
3280:   aa   = a->a;
3281:   ai   = a->i;
3282:   aj   = a->j;
3283:   ambs = a->mbs;
3284:   bs2  = a->bs2;

3286:   PetscCall(VecSet(v, zero));
3287:   PetscCall(VecGetArray(v, &x));
3288:   PetscCall(VecGetLocalSize(v, &n));
3289:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3290:   for (i = 0; i < ambs; i++) {
3291:     for (j = ai[i]; j < ai[i + 1]; j++) {
3292:       if (aj[j] == i) {
3293:         row  = i * bs;
3294:         aa_j = aa + j * bs2;
3295:         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3296:         break;
3297:       }
3298:     }
3299:   }
3300:   PetscCall(VecRestoreArray(v, &x));
3301:   PetscFunctionReturn(PETSC_SUCCESS);
3302: }

3304: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3305: {
3306:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3307:   const PetscScalar *l, *r, *li, *ri;
3308:   PetscScalar        x;
3309:   MatScalar         *aa, *v;
3310:   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3311:   const PetscInt    *ai, *aj;

3313:   PetscFunctionBegin;
3314:   ai  = a->i;
3315:   aj  = a->j;
3316:   aa  = a->a;
3317:   m   = A->rmap->n;
3318:   n   = A->cmap->n;
3319:   bs  = A->rmap->bs;
3320:   mbs = a->mbs;
3321:   bs2 = a->bs2;
3322:   if (ll) {
3323:     PetscCall(VecGetArrayRead(ll, &l));
3324:     PetscCall(VecGetLocalSize(ll, &lm));
3325:     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3326:     for (i = 0; i < mbs; i++) { /* for each block row */
3327:       M  = ai[i + 1] - ai[i];
3328:       li = l + i * bs;
3329:       v  = aa + bs2 * ai[i];
3330:       for (j = 0; j < M; j++) { /* for each block */
3331:         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3332:       }
3333:     }
3334:     PetscCall(VecRestoreArrayRead(ll, &l));
3335:     PetscCall(PetscLogFlops(a->nz));
3336:   }

3338:   if (rr) {
3339:     PetscCall(VecGetArrayRead(rr, &r));
3340:     PetscCall(VecGetLocalSize(rr, &rn));
3341:     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3342:     for (i = 0; i < mbs; i++) { /* for each block row */
3343:       iai = ai[i];
3344:       M   = ai[i + 1] - iai;
3345:       v   = aa + bs2 * iai;
3346:       for (j = 0; j < M; j++) { /* for each block */
3347:         ri = r + bs * aj[iai + j];
3348:         for (k = 0; k < bs; k++) {
3349:           x = ri[k];
3350:           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3351:           v += bs;
3352:         }
3353:       }
3354:     }
3355:     PetscCall(VecRestoreArrayRead(rr, &r));
3356:     PetscCall(PetscLogFlops(a->nz));
3357:   }
3358:   PetscFunctionReturn(PETSC_SUCCESS);
3359: }

3361: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3362: {
3363:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3365:   PetscFunctionBegin;
3366:   info->block_size   = a->bs2;
3367:   info->nz_allocated = a->bs2 * a->maxnz;
3368:   info->nz_used      = a->bs2 * a->nz;
3369:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3370:   info->assemblies   = A->num_ass;
3371:   info->mallocs      = A->info.mallocs;
3372:   info->memory       = 0; /* REVIEW ME */
3373:   if (A->factortype) {
3374:     info->fill_ratio_given  = A->info.fill_ratio_given;
3375:     info->fill_ratio_needed = A->info.fill_ratio_needed;
3376:     info->factor_mallocs    = A->info.factor_mallocs;
3377:   } else {
3378:     info->fill_ratio_given  = 0;
3379:     info->fill_ratio_needed = 0;
3380:     info->factor_mallocs    = 0;
3381:   }
3382:   PetscFunctionReturn(PETSC_SUCCESS);
3383: }

3385: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3386: {
3387:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3389:   PetscFunctionBegin;
3390:   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3391:   PetscFunctionReturn(PETSC_SUCCESS);
3392: }

3394: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3395: {
3396:   PetscFunctionBegin;
3397:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3398:   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3399:   PetscFunctionReturn(PETSC_SUCCESS);
3400: }

3402: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3403: {
3404:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3405:   PetscScalar       *z = NULL, sum1;
3406:   const PetscScalar *xb;
3407:   PetscScalar        x1;
3408:   const MatScalar   *v, *vv;
3409:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3410:   PetscBool          usecprow = a->compressedrow.use;

3412:   PetscFunctionBegin;
3413:   idx = a->j;
3414:   v   = a->a;
3415:   if (usecprow) {
3416:     mbs  = a->compressedrow.nrows;
3417:     ii   = a->compressedrow.i;
3418:     ridx = a->compressedrow.rindex;
3419:   } else {
3420:     mbs = a->mbs;
3421:     ii  = a->i;
3422:     z   = c;
3423:   }

3425:   for (i = 0; i < mbs; i++) {
3426:     n = ii[1] - ii[0];
3427:     ii++;
3428:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3429:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3430:     if (usecprow) z = c + ridx[i];
3431:     jj = idx;
3432:     vv = v;
3433:     for (k = 0; k < cn; k++) {
3434:       idx  = jj;
3435:       v    = vv;
3436:       sum1 = 0.0;
3437:       for (j = 0; j < n; j++) {
3438:         xb = b + (*idx++);
3439:         x1 = xb[0 + k * bm];
3440:         sum1 += v[0] * x1;
3441:         v += 1;
3442:       }
3443:       z[0 + k * cm] = sum1;
3444:     }
3445:     if (!usecprow) z += 1;
3446:   }
3447:   PetscFunctionReturn(PETSC_SUCCESS);
3448: }

3450: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3451: {
3452:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3453:   PetscScalar       *z = NULL, sum1, sum2;
3454:   const PetscScalar *xb;
3455:   PetscScalar        x1, x2;
3456:   const MatScalar   *v, *vv;
3457:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3458:   PetscBool          usecprow = a->compressedrow.use;

3460:   PetscFunctionBegin;
3461:   idx = a->j;
3462:   v   = a->a;
3463:   if (usecprow) {
3464:     mbs  = a->compressedrow.nrows;
3465:     ii   = a->compressedrow.i;
3466:     ridx = a->compressedrow.rindex;
3467:   } else {
3468:     mbs = a->mbs;
3469:     ii  = a->i;
3470:     z   = c;
3471:   }

3473:   for (i = 0; i < mbs; i++) {
3474:     n = ii[1] - ii[0];
3475:     ii++;
3476:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3477:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3478:     if (usecprow) z = c + 2 * ridx[i];
3479:     jj = idx;
3480:     vv = v;
3481:     for (k = 0; k < cn; k++) {
3482:       idx  = jj;
3483:       v    = vv;
3484:       sum1 = 0.0;
3485:       sum2 = 0.0;
3486:       for (j = 0; j < n; j++) {
3487:         xb = b + 2 * (*idx++);
3488:         x1 = xb[0 + k * bm];
3489:         x2 = xb[1 + k * bm];
3490:         sum1 += v[0] * x1 + v[2] * x2;
3491:         sum2 += v[1] * x1 + v[3] * x2;
3492:         v += 4;
3493:       }
3494:       z[0 + k * cm] = sum1;
3495:       z[1 + k * cm] = sum2;
3496:     }
3497:     if (!usecprow) z += 2;
3498:   }
3499:   PetscFunctionReturn(PETSC_SUCCESS);
3500: }

3502: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3503: {
3504:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3505:   PetscScalar       *z = NULL, sum1, sum2, sum3;
3506:   const PetscScalar *xb;
3507:   PetscScalar        x1, x2, x3;
3508:   const MatScalar   *v, *vv;
3509:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3510:   PetscBool          usecprow = a->compressedrow.use;

3512:   PetscFunctionBegin;
3513:   idx = a->j;
3514:   v   = a->a;
3515:   if (usecprow) {
3516:     mbs  = a->compressedrow.nrows;
3517:     ii   = a->compressedrow.i;
3518:     ridx = a->compressedrow.rindex;
3519:   } else {
3520:     mbs = a->mbs;
3521:     ii  = a->i;
3522:     z   = c;
3523:   }

3525:   for (i = 0; i < mbs; i++) {
3526:     n = ii[1] - ii[0];
3527:     ii++;
3528:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3529:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3530:     if (usecprow) z = c + 3 * ridx[i];
3531:     jj = idx;
3532:     vv = v;
3533:     for (k = 0; k < cn; k++) {
3534:       idx  = jj;
3535:       v    = vv;
3536:       sum1 = 0.0;
3537:       sum2 = 0.0;
3538:       sum3 = 0.0;
3539:       for (j = 0; j < n; j++) {
3540:         xb = b + 3 * (*idx++);
3541:         x1 = xb[0 + k * bm];
3542:         x2 = xb[1 + k * bm];
3543:         x3 = xb[2 + k * bm];
3544:         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3545:         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3546:         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3547:         v += 9;
3548:       }
3549:       z[0 + k * cm] = sum1;
3550:       z[1 + k * cm] = sum2;
3551:       z[2 + k * cm] = sum3;
3552:     }
3553:     if (!usecprow) z += 3;
3554:   }
3555:   PetscFunctionReturn(PETSC_SUCCESS);
3556: }

3558: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3559: {
3560:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3561:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3562:   const PetscScalar *xb;
3563:   PetscScalar        x1, x2, x3, x4;
3564:   const MatScalar   *v, *vv;
3565:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3566:   PetscBool          usecprow = a->compressedrow.use;

3568:   PetscFunctionBegin;
3569:   idx = a->j;
3570:   v   = a->a;
3571:   if (usecprow) {
3572:     mbs  = a->compressedrow.nrows;
3573:     ii   = a->compressedrow.i;
3574:     ridx = a->compressedrow.rindex;
3575:   } else {
3576:     mbs = a->mbs;
3577:     ii  = a->i;
3578:     z   = c;
3579:   }

3581:   for (i = 0; i < mbs; i++) {
3582:     n = ii[1] - ii[0];
3583:     ii++;
3584:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3585:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3586:     if (usecprow) z = c + 4 * ridx[i];
3587:     jj = idx;
3588:     vv = v;
3589:     for (k = 0; k < cn; k++) {
3590:       idx  = jj;
3591:       v    = vv;
3592:       sum1 = 0.0;
3593:       sum2 = 0.0;
3594:       sum3 = 0.0;
3595:       sum4 = 0.0;
3596:       for (j = 0; j < n; j++) {
3597:         xb = b + 4 * (*idx++);
3598:         x1 = xb[0 + k * bm];
3599:         x2 = xb[1 + k * bm];
3600:         x3 = xb[2 + k * bm];
3601:         x4 = xb[3 + k * bm];
3602:         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3603:         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3604:         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3605:         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3606:         v += 16;
3607:       }
3608:       z[0 + k * cm] = sum1;
3609:       z[1 + k * cm] = sum2;
3610:       z[2 + k * cm] = sum3;
3611:       z[3 + k * cm] = sum4;
3612:     }
3613:     if (!usecprow) z += 4;
3614:   }
3615:   PetscFunctionReturn(PETSC_SUCCESS);
3616: }

3618: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3619: {
3620:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3621:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3622:   const PetscScalar *xb;
3623:   PetscScalar        x1, x2, x3, x4, x5;
3624:   const MatScalar   *v, *vv;
3625:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3626:   PetscBool          usecprow = a->compressedrow.use;

3628:   PetscFunctionBegin;
3629:   idx = a->j;
3630:   v   = a->a;
3631:   if (usecprow) {
3632:     mbs  = a->compressedrow.nrows;
3633:     ii   = a->compressedrow.i;
3634:     ridx = a->compressedrow.rindex;
3635:   } else {
3636:     mbs = a->mbs;
3637:     ii  = a->i;
3638:     z   = c;
3639:   }

3641:   for (i = 0; i < mbs; i++) {
3642:     n = ii[1] - ii[0];
3643:     ii++;
3644:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3645:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3646:     if (usecprow) z = c + 5 * ridx[i];
3647:     jj = idx;
3648:     vv = v;
3649:     for (k = 0; k < cn; k++) {
3650:       idx  = jj;
3651:       v    = vv;
3652:       sum1 = 0.0;
3653:       sum2 = 0.0;
3654:       sum3 = 0.0;
3655:       sum4 = 0.0;
3656:       sum5 = 0.0;
3657:       for (j = 0; j < n; j++) {
3658:         xb = b + 5 * (*idx++);
3659:         x1 = xb[0 + k * bm];
3660:         x2 = xb[1 + k * bm];
3661:         x3 = xb[2 + k * bm];
3662:         x4 = xb[3 + k * bm];
3663:         x5 = xb[4 + k * bm];
3664:         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3665:         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3666:         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3667:         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3668:         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3669:         v += 25;
3670:       }
3671:       z[0 + k * cm] = sum1;
3672:       z[1 + k * cm] = sum2;
3673:       z[2 + k * cm] = sum3;
3674:       z[3 + k * cm] = sum4;
3675:       z[4 + k * cm] = sum5;
3676:     }
3677:     if (!usecprow) z += 5;
3678:   }
3679:   PetscFunctionReturn(PETSC_SUCCESS);
3680: }

3682: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3683: {
3684:   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3685:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3686:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3687:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3688:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3689:   PetscBLASInt     bbs, bcn, bbm, bcm;
3690:   PetscScalar     *z = NULL;
3691:   PetscScalar     *c, *b;
3692:   const MatScalar *v;
3693:   const PetscInt  *idx, *ii, *ridx = NULL;
3694:   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3695:   PetscBool        usecprow = a->compressedrow.use;

3697:   PetscFunctionBegin;
3698:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3699:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3700:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3701:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3702:   b = bd->v;
3703:   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3704:   PetscCall(MatDenseGetArray(C, &c));
3705:   switch (bs) {
3706:   case 1:
3707:     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3708:     break;
3709:   case 2:
3710:     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3711:     break;
3712:   case 3:
3713:     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3714:     break;
3715:   case 4:
3716:     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3717:     break;
3718:   case 5:
3719:     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3720:     break;
3721:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3722:     PetscCall(PetscBLASIntCast(bs, &bbs));
3723:     PetscCall(PetscBLASIntCast(cn, &bcn));
3724:     PetscCall(PetscBLASIntCast(bm, &bbm));
3725:     PetscCall(PetscBLASIntCast(cm, &bcm));
3726:     idx = a->j;
3727:     v   = a->a;
3728:     if (usecprow) {
3729:       mbs  = a->compressedrow.nrows;
3730:       ii   = a->compressedrow.i;
3731:       ridx = a->compressedrow.rindex;
3732:     } else {
3733:       mbs = a->mbs;
3734:       ii  = a->i;
3735:       z   = c;
3736:     }
3737:     for (i = 0; i < mbs; i++) {
3738:       n = ii[1] - ii[0];
3739:       ii++;
3740:       if (usecprow) z = c + bs * ridx[i];
3741:       if (n) {
3742:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3743:         v += bs2;
3744:       }
3745:       for (j = 1; j < n; j++) {
3746:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3747:         v += bs2;
3748:       }
3749:       if (!usecprow) z += bs;
3750:     }
3751:   }
3752:   PetscCall(MatDenseRestoreArray(C, &c));
3753:   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3754:   PetscFunctionReturn(PETSC_SUCCESS);
3755: }