Actual source code: sbaij2.c


  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <../src/mat/impls/dense/seq/dense.h>
  4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  5: #include <petsc/private/kernels/blockinvert.h>
  6: #include <petscbt.h>
  7: #include <petscblaslapack.h>

  9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 10: {
 11:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
 12:   PetscInt        brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
 13:   const PetscInt *idx;
 14:   PetscBT         table_out, table_in;

 16:   PetscFunctionBegin;
 17:   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
 18:   mbs = a->mbs;
 19:   ai  = a->i;
 20:   aj  = a->j;
 21:   bs  = A->rmap->bs;
 22:   PetscCall(PetscBTCreate(mbs, &table_out));
 23:   PetscCall(PetscMalloc1(mbs + 1, &nidx));
 24:   PetscCall(PetscBTCreate(mbs, &table_in));

 26:   for (i = 0; i < is_max; i++) { /* for each is */
 27:     isz = 0;
 28:     PetscCall(PetscBTMemzero(mbs, table_out));

 30:     /* Extract the indices, assume there can be duplicate entries */
 31:     PetscCall(ISGetIndices(is[i], &idx));
 32:     PetscCall(ISGetLocalSize(is[i], &n));

 34:     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
 35:     bcol_max = 0;
 36:     for (j = 0; j < n; ++j) {
 37:       brow = idx[j] / bs; /* convert the indices into block indices */
 38:       PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
 39:       if (!PetscBTLookupSet(table_out, brow)) {
 40:         nidx[isz++] = brow;
 41:         if (bcol_max < brow) bcol_max = brow;
 42:       }
 43:     }
 44:     PetscCall(ISRestoreIndices(is[i], &idx));
 45:     PetscCall(ISDestroy(&is[i]));

 47:     k = 0;
 48:     for (j = 0; j < ov; j++) { /* for each overlap */
 49:       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
 50:       PetscCall(PetscBTMemzero(mbs, table_in));
 51:       for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));

 53:       n = isz; /* length of the updated is[i] */
 54:       for (brow = 0; brow < mbs; brow++) {
 55:         start = ai[brow];
 56:         end   = ai[brow + 1];
 57:         if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
 58:           for (l = start; l < end; l++) {
 59:             bcol = aj[l];
 60:             if (!PetscBTLookupSet(table_out, bcol)) {
 61:               nidx[isz++] = bcol;
 62:               if (bcol_max < bcol) bcol_max = bcol;
 63:             }
 64:           }
 65:           k++;
 66:           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
 67:         } else {             /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
 68:           for (l = start; l < end; l++) {
 69:             bcol = aj[l];
 70:             if (bcol > bcol_max) break;
 71:             if (PetscBTLookup(table_in, bcol)) {
 72:               if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
 73:               break; /* for l = start; l<end ; l++) */
 74:             }
 75:           }
 76:         }
 77:       }
 78:     } /* for each overlap */
 79:     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
 80:   } /* for each is */
 81:   PetscCall(PetscBTDestroy(&table_out));
 82:   PetscCall(PetscFree(nidx));
 83:   PetscCall(PetscBTDestroy(&table_in));
 84:   PetscFunctionReturn(PETSC_SUCCESS);
 85: }

 87: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
 88:         Zero some ops' to avoid invalid usse */
 89: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
 90: {
 91:   PetscFunctionBegin;
 92:   PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
 93:   Bseq->ops->mult                   = NULL;
 94:   Bseq->ops->multadd                = NULL;
 95:   Bseq->ops->multtranspose          = NULL;
 96:   Bseq->ops->multtransposeadd       = NULL;
 97:   Bseq->ops->lufactor               = NULL;
 98:   Bseq->ops->choleskyfactor         = NULL;
 99:   Bseq->ops->lufactorsymbolic       = NULL;
100:   Bseq->ops->choleskyfactorsymbolic = NULL;
101:   Bseq->ops->getinertia             = NULL;
102:   PetscFunctionReturn(PETSC_SUCCESS);
103: }

105: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107: {
108:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *c;
109:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
111:   const PetscInt *irow, *icol;
112:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113:   PetscInt       *aj = a->j, *ai = a->i;
114:   MatScalar      *mat_a;
115:   Mat             C;
116:   PetscBool       flag;

118:   PetscFunctionBegin;

120:   PetscCall(ISGetIndices(isrow, &irow));
121:   PetscCall(ISGetIndices(iscol, &icol));
122:   PetscCall(ISGetLocalSize(isrow, &nrows));
123:   PetscCall(ISGetLocalSize(iscol, &ncols));

125:   PetscCall(PetscCalloc1(1 + oldcols, &smap));
126:   ssmap = smap;
127:   PetscCall(PetscMalloc1(1 + nrows, &lens));
128:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129:   /* determine lens of each row */
130:   for (i = 0; i < nrows; i++) {
131:     kstart  = ai[irow[i]];
132:     kend    = kstart + a->ilen[irow[i]];
133:     lens[i] = 0;
134:     for (k = kstart; k < kend; k++) {
135:       if (ssmap[aj[k]]) lens[i]++;
136:     }
137:   }
138:   /* Create and fill new matrix */
139:   if (scall == MAT_REUSE_MATRIX) {
140:     c = (Mat_SeqSBAIJ *)((*B)->data);

142:     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143:     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144:     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
145:     PetscCall(PetscArrayzero(c->ilen, c->mbs));
146:     C = *B;
147:   } else {
148:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
149:     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
150:     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
151:     PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152:   }
153:   c = (Mat_SeqSBAIJ *)(C->data);
154:   for (i = 0; i < nrows; i++) {
155:     row      = irow[i];
156:     kstart   = ai[row];
157:     kend     = kstart + a->ilen[row];
158:     mat_i    = c->i[i];
159:     mat_j    = c->j + mat_i;
160:     mat_a    = c->a + mat_i * bs2;
161:     mat_ilen = c->ilen + i;
162:     for (k = kstart; k < kend; k++) {
163:       if ((tcol = ssmap[a->j[k]])) {
164:         *mat_j++ = tcol - 1;
165:         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166:         mat_a += bs2;
167:         (*mat_ilen)++;
168:       }
169:     }
170:   }
171:   /* sort */
172:   {
173:     MatScalar *work;

175:     PetscCall(PetscMalloc1(bs2, &work));
176:     for (i = 0; i < nrows; i++) {
177:       PetscInt ilen;
178:       mat_i = c->i[i];
179:       mat_j = c->j + mat_i;
180:       mat_a = c->a + mat_i * bs2;
181:       ilen  = c->ilen[i];
182:       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183:     }
184:     PetscCall(PetscFree(work));
185:   }

187:   /* Free work space */
188:   PetscCall(ISRestoreIndices(iscol, &icol));
189:   PetscCall(PetscFree(smap));
190:   PetscCall(PetscFree(lens));
191:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
192:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

194:   PetscCall(ISRestoreIndices(isrow, &irow));
195:   *B = C;
196:   PetscFunctionReturn(PETSC_SUCCESS);
197: }

199: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200: {
201:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
202:   IS              is1, is2;
203:   PetscInt       *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
204:   const PetscInt *irow, *icol;

206:   PetscFunctionBegin;
207:   PetscCall(ISGetIndices(isrow, &irow));
208:   PetscCall(ISGetIndices(iscol, &icol));
209:   PetscCall(ISGetLocalSize(isrow, &nrows));
210:   PetscCall(ISGetLocalSize(iscol, &ncols));

212:   /* Verify if the indices correspond to each element in a block
213:    and form the IS with compressed IS */
214:   maxmnbs = PetscMax(a->mbs, a->nbs);
215:   PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
216:   PetscCall(PetscArrayzero(vary, a->mbs));
217:   for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
218:   for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
219:   count = 0;
220:   for (i = 0; i < nrows; i++) {
221:     PetscInt j = irow[i] / bs;
222:     if ((vary[j]--) == bs) iary[count++] = j;
223:   }
224:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));

226:   PetscCall(PetscArrayzero(vary, a->nbs));
227:   for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
228:   for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
229:   count = 0;
230:   for (i = 0; i < ncols; i++) {
231:     PetscInt j = icol[i] / bs;
232:     if ((vary[j]--) == bs) iary[count++] = j;
233:   }
234:   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
235:   PetscCall(ISRestoreIndices(isrow, &irow));
236:   PetscCall(ISRestoreIndices(iscol, &icol));
237:   PetscCall(PetscFree2(vary, iary));

239:   PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
240:   PetscCall(ISDestroy(&is1));
241:   PetscCall(ISDestroy(&is2));

243:   if (isrow != iscol) {
244:     PetscBool isequal;
245:     PetscCall(ISEqual(isrow, iscol, &isequal));
246:     if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
247:   }
248:   PetscFunctionReturn(PETSC_SUCCESS);
249: }

251: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
252: {
253:   PetscInt i;

255:   PetscFunctionBegin;
256:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));

258:   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
259:   PetscFunctionReturn(PETSC_SUCCESS);
260: }

262: /* Should check that shapes of vectors and matrices match */
263: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
264: {
265:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
266:   PetscScalar       *z, x1, x2, zero = 0.0;
267:   const PetscScalar *x, *xb;
268:   const MatScalar   *v;
269:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
270:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
271:   PetscInt           nonzerorow = 0;

273:   PetscFunctionBegin;
274:   PetscCall(VecSet(zz, zero));
275:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
276:   PetscCall(VecGetArrayRead(xx, &x));
277:   PetscCall(VecGetArray(zz, &z));

279:   v  = a->a;
280:   xb = x;

282:   for (i = 0; i < mbs; i++) {
283:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
284:     x1   = xb[0];
285:     x2   = xb[1];
286:     ib   = aj + *ai;
287:     jmin = 0;
288:     nonzerorow += (n > 0);
289:     if (*ib == i) { /* (diag of A)*x */
290:       z[2 * i] += v[0] * x1 + v[2] * x2;
291:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
292:       v += 4;
293:       jmin++;
294:     }
295:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
296:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
297:     for (j = jmin; j < n; j++) {
298:       /* (strict lower triangular part of A)*x  */
299:       cval = ib[j] * 2;
300:       z[cval] += v[0] * x1 + v[1] * x2;
301:       z[cval + 1] += v[2] * x1 + v[3] * x2;
302:       /* (strict upper triangular part of A)*x  */
303:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
304:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
305:       v += 4;
306:     }
307:     xb += 2;
308:     ai++;
309:   }

311:   PetscCall(VecRestoreArrayRead(xx, &x));
312:   PetscCall(VecRestoreArray(zz, &z));
313:   PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
314:   PetscFunctionReturn(PETSC_SUCCESS);
315: }

317: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
318: {
319:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
320:   PetscScalar       *z, x1, x2, x3, zero = 0.0;
321:   const PetscScalar *x, *xb;
322:   const MatScalar   *v;
323:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
324:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
325:   PetscInt           nonzerorow = 0;

327:   PetscFunctionBegin;
328:   PetscCall(VecSet(zz, zero));
329:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
330:   PetscCall(VecGetArrayRead(xx, &x));
331:   PetscCall(VecGetArray(zz, &z));

333:   v  = a->a;
334:   xb = x;

336:   for (i = 0; i < mbs; i++) {
337:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
338:     x1   = xb[0];
339:     x2   = xb[1];
340:     x3   = xb[2];
341:     ib   = aj + *ai;
342:     jmin = 0;
343:     nonzerorow += (n > 0);
344:     if (*ib == i) { /* (diag of A)*x */
345:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
346:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
347:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
348:       v += 9;
349:       jmin++;
350:     }
351:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
352:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353:     for (j = jmin; j < n; j++) {
354:       /* (strict lower triangular part of A)*x  */
355:       cval = ib[j] * 3;
356:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
357:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
358:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
359:       /* (strict upper triangular part of A)*x  */
360:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
361:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
362:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
363:       v += 9;
364:     }
365:     xb += 3;
366:     ai++;
367:   }

369:   PetscCall(VecRestoreArrayRead(xx, &x));
370:   PetscCall(VecRestoreArray(zz, &z));
371:   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
372:   PetscFunctionReturn(PETSC_SUCCESS);
373: }

375: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
376: {
377:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
378:   PetscScalar       *z, x1, x2, x3, x4, zero = 0.0;
379:   const PetscScalar *x, *xb;
380:   const MatScalar   *v;
381:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
382:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
383:   PetscInt           nonzerorow = 0;

385:   PetscFunctionBegin;
386:   PetscCall(VecSet(zz, zero));
387:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
388:   PetscCall(VecGetArrayRead(xx, &x));
389:   PetscCall(VecGetArray(zz, &z));

391:   v  = a->a;
392:   xb = x;

394:   for (i = 0; i < mbs; i++) {
395:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
396:     x1   = xb[0];
397:     x2   = xb[1];
398:     x3   = xb[2];
399:     x4   = xb[3];
400:     ib   = aj + *ai;
401:     jmin = 0;
402:     nonzerorow += (n > 0);
403:     if (*ib == i) { /* (diag of A)*x */
404:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
405:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
406:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
407:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
408:       v += 16;
409:       jmin++;
410:     }
411:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
412:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
413:     for (j = jmin; j < n; j++) {
414:       /* (strict lower triangular part of A)*x  */
415:       cval = ib[j] * 4;
416:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
417:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
418:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
419:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
420:       /* (strict upper triangular part of A)*x  */
421:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
422:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
423:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
424:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
425:       v += 16;
426:     }
427:     xb += 4;
428:     ai++;
429:   }

431:   PetscCall(VecRestoreArrayRead(xx, &x));
432:   PetscCall(VecRestoreArray(zz, &z));
433:   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
434:   PetscFunctionReturn(PETSC_SUCCESS);
435: }

437: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
438: {
439:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
440:   PetscScalar       *z, x1, x2, x3, x4, x5, zero = 0.0;
441:   const PetscScalar *x, *xb;
442:   const MatScalar   *v;
443:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
444:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
445:   PetscInt           nonzerorow = 0;

447:   PetscFunctionBegin;
448:   PetscCall(VecSet(zz, zero));
449:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
450:   PetscCall(VecGetArrayRead(xx, &x));
451:   PetscCall(VecGetArray(zz, &z));

453:   v  = a->a;
454:   xb = x;

456:   for (i = 0; i < mbs; i++) {
457:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
458:     x1   = xb[0];
459:     x2   = xb[1];
460:     x3   = xb[2];
461:     x4   = xb[3];
462:     x5   = xb[4];
463:     ib   = aj + *ai;
464:     jmin = 0;
465:     nonzerorow += (n > 0);
466:     if (*ib == i) { /* (diag of A)*x */
467:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
468:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
469:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
470:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
471:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
472:       v += 25;
473:       jmin++;
474:     }
475:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
476:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
477:     for (j = jmin; j < n; j++) {
478:       /* (strict lower triangular part of A)*x  */
479:       cval = ib[j] * 5;
480:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
481:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
482:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
483:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
484:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
485:       /* (strict upper triangular part of A)*x  */
486:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
487:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
488:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
489:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
490:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
491:       v += 25;
492:     }
493:     xb += 5;
494:     ai++;
495:   }

497:   PetscCall(VecRestoreArrayRead(xx, &x));
498:   PetscCall(VecRestoreArray(zz, &z));
499:   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
500:   PetscFunctionReturn(PETSC_SUCCESS);
501: }

503: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
504: {
505:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
506:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
507:   const PetscScalar *x, *xb;
508:   const MatScalar   *v;
509:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
510:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
511:   PetscInt           nonzerorow = 0;

513:   PetscFunctionBegin;
514:   PetscCall(VecSet(zz, zero));
515:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
516:   PetscCall(VecGetArrayRead(xx, &x));
517:   PetscCall(VecGetArray(zz, &z));

519:   v  = a->a;
520:   xb = x;

522:   for (i = 0; i < mbs; i++) {
523:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
524:     x1   = xb[0];
525:     x2   = xb[1];
526:     x3   = xb[2];
527:     x4   = xb[3];
528:     x5   = xb[4];
529:     x6   = xb[5];
530:     ib   = aj + *ai;
531:     jmin = 0;
532:     nonzerorow += (n > 0);
533:     if (*ib == i) { /* (diag of A)*x */
534:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
535:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
536:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
537:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
538:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
539:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
540:       v += 36;
541:       jmin++;
542:     }
543:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
544:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545:     for (j = jmin; j < n; j++) {
546:       /* (strict lower triangular part of A)*x  */
547:       cval = ib[j] * 6;
548:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
549:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
550:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
551:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
552:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
553:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
554:       /* (strict upper triangular part of A)*x  */
555:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
556:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
557:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
558:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
559:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
560:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
561:       v += 36;
562:     }
563:     xb += 6;
564:     ai++;
565:   }

567:   PetscCall(VecRestoreArrayRead(xx, &x));
568:   PetscCall(VecRestoreArray(zz, &z));
569:   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
570:   PetscFunctionReturn(PETSC_SUCCESS);
571: }

573: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
574: {
575:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
576:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
577:   const PetscScalar *x, *xb;
578:   const MatScalar   *v;
579:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
580:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
581:   PetscInt           nonzerorow = 0;

583:   PetscFunctionBegin;
584:   PetscCall(VecSet(zz, zero));
585:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
586:   PetscCall(VecGetArrayRead(xx, &x));
587:   PetscCall(VecGetArray(zz, &z));

589:   v  = a->a;
590:   xb = x;

592:   for (i = 0; i < mbs; i++) {
593:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
594:     x1   = xb[0];
595:     x2   = xb[1];
596:     x3   = xb[2];
597:     x4   = xb[3];
598:     x5   = xb[4];
599:     x6   = xb[5];
600:     x7   = xb[6];
601:     ib   = aj + *ai;
602:     jmin = 0;
603:     nonzerorow += (n > 0);
604:     if (*ib == i) { /* (diag of A)*x */
605:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
606:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
607:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
608:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
609:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
610:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
611:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
612:       v += 49;
613:       jmin++;
614:     }
615:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
616:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
617:     for (j = jmin; j < n; j++) {
618:       /* (strict lower triangular part of A)*x  */
619:       cval = ib[j] * 7;
620:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
621:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
622:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
623:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
624:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
625:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
626:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
627:       /* (strict upper triangular part of A)*x  */
628:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
629:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
630:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
631:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
632:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
633:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
634:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
635:       v += 49;
636:     }
637:     xb += 7;
638:     ai++;
639:   }
640:   PetscCall(VecRestoreArrayRead(xx, &x));
641:   PetscCall(VecRestoreArray(zz, &z));
642:   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
643:   PetscFunctionReturn(PETSC_SUCCESS);
644: }

646: /*
647:     This will not work with MatScalar == float because it calls the BLAS
648: */
649: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
650: {
651:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
652:   PetscScalar       *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
653:   const PetscScalar *x, *x_ptr, *xb;
654:   const MatScalar   *v;
655:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
656:   const PetscInt    *idx, *aj, *ii;
657:   PetscInt           nonzerorow = 0;

659:   PetscFunctionBegin;
660:   PetscCall(VecSet(zz, zero));
661:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
662:   PetscCall(VecGetArrayRead(xx, &x));
663:   PetscCall(VecGetArray(zz, &z));

665:   x_ptr = x;
666:   z_ptr = z;

668:   aj = a->j;
669:   v  = a->a;
670:   ii = a->i;

672:   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
673:   work = a->mult_work;

675:   for (i = 0; i < mbs; i++) {
676:     n     = ii[1] - ii[0];
677:     ncols = n * bs;
678:     workt = work;
679:     idx   = aj + ii[0];
680:     nonzerorow += (n > 0);

682:     /* upper triangular part */
683:     for (j = 0; j < n; j++) {
684:       xb = x_ptr + bs * (*idx++);
685:       for (k = 0; k < bs; k++) workt[k] = xb[k];
686:       workt += bs;
687:     }
688:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
689:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

691:     /* strict lower triangular part */
692:     idx = aj + ii[0];
693:     if (n && *idx == i) {
694:       ncols -= bs;
695:       v += bs2;
696:       idx++;
697:       n--;
698:     }

700:     if (ncols > 0) {
701:       workt = work;
702:       PetscCall(PetscArrayzero(workt, ncols));
703:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
704:       for (j = 0; j < n; j++) {
705:         zb = z_ptr + bs * (*idx++);
706:         for (k = 0; k < bs; k++) zb[k] += workt[k];
707:         workt += bs;
708:       }
709:     }
710:     x += bs;
711:     v += n * bs2;
712:     z += bs;
713:     ii++;
714:   }

716:   PetscCall(VecRestoreArrayRead(xx, &x));
717:   PetscCall(VecRestoreArray(zz, &z));
718:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
719:   PetscFunctionReturn(PETSC_SUCCESS);
720: }

722: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
723: {
724:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
725:   PetscScalar       *z, x1;
726:   const PetscScalar *x, *xb;
727:   const MatScalar   *v;
728:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
729:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
730:   PetscInt           nonzerorow = 0;
731: #if defined(PETSC_USE_COMPLEX)
732:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
733: #else
734:   const int aconj = 0;
735: #endif

737:   PetscFunctionBegin;
738:   PetscCall(VecCopy(yy, zz));
739:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
740:   PetscCall(VecGetArrayRead(xx, &x));
741:   PetscCall(VecGetArray(zz, &z));
742:   v  = a->a;
743:   xb = x;

745:   for (i = 0; i < mbs; i++) {
746:     n    = ai[1] - ai[0]; /* length of i_th row of A */
747:     x1   = xb[0];
748:     ib   = aj + *ai;
749:     jmin = 0;
750:     nonzerorow += (n > 0);
751:     if (n && *ib == i) { /* (diag of A)*x */
752:       z[i] += *v++ * x[*ib++];
753:       jmin++;
754:     }
755:     if (aconj) {
756:       for (j = jmin; j < n; j++) {
757:         cval = *ib;
758:         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
759:         z[i] += *v++ * x[*ib++];       /* (strict upper triangular part of A)*x  */
760:       }
761:     } else {
762:       for (j = jmin; j < n; j++) {
763:         cval = *ib;
764:         z[cval] += *v * x1;      /* (strict lower triangular part of A)*x  */
765:         z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
766:       }
767:     }
768:     xb++;
769:     ai++;
770:   }

772:   PetscCall(VecRestoreArrayRead(xx, &x));
773:   PetscCall(VecRestoreArray(zz, &z));

775:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
776:   PetscFunctionReturn(PETSC_SUCCESS);
777: }

779: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
780: {
781:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
782:   PetscScalar       *z, x1, x2;
783:   const PetscScalar *x, *xb;
784:   const MatScalar   *v;
785:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
786:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
787:   PetscInt           nonzerorow = 0;

789:   PetscFunctionBegin;
790:   PetscCall(VecCopy(yy, zz));
791:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
792:   PetscCall(VecGetArrayRead(xx, &x));
793:   PetscCall(VecGetArray(zz, &z));

795:   v  = a->a;
796:   xb = x;

798:   for (i = 0; i < mbs; i++) {
799:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
800:     x1   = xb[0];
801:     x2   = xb[1];
802:     ib   = aj + *ai;
803:     jmin = 0;
804:     nonzerorow += (n > 0);
805:     if (n && *ib == i) { /* (diag of A)*x */
806:       z[2 * i] += v[0] * x1 + v[2] * x2;
807:       z[2 * i + 1] += v[2] * x1 + v[3] * x2;
808:       v += 4;
809:       jmin++;
810:     }
811:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
812:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
813:     for (j = jmin; j < n; j++) {
814:       /* (strict lower triangular part of A)*x  */
815:       cval = ib[j] * 2;
816:       z[cval] += v[0] * x1 + v[1] * x2;
817:       z[cval + 1] += v[2] * x1 + v[3] * x2;
818:       /* (strict upper triangular part of A)*x  */
819:       z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
820:       z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
821:       v += 4;
822:     }
823:     xb += 2;
824:     ai++;
825:   }
826:   PetscCall(VecRestoreArrayRead(xx, &x));
827:   PetscCall(VecRestoreArray(zz, &z));

829:   PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
830:   PetscFunctionReturn(PETSC_SUCCESS);
831: }

833: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
834: {
835:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
836:   PetscScalar       *z, x1, x2, x3;
837:   const PetscScalar *x, *xb;
838:   const MatScalar   *v;
839:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
840:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
841:   PetscInt           nonzerorow = 0;

843:   PetscFunctionBegin;
844:   PetscCall(VecCopy(yy, zz));
845:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
846:   PetscCall(VecGetArrayRead(xx, &x));
847:   PetscCall(VecGetArray(zz, &z));

849:   v  = a->a;
850:   xb = x;

852:   for (i = 0; i < mbs; i++) {
853:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
854:     x1   = xb[0];
855:     x2   = xb[1];
856:     x3   = xb[2];
857:     ib   = aj + *ai;
858:     jmin = 0;
859:     nonzerorow += (n > 0);
860:     if (n && *ib == i) { /* (diag of A)*x */
861:       z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
862:       z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
863:       z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
864:       v += 9;
865:       jmin++;
866:     }
867:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
868:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
869:     for (j = jmin; j < n; j++) {
870:       /* (strict lower triangular part of A)*x  */
871:       cval = ib[j] * 3;
872:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
873:       z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
874:       z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
875:       /* (strict upper triangular part of A)*x  */
876:       z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
877:       z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
878:       z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
879:       v += 9;
880:     }
881:     xb += 3;
882:     ai++;
883:   }

885:   PetscCall(VecRestoreArrayRead(xx, &x));
886:   PetscCall(VecRestoreArray(zz, &z));

888:   PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
889:   PetscFunctionReturn(PETSC_SUCCESS);
890: }

892: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
893: {
894:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
895:   PetscScalar       *z, x1, x2, x3, x4;
896:   const PetscScalar *x, *xb;
897:   const MatScalar   *v;
898:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
899:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
900:   PetscInt           nonzerorow = 0;

902:   PetscFunctionBegin;
903:   PetscCall(VecCopy(yy, zz));
904:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
905:   PetscCall(VecGetArrayRead(xx, &x));
906:   PetscCall(VecGetArray(zz, &z));

908:   v  = a->a;
909:   xb = x;

911:   for (i = 0; i < mbs; i++) {
912:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
913:     x1   = xb[0];
914:     x2   = xb[1];
915:     x3   = xb[2];
916:     x4   = xb[3];
917:     ib   = aj + *ai;
918:     jmin = 0;
919:     nonzerorow += (n > 0);
920:     if (n && *ib == i) { /* (diag of A)*x */
921:       z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
922:       z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
923:       z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
924:       z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
925:       v += 16;
926:       jmin++;
927:     }
928:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
929:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
930:     for (j = jmin; j < n; j++) {
931:       /* (strict lower triangular part of A)*x  */
932:       cval = ib[j] * 4;
933:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
934:       z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
935:       z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
936:       z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
937:       /* (strict upper triangular part of A)*x  */
938:       z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
939:       z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
940:       z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
941:       z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
942:       v += 16;
943:     }
944:     xb += 4;
945:     ai++;
946:   }

948:   PetscCall(VecRestoreArrayRead(xx, &x));
949:   PetscCall(VecRestoreArray(zz, &z));

951:   PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
952:   PetscFunctionReturn(PETSC_SUCCESS);
953: }

955: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
956: {
957:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
958:   PetscScalar       *z, x1, x2, x3, x4, x5;
959:   const PetscScalar *x, *xb;
960:   const MatScalar   *v;
961:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
962:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
963:   PetscInt           nonzerorow = 0;

965:   PetscFunctionBegin;
966:   PetscCall(VecCopy(yy, zz));
967:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
968:   PetscCall(VecGetArrayRead(xx, &x));
969:   PetscCall(VecGetArray(zz, &z));

971:   v  = a->a;
972:   xb = x;

974:   for (i = 0; i < mbs; i++) {
975:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
976:     x1   = xb[0];
977:     x2   = xb[1];
978:     x3   = xb[2];
979:     x4   = xb[3];
980:     x5   = xb[4];
981:     ib   = aj + *ai;
982:     jmin = 0;
983:     nonzerorow += (n > 0);
984:     if (n && *ib == i) { /* (diag of A)*x */
985:       z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
986:       z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
987:       z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
988:       z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
989:       z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
990:       v += 25;
991:       jmin++;
992:     }
993:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
994:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
995:     for (j = jmin; j < n; j++) {
996:       /* (strict lower triangular part of A)*x  */
997:       cval = ib[j] * 5;
998:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
999:       z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1000:       z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1001:       z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1002:       z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1003:       /* (strict upper triangular part of A)*x  */
1004:       z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1005:       z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1006:       z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1007:       z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1008:       z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1009:       v += 25;
1010:     }
1011:     xb += 5;
1012:     ai++;
1013:   }

1015:   PetscCall(VecRestoreArrayRead(xx, &x));
1016:   PetscCall(VecRestoreArray(zz, &z));

1018:   PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1019:   PetscFunctionReturn(PETSC_SUCCESS);
1020: }

1022: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1023: {
1024:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1025:   PetscScalar       *z, x1, x2, x3, x4, x5, x6;
1026:   const PetscScalar *x, *xb;
1027:   const MatScalar   *v;
1028:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1029:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1030:   PetscInt           nonzerorow = 0;

1032:   PetscFunctionBegin;
1033:   PetscCall(VecCopy(yy, zz));
1034:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1035:   PetscCall(VecGetArrayRead(xx, &x));
1036:   PetscCall(VecGetArray(zz, &z));

1038:   v  = a->a;
1039:   xb = x;

1041:   for (i = 0; i < mbs; i++) {
1042:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1043:     x1   = xb[0];
1044:     x2   = xb[1];
1045:     x3   = xb[2];
1046:     x4   = xb[3];
1047:     x5   = xb[4];
1048:     x6   = xb[5];
1049:     ib   = aj + *ai;
1050:     jmin = 0;
1051:     nonzerorow += (n > 0);
1052:     if (n && *ib == i) { /* (diag of A)*x */
1053:       z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1054:       z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1055:       z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1056:       z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1057:       z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1058:       z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1059:       v += 36;
1060:       jmin++;
1061:     }
1062:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1063:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1064:     for (j = jmin; j < n; j++) {
1065:       /* (strict lower triangular part of A)*x  */
1066:       cval = ib[j] * 6;
1067:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1068:       z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1069:       z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1070:       z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1071:       z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1072:       z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1073:       /* (strict upper triangular part of A)*x  */
1074:       z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1075:       z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1076:       z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1077:       z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1078:       z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1079:       z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1080:       v += 36;
1081:     }
1082:     xb += 6;
1083:     ai++;
1084:   }

1086:   PetscCall(VecRestoreArrayRead(xx, &x));
1087:   PetscCall(VecRestoreArray(zz, &z));

1089:   PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1090:   PetscFunctionReturn(PETSC_SUCCESS);
1091: }

1093: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1094: {
1095:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1096:   PetscScalar       *z, x1, x2, x3, x4, x5, x6, x7;
1097:   const PetscScalar *x, *xb;
1098:   const MatScalar   *v;
1099:   PetscInt           mbs = a->mbs, i, n, cval, j, jmin;
1100:   const PetscInt    *aj = a->j, *ai = a->i, *ib;
1101:   PetscInt           nonzerorow = 0;

1103:   PetscFunctionBegin;
1104:   PetscCall(VecCopy(yy, zz));
1105:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1106:   PetscCall(VecGetArrayRead(xx, &x));
1107:   PetscCall(VecGetArray(zz, &z));

1109:   v  = a->a;
1110:   xb = x;

1112:   for (i = 0; i < mbs; i++) {
1113:     n    = ai[1] - ai[0]; /* length of i_th block row of A */
1114:     x1   = xb[0];
1115:     x2   = xb[1];
1116:     x3   = xb[2];
1117:     x4   = xb[3];
1118:     x5   = xb[4];
1119:     x6   = xb[5];
1120:     x7   = xb[6];
1121:     ib   = aj + *ai;
1122:     jmin = 0;
1123:     nonzerorow += (n > 0);
1124:     if (n && *ib == i) { /* (diag of A)*x */
1125:       z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1126:       z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1127:       z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1128:       z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1129:       z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1130:       z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1131:       z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1132:       v += 49;
1133:       jmin++;
1134:     }
1135:     PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1136:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1137:     for (j = jmin; j < n; j++) {
1138:       /* (strict lower triangular part of A)*x  */
1139:       cval = ib[j] * 7;
1140:       z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1141:       z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1142:       z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1143:       z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1144:       z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1145:       z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1146:       z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1147:       /* (strict upper triangular part of A)*x  */
1148:       z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1149:       z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1150:       z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1151:       z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1152:       z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1153:       z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1154:       z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1155:       v += 49;
1156:     }
1157:     xb += 7;
1158:     ai++;
1159:   }

1161:   PetscCall(VecRestoreArrayRead(xx, &x));
1162:   PetscCall(VecRestoreArray(zz, &z));

1164:   PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1165:   PetscFunctionReturn(PETSC_SUCCESS);
1166: }

1168: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1169: {
1170:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1171:   PetscScalar       *z, *z_ptr = NULL, *zb, *work, *workt;
1172:   const PetscScalar *x, *x_ptr, *xb;
1173:   const MatScalar   *v;
1174:   PetscInt           mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1175:   const PetscInt    *idx, *aj, *ii;
1176:   PetscInt           nonzerorow = 0;

1178:   PetscFunctionBegin;
1179:   PetscCall(VecCopy(yy, zz));
1180:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1181:   PetscCall(VecGetArrayRead(xx, &x));
1182:   x_ptr = x;
1183:   PetscCall(VecGetArray(zz, &z));
1184:   z_ptr = z;

1186:   aj = a->j;
1187:   v  = a->a;
1188:   ii = a->i;

1190:   if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1191:   work = a->mult_work;

1193:   for (i = 0; i < mbs; i++) {
1194:     n     = ii[1] - ii[0];
1195:     ncols = n * bs;
1196:     workt = work;
1197:     idx   = aj + ii[0];
1198:     nonzerorow += (n > 0);

1200:     /* upper triangular part */
1201:     for (j = 0; j < n; j++) {
1202:       xb = x_ptr + bs * (*idx++);
1203:       for (k = 0; k < bs; k++) workt[k] = xb[k];
1204:       workt += bs;
1205:     }
1206:     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1207:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);

1209:     /* strict lower triangular part */
1210:     idx = aj + ii[0];
1211:     if (n && *idx == i) {
1212:       ncols -= bs;
1213:       v += bs2;
1214:       idx++;
1215:       n--;
1216:     }
1217:     if (ncols > 0) {
1218:       workt = work;
1219:       PetscCall(PetscArrayzero(workt, ncols));
1220:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1221:       for (j = 0; j < n; j++) {
1222:         zb = z_ptr + bs * (*idx++);
1223:         for (k = 0; k < bs; k++) zb[k] += workt[k];
1224:         workt += bs;
1225:       }
1226:     }

1228:     x += bs;
1229:     v += n * bs2;
1230:     z += bs;
1231:     ii++;
1232:   }

1234:   PetscCall(VecRestoreArrayRead(xx, &x));
1235:   PetscCall(VecRestoreArray(zz, &z));

1237:   PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1238:   PetscFunctionReturn(PETSC_SUCCESS);
1239: }

1241: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1242: {
1243:   Mat_SeqSBAIJ *a      = (Mat_SeqSBAIJ *)inA->data;
1244:   PetscScalar   oalpha = alpha;
1245:   PetscBLASInt  one    = 1, totalnz;

1247:   PetscFunctionBegin;
1248:   PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1249:   PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1250:   PetscCall(PetscLogFlops(totalnz));
1251:   PetscFunctionReturn(PETSC_SUCCESS);
1252: }

1254: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1255: {
1256:   Mat_SeqSBAIJ    *a        = (Mat_SeqSBAIJ *)A->data;
1257:   const MatScalar *v        = a->a;
1258:   PetscReal        sum_diag = 0.0, sum_off = 0.0, *sum;
1259:   PetscInt         i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1260:   const PetscInt  *aj = a->j, *col;

1262:   PetscFunctionBegin;
1263:   if (!a->nz) {
1264:     *norm = 0.0;
1265:     PetscFunctionReturn(PETSC_SUCCESS);
1266:   }
1267:   if (type == NORM_FROBENIUS) {
1268:     for (k = 0; k < mbs; k++) {
1269:       jmin = a->i[k];
1270:       jmax = a->i[k + 1];
1271:       col  = aj + jmin;
1272:       if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1273:         for (i = 0; i < bs2; i++) {
1274:           sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1275:           v++;
1276:         }
1277:         jmin++;
1278:       }
1279:       for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1280:         for (i = 0; i < bs2; i++) {
1281:           sum_off += PetscRealPart(PetscConj(*v) * (*v));
1282:           v++;
1283:         }
1284:       }
1285:     }
1286:     *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1287:     PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1288:   } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1289:     PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1290:     for (i = 0; i < mbs; i++) jl[i] = mbs;
1291:     il[0] = 0;

1293:     *norm = 0.0;
1294:     for (k = 0; k < mbs; k++) { /* k_th block row */
1295:       for (j = 0; j < bs; j++) sum[j] = 0.0;
1296:       /*-- col sum --*/
1297:       i = jl[k]; /* first |A(i,k)| to be added */
1298:       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1299:                   at step k */
1300:       while (i < mbs) {
1301:         nexti = jl[i]; /* next block row to be added */
1302:         ik    = il[i]; /* block index of A(i,k) in the array a */
1303:         for (j = 0; j < bs; j++) {
1304:           v = a->a + ik * bs2 + j * bs;
1305:           for (k1 = 0; k1 < bs; k1++) {
1306:             sum[j] += PetscAbsScalar(*v);
1307:             v++;
1308:           }
1309:         }
1310:         /* update il, jl */
1311:         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1312:         jmax = a->i[i + 1];
1313:         if (jmin < jmax) {
1314:           il[i] = jmin;
1315:           j     = a->j[jmin];
1316:           jl[i] = jl[j];
1317:           jl[j] = i;
1318:         }
1319:         i = nexti;
1320:       }
1321:       /*-- row sum --*/
1322:       jmin = a->i[k];
1323:       jmax = a->i[k + 1];
1324:       for (i = jmin; i < jmax; i++) {
1325:         for (j = 0; j < bs; j++) {
1326:           v = a->a + i * bs2 + j;
1327:           for (k1 = 0; k1 < bs; k1++) {
1328:             sum[j] += PetscAbsScalar(*v);
1329:             v += bs;
1330:           }
1331:         }
1332:       }
1333:       /* add k_th block row to il, jl */
1334:       col = aj + jmin;
1335:       if (jmax - jmin > 0 && *col == k) jmin++;
1336:       if (jmin < jmax) {
1337:         il[k] = jmin;
1338:         j     = a->j[jmin];
1339:         jl[k] = jl[j];
1340:         jl[j] = k;
1341:       }
1342:       for (j = 0; j < bs; j++) {
1343:         if (sum[j] > *norm) *norm = sum[j];
1344:       }
1345:     }
1346:     PetscCall(PetscFree3(sum, il, jl));
1347:     PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1348:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1349:   PetscFunctionReturn(PETSC_SUCCESS);
1350: }

1352: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1353: {
1354:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;

1356:   PetscFunctionBegin;
1357:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1358:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1359:     *flg = PETSC_FALSE;
1360:     PetscFunctionReturn(PETSC_SUCCESS);
1361:   }

1363:   /* if the a->i are the same */
1364:   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1365:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1367:   /* if a->j are the same */
1368:   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1369:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

1371:   /* if a->a are the same */
1372:   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1373:   PetscFunctionReturn(PETSC_SUCCESS);
1374: }

1376: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1377: {
1378:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1379:   PetscInt         i, j, k, row, bs, ambs, bs2;
1380:   const PetscInt  *ai, *aj;
1381:   PetscScalar     *x, zero = 0.0;
1382:   const MatScalar *aa, *aa_j;

1384:   PetscFunctionBegin;
1385:   bs = A->rmap->bs;
1386:   PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");

1388:   aa   = a->a;
1389:   ambs = a->mbs;

1391:   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1392:     PetscInt *diag = a->diag;
1393:     aa             = a->a;
1394:     ambs           = a->mbs;
1395:     PetscCall(VecGetArray(v, &x));
1396:     for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1397:     PetscCall(VecRestoreArray(v, &x));
1398:     PetscFunctionReturn(PETSC_SUCCESS);
1399:   }

1401:   ai  = a->i;
1402:   aj  = a->j;
1403:   bs2 = a->bs2;
1404:   PetscCall(VecSet(v, zero));
1405:   if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1406:   PetscCall(VecGetArray(v, &x));
1407:   for (i = 0; i < ambs; i++) {
1408:     j = ai[i];
1409:     if (aj[j] == i) { /* if this is a diagonal element */
1410:       row  = i * bs;
1411:       aa_j = aa + j * bs2;
1412:       for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1413:     }
1414:   }
1415:   PetscCall(VecRestoreArray(v, &x));
1416:   PetscFunctionReturn(PETSC_SUCCESS);
1417: }

1419: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1420: {
1421:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1422:   PetscScalar        x;
1423:   const PetscScalar *l, *li, *ri;
1424:   MatScalar         *aa, *v;
1425:   PetscInt           i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1426:   const PetscInt    *ai, *aj;
1427:   PetscBool          flg;

1429:   PetscFunctionBegin;
1430:   if (ll != rr) {
1431:     PetscCall(VecEqual(ll, rr, &flg));
1432:     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1433:   }
1434:   if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1435:   ai  = a->i;
1436:   aj  = a->j;
1437:   aa  = a->a;
1438:   m   = A->rmap->N;
1439:   bs  = A->rmap->bs;
1440:   mbs = a->mbs;
1441:   bs2 = a->bs2;

1443:   PetscCall(VecGetArrayRead(ll, &l));
1444:   PetscCall(VecGetLocalSize(ll, &lm));
1445:   PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1446:   for (i = 0; i < mbs; i++) { /* for each block row */
1447:     M  = ai[i + 1] - ai[i];
1448:     li = l + i * bs;
1449:     v  = aa + bs2 * ai[i];
1450:     for (j = 0; j < M; j++) { /* for each block */
1451:       ri = l + bs * aj[ai[i] + j];
1452:       for (k = 0; k < bs; k++) {
1453:         x = ri[k];
1454:         for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1455:       }
1456:     }
1457:   }
1458:   PetscCall(VecRestoreArrayRead(ll, &l));
1459:   PetscCall(PetscLogFlops(2.0 * a->nz));
1460:   PetscFunctionReturn(PETSC_SUCCESS);
1461: }

1463: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1464: {
1465:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1467:   PetscFunctionBegin;
1468:   info->block_size   = a->bs2;
1469:   info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1470:   info->nz_used      = a->bs2 * a->nz;    /*num. of nonzeros in upper triangular part */
1471:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1472:   info->assemblies   = A->num_ass;
1473:   info->mallocs      = A->info.mallocs;
1474:   info->memory       = 0; /* REVIEW ME */
1475:   if (A->factortype) {
1476:     info->fill_ratio_given  = A->info.fill_ratio_given;
1477:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1478:     info->factor_mallocs    = A->info.factor_mallocs;
1479:   } else {
1480:     info->fill_ratio_given  = 0;
1481:     info->fill_ratio_needed = 0;
1482:     info->factor_mallocs    = 0;
1483:   }
1484:   PetscFunctionReturn(PETSC_SUCCESS);
1485: }

1487: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1488: {
1489:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;

1491:   PetscFunctionBegin;
1492:   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1493:   PetscFunctionReturn(PETSC_SUCCESS);
1494: }

1496: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1497: {
1498:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1499:   PetscInt         i, j, n, row, col, bs, mbs;
1500:   const PetscInt  *ai, *aj;
1501:   PetscReal        atmp;
1502:   const MatScalar *aa;
1503:   PetscScalar     *x;
1504:   PetscInt         ncols, brow, bcol, krow, kcol;

1506:   PetscFunctionBegin;
1507:   PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1508:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1509:   bs  = A->rmap->bs;
1510:   aa  = a->a;
1511:   ai  = a->i;
1512:   aj  = a->j;
1513:   mbs = a->mbs;

1515:   PetscCall(VecSet(v, 0.0));
1516:   PetscCall(VecGetArray(v, &x));
1517:   PetscCall(VecGetLocalSize(v, &n));
1518:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1519:   for (i = 0; i < mbs; i++) {
1520:     ncols = ai[1] - ai[0];
1521:     ai++;
1522:     brow = bs * i;
1523:     for (j = 0; j < ncols; j++) {
1524:       bcol = bs * (*aj);
1525:       for (kcol = 0; kcol < bs; kcol++) {
1526:         col = bcol + kcol; /* col index */
1527:         for (krow = 0; krow < bs; krow++) {
1528:           atmp = PetscAbsScalar(*aa);
1529:           aa++;
1530:           row = brow + krow; /* row index */
1531:           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1532:           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1533:         }
1534:       }
1535:       aj++;
1536:     }
1537:   }
1538:   PetscCall(VecRestoreArray(v, &x));
1539:   PetscFunctionReturn(PETSC_SUCCESS);
1540: }

1542: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1543: {
1544:   PetscFunctionBegin;
1545:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1546:   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1547:   PetscFunctionReturn(PETSC_SUCCESS);
1548: }

1550: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1551: {
1552:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1553:   PetscScalar       *z = c;
1554:   const PetscScalar *xb;
1555:   PetscScalar        x1;
1556:   const MatScalar   *v   = a->a, *vv;
1557:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1558: #if defined(PETSC_USE_COMPLEX)
1559:   const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1560: #else
1561:   const int aconj = 0;
1562: #endif

1564:   PetscFunctionBegin;
1565:   for (i = 0; i < mbs; i++) {
1566:     n = ii[1] - ii[0];
1567:     ii++;
1568:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1569:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1570:     jj = idx;
1571:     vv = v;
1572:     for (k = 0; k < cn; k++) {
1573:       idx = jj;
1574:       v   = vv;
1575:       for (j = 0; j < n; j++) {
1576:         xb = b + (*idx);
1577:         x1 = xb[0 + k * bm];
1578:         z[0 + k * cm] += v[0] * x1;
1579:         if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1580:         v += 1;
1581:         ++idx;
1582:       }
1583:     }
1584:     z += 1;
1585:   }
1586:   PetscFunctionReturn(PETSC_SUCCESS);
1587: }

1589: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1590: {
1591:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1592:   PetscScalar       *z = c;
1593:   const PetscScalar *xb;
1594:   PetscScalar        x1, x2;
1595:   const MatScalar   *v   = a->a, *vv;
1596:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1598:   PetscFunctionBegin;
1599:   for (i = 0; i < mbs; i++) {
1600:     n = ii[1] - ii[0];
1601:     ii++;
1602:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1603:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1604:     jj = idx;
1605:     vv = v;
1606:     for (k = 0; k < cn; k++) {
1607:       idx = jj;
1608:       v   = vv;
1609:       for (j = 0; j < n; j++) {
1610:         xb = b + 2 * (*idx);
1611:         x1 = xb[0 + k * bm];
1612:         x2 = xb[1 + k * bm];
1613:         z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1614:         z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1615:         if (*idx != i) {
1616:           c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1617:           c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1618:         }
1619:         v += 4;
1620:         ++idx;
1621:       }
1622:     }
1623:     z += 2;
1624:   }
1625:   PetscFunctionReturn(PETSC_SUCCESS);
1626: }

1628: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1629: {
1630:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1631:   PetscScalar       *z = c;
1632:   const PetscScalar *xb;
1633:   PetscScalar        x1, x2, x3;
1634:   const MatScalar   *v   = a->a, *vv;
1635:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1637:   PetscFunctionBegin;
1638:   for (i = 0; i < mbs; i++) {
1639:     n = ii[1] - ii[0];
1640:     ii++;
1641:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
1642:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1643:     jj = idx;
1644:     vv = v;
1645:     for (k = 0; k < cn; k++) {
1646:       idx = jj;
1647:       v   = vv;
1648:       for (j = 0; j < n; j++) {
1649:         xb = b + 3 * (*idx);
1650:         x1 = xb[0 + k * bm];
1651:         x2 = xb[1 + k * bm];
1652:         x3 = xb[2 + k * bm];
1653:         z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1654:         z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1655:         z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1656:         if (*idx != i) {
1657:           c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1658:           c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1659:           c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1660:         }
1661:         v += 9;
1662:         ++idx;
1663:       }
1664:     }
1665:     z += 3;
1666:   }
1667:   PetscFunctionReturn(PETSC_SUCCESS);
1668: }

1670: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1671: {
1672:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1673:   PetscScalar       *z = c;
1674:   const PetscScalar *xb;
1675:   PetscScalar        x1, x2, x3, x4;
1676:   const MatScalar   *v   = a->a, *vv;
1677:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1679:   PetscFunctionBegin;
1680:   for (i = 0; i < mbs; i++) {
1681:     n = ii[1] - ii[0];
1682:     ii++;
1683:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1684:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1685:     jj = idx;
1686:     vv = v;
1687:     for (k = 0; k < cn; k++) {
1688:       idx = jj;
1689:       v   = vv;
1690:       for (j = 0; j < n; j++) {
1691:         xb = b + 4 * (*idx);
1692:         x1 = xb[0 + k * bm];
1693:         x2 = xb[1 + k * bm];
1694:         x3 = xb[2 + k * bm];
1695:         x4 = xb[3 + k * bm];
1696:         z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1697:         z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1698:         z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1699:         z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1700:         if (*idx != i) {
1701:           c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1702:           c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1703:           c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1704:           c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1705:         }
1706:         v += 16;
1707:         ++idx;
1708:       }
1709:     }
1710:     z += 4;
1711:   }
1712:   PetscFunctionReturn(PETSC_SUCCESS);
1713: }

1715: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1716: {
1717:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1718:   PetscScalar       *z = c;
1719:   const PetscScalar *xb;
1720:   PetscScalar        x1, x2, x3, x4, x5;
1721:   const MatScalar   *v   = a->a, *vv;
1722:   PetscInt           mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;

1724:   PetscFunctionBegin;
1725:   for (i = 0; i < mbs; i++) {
1726:     n = ii[1] - ii[0];
1727:     ii++;
1728:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
1729:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1730:     jj = idx;
1731:     vv = v;
1732:     for (k = 0; k < cn; k++) {
1733:       idx = jj;
1734:       v   = vv;
1735:       for (j = 0; j < n; j++) {
1736:         xb = b + 5 * (*idx);
1737:         x1 = xb[0 + k * bm];
1738:         x2 = xb[1 + k * bm];
1739:         x3 = xb[2 + k * bm];
1740:         x4 = xb[3 + k * bm];
1741:         x5 = xb[4 + k * cm];
1742:         z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1743:         z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1744:         z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1745:         z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1746:         z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1747:         if (*idx != i) {
1748:           c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1749:           c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1750:           c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1751:           c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1752:           c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1753:         }
1754:         v += 25;
1755:         ++idx;
1756:       }
1757:     }
1758:     z += 5;
1759:   }
1760:   PetscFunctionReturn(PETSC_SUCCESS);
1761: }

1763: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1764: {
1765:   Mat_SeqSBAIJ    *a  = (Mat_SeqSBAIJ *)A->data;
1766:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
1767:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
1768:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1769:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1770:   PetscBLASInt     bbs, bcn, bbm, bcm;
1771:   PetscScalar     *z = NULL;
1772:   PetscScalar     *c, *b;
1773:   const MatScalar *v;
1774:   const PetscInt  *idx, *ii;
1775:   PetscScalar      _DOne = 1.0;

1777:   PetscFunctionBegin;
1778:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1779:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1780:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1781:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1782:   b = bd->v;
1783:   PetscCall(MatZeroEntries(C));
1784:   PetscCall(MatDenseGetArray(C, &c));
1785:   switch (bs) {
1786:   case 1:
1787:     PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1788:     break;
1789:   case 2:
1790:     PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1791:     break;
1792:   case 3:
1793:     PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1794:     break;
1795:   case 4:
1796:     PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1797:     break;
1798:   case 5:
1799:     PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1800:     break;
1801:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1802:     PetscCall(PetscBLASIntCast(bs, &bbs));
1803:     PetscCall(PetscBLASIntCast(cn, &bcn));
1804:     PetscCall(PetscBLASIntCast(bm, &bbm));
1805:     PetscCall(PetscBLASIntCast(cm, &bcm));
1806:     idx = a->j;
1807:     v   = a->a;
1808:     mbs = a->mbs;
1809:     ii  = a->i;
1810:     z   = c;
1811:     for (i = 0; i < mbs; i++) {
1812:       n = ii[1] - ii[0];
1813:       ii++;
1814:       for (j = 0; j < n; j++) {
1815:         if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1816:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1817:         v += bs2;
1818:       }
1819:       z += bs;
1820:     }
1821:   }
1822:   PetscCall(MatDenseRestoreArray(C, &c));
1823:   PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1824:   PetscFunctionReturn(PETSC_SUCCESS);
1825: }