Actual source code: sbaij2.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
5: #include <petsc/private/kernels/blockinvert.h>
6: #include <petscbt.h>
7: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10: {
11: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
12: PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
13: const PetscInt *idx;
14: PetscBT table_out, table_in;
16: PetscFunctionBegin;
17: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
18: mbs = a->mbs;
19: ai = a->i;
20: aj = a->j;
21: bs = A->rmap->bs;
22: PetscCall(PetscBTCreate(mbs, &table_out));
23: PetscCall(PetscMalloc1(mbs + 1, &nidx));
24: PetscCall(PetscBTCreate(mbs, &table_in));
26: for (i = 0; i < is_max; i++) { /* for each is */
27: isz = 0;
28: PetscCall(PetscBTMemzero(mbs, table_out));
30: /* Extract the indices, assume there can be duplicate entries */
31: PetscCall(ISGetIndices(is[i], &idx));
32: PetscCall(ISGetLocalSize(is[i], &n));
34: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35: bcol_max = 0;
36: for (j = 0; j < n; ++j) {
37: brow = idx[j] / bs; /* convert the indices into block indices */
38: PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
39: if (!PetscBTLookupSet(table_out, brow)) {
40: nidx[isz++] = brow;
41: if (bcol_max < brow) bcol_max = brow;
42: }
43: }
44: PetscCall(ISRestoreIndices(is[i], &idx));
45: PetscCall(ISDestroy(&is[i]));
47: k = 0;
48: for (j = 0; j < ov; j++) { /* for each overlap */
49: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50: PetscCall(PetscBTMemzero(mbs, table_in));
51: for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
53: n = isz; /* length of the updated is[i] */
54: for (brow = 0; brow < mbs; brow++) {
55: start = ai[brow];
56: end = ai[brow + 1];
57: if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58: for (l = start; l < end; l++) {
59: bcol = aj[l];
60: if (!PetscBTLookupSet(table_out, bcol)) {
61: nidx[isz++] = bcol;
62: if (bcol_max < bcol) bcol_max = bcol;
63: }
64: }
65: k++;
66: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67: } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
68: for (l = start; l < end; l++) {
69: bcol = aj[l];
70: if (bcol > bcol_max) break;
71: if (PetscBTLookup(table_in, bcol)) {
72: if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73: break; /* for l = start; l<end ; l++) */
74: }
75: }
76: }
77: }
78: } /* for each overlap */
79: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
80: } /* for each is */
81: PetscCall(PetscBTDestroy(&table_out));
82: PetscCall(PetscFree(nidx));
83: PetscCall(PetscBTDestroy(&table_in));
84: PetscFunctionReturn(PETSC_SUCCESS);
85: }
87: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
88: Zero some ops' to avoid invalid usse */
89: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
90: {
91: PetscFunctionBegin;
92: PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
93: Bseq->ops->mult = NULL;
94: Bseq->ops->multadd = NULL;
95: Bseq->ops->multtranspose = NULL;
96: Bseq->ops->multtransposeadd = NULL;
97: Bseq->ops->lufactor = NULL;
98: Bseq->ops->choleskyfactor = NULL;
99: Bseq->ops->lufactorsymbolic = NULL;
100: Bseq->ops->choleskyfactorsymbolic = NULL;
101: Bseq->ops->getinertia = NULL;
102: PetscFunctionReturn(PETSC_SUCCESS);
103: }
105: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
106: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
107: {
108: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c;
109: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
111: const PetscInt *irow, *icol;
112: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113: PetscInt *aj = a->j, *ai = a->i;
114: MatScalar *mat_a;
115: Mat C;
116: PetscBool flag;
118: PetscFunctionBegin;
120: PetscCall(ISGetIndices(isrow, &irow));
121: PetscCall(ISGetIndices(iscol, &icol));
122: PetscCall(ISGetLocalSize(isrow, &nrows));
123: PetscCall(ISGetLocalSize(iscol, &ncols));
125: PetscCall(PetscCalloc1(1 + oldcols, &smap));
126: ssmap = smap;
127: PetscCall(PetscMalloc1(1 + nrows, &lens));
128: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
129: /* determine lens of each row */
130: for (i = 0; i < nrows; i++) {
131: kstart = ai[irow[i]];
132: kend = kstart + a->ilen[irow[i]];
133: lens[i] = 0;
134: for (k = kstart; k < kend; k++) {
135: if (ssmap[aj[k]]) lens[i]++;
136: }
137: }
138: /* Create and fill new matrix */
139: if (scall == MAT_REUSE_MATRIX) {
140: c = (Mat_SeqSBAIJ *)((*B)->data);
142: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong no of nonzeros");
145: PetscCall(PetscArrayzero(c->ilen, c->mbs));
146: C = *B;
147: } else {
148: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
149: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
150: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
151: PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
152: }
153: c = (Mat_SeqSBAIJ *)(C->data);
154: for (i = 0; i < nrows; i++) {
155: row = irow[i];
156: kstart = ai[row];
157: kend = kstart + a->ilen[row];
158: mat_i = c->i[i];
159: mat_j = c->j + mat_i;
160: mat_a = c->a + mat_i * bs2;
161: mat_ilen = c->ilen + i;
162: for (k = kstart; k < kend; k++) {
163: if ((tcol = ssmap[a->j[k]])) {
164: *mat_j++ = tcol - 1;
165: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
166: mat_a += bs2;
167: (*mat_ilen)++;
168: }
169: }
170: }
171: /* sort */
172: {
173: MatScalar *work;
175: PetscCall(PetscMalloc1(bs2, &work));
176: for (i = 0; i < nrows; i++) {
177: PetscInt ilen;
178: mat_i = c->i[i];
179: mat_j = c->j + mat_i;
180: mat_a = c->a + mat_i * bs2;
181: ilen = c->ilen[i];
182: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
183: }
184: PetscCall(PetscFree(work));
185: }
187: /* Free work space */
188: PetscCall(ISRestoreIndices(iscol, &icol));
189: PetscCall(PetscFree(smap));
190: PetscCall(PetscFree(lens));
191: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
192: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
194: PetscCall(ISRestoreIndices(isrow, &irow));
195: *B = C;
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
199: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
200: {
201: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
202: IS is1, is2;
203: PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
204: const PetscInt *irow, *icol;
206: PetscFunctionBegin;
207: PetscCall(ISGetIndices(isrow, &irow));
208: PetscCall(ISGetIndices(iscol, &icol));
209: PetscCall(ISGetLocalSize(isrow, &nrows));
210: PetscCall(ISGetLocalSize(iscol, &ncols));
212: /* Verify if the indices correspond to each element in a block
213: and form the IS with compressed IS */
214: maxmnbs = PetscMax(a->mbs, a->nbs);
215: PetscCall(PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary));
216: PetscCall(PetscArrayzero(vary, a->mbs));
217: for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
218: for (i = 0; i < a->mbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Index set does not match blocks");
219: count = 0;
220: for (i = 0; i < nrows; i++) {
221: PetscInt j = irow[i] / bs;
222: if ((vary[j]--) == bs) iary[count++] = j;
223: }
224: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1));
226: PetscCall(PetscArrayzero(vary, a->nbs));
227: for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
228: for (i = 0; i < a->nbs; i++) PetscCheck(vary[i] == 0 || vary[i] == bs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal error in PETSc");
229: count = 0;
230: for (i = 0; i < ncols; i++) {
231: PetscInt j = icol[i] / bs;
232: if ((vary[j]--) == bs) iary[count++] = j;
233: }
234: PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2));
235: PetscCall(ISRestoreIndices(isrow, &irow));
236: PetscCall(ISRestoreIndices(iscol, &icol));
237: PetscCall(PetscFree2(vary, iary));
239: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B));
240: PetscCall(ISDestroy(&is1));
241: PetscCall(ISDestroy(&is2));
243: if (isrow != iscol) {
244: PetscBool isequal;
245: PetscCall(ISEqual(isrow, iscol, &isequal));
246: if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
247: }
248: PetscFunctionReturn(PETSC_SUCCESS);
249: }
251: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
252: {
253: PetscInt i;
255: PetscFunctionBegin;
256: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));
258: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
259: PetscFunctionReturn(PETSC_SUCCESS);
260: }
262: /* Should check that shapes of vectors and matrices match */
263: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
264: {
265: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
266: PetscScalar *z, x1, x2, zero = 0.0;
267: const PetscScalar *x, *xb;
268: const MatScalar *v;
269: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
270: const PetscInt *aj = a->j, *ai = a->i, *ib;
271: PetscInt nonzerorow = 0;
273: PetscFunctionBegin;
274: PetscCall(VecSet(zz, zero));
275: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
276: PetscCall(VecGetArrayRead(xx, &x));
277: PetscCall(VecGetArray(zz, &z));
279: v = a->a;
280: xb = x;
282: for (i = 0; i < mbs; i++) {
283: n = ai[1] - ai[0]; /* length of i_th block row of A */
284: x1 = xb[0];
285: x2 = xb[1];
286: ib = aj + *ai;
287: jmin = 0;
288: nonzerorow += (n > 0);
289: if (*ib == i) { /* (diag of A)*x */
290: z[2 * i] += v[0] * x1 + v[2] * x2;
291: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
292: v += 4;
293: jmin++;
294: }
295: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
296: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
297: for (j = jmin; j < n; j++) {
298: /* (strict lower triangular part of A)*x */
299: cval = ib[j] * 2;
300: z[cval] += v[0] * x1 + v[1] * x2;
301: z[cval + 1] += v[2] * x1 + v[3] * x2;
302: /* (strict upper triangular part of A)*x */
303: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
304: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
305: v += 4;
306: }
307: xb += 2;
308: ai++;
309: }
311: PetscCall(VecRestoreArrayRead(xx, &x));
312: PetscCall(VecRestoreArray(zz, &z));
313: PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
314: PetscFunctionReturn(PETSC_SUCCESS);
315: }
317: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
318: {
319: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
320: PetscScalar *z, x1, x2, x3, zero = 0.0;
321: const PetscScalar *x, *xb;
322: const MatScalar *v;
323: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
324: const PetscInt *aj = a->j, *ai = a->i, *ib;
325: PetscInt nonzerorow = 0;
327: PetscFunctionBegin;
328: PetscCall(VecSet(zz, zero));
329: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
330: PetscCall(VecGetArrayRead(xx, &x));
331: PetscCall(VecGetArray(zz, &z));
333: v = a->a;
334: xb = x;
336: for (i = 0; i < mbs; i++) {
337: n = ai[1] - ai[0]; /* length of i_th block row of A */
338: x1 = xb[0];
339: x2 = xb[1];
340: x3 = xb[2];
341: ib = aj + *ai;
342: jmin = 0;
343: nonzerorow += (n > 0);
344: if (*ib == i) { /* (diag of A)*x */
345: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
346: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
347: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
348: v += 9;
349: jmin++;
350: }
351: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
352: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
353: for (j = jmin; j < n; j++) {
354: /* (strict lower triangular part of A)*x */
355: cval = ib[j] * 3;
356: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
357: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
358: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
359: /* (strict upper triangular part of A)*x */
360: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
361: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
362: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
363: v += 9;
364: }
365: xb += 3;
366: ai++;
367: }
369: PetscCall(VecRestoreArrayRead(xx, &x));
370: PetscCall(VecRestoreArray(zz, &z));
371: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
372: PetscFunctionReturn(PETSC_SUCCESS);
373: }
375: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
376: {
377: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
378: PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
379: const PetscScalar *x, *xb;
380: const MatScalar *v;
381: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
382: const PetscInt *aj = a->j, *ai = a->i, *ib;
383: PetscInt nonzerorow = 0;
385: PetscFunctionBegin;
386: PetscCall(VecSet(zz, zero));
387: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
388: PetscCall(VecGetArrayRead(xx, &x));
389: PetscCall(VecGetArray(zz, &z));
391: v = a->a;
392: xb = x;
394: for (i = 0; i < mbs; i++) {
395: n = ai[1] - ai[0]; /* length of i_th block row of A */
396: x1 = xb[0];
397: x2 = xb[1];
398: x3 = xb[2];
399: x4 = xb[3];
400: ib = aj + *ai;
401: jmin = 0;
402: nonzerorow += (n > 0);
403: if (*ib == i) { /* (diag of A)*x */
404: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
405: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
406: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
407: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
408: v += 16;
409: jmin++;
410: }
411: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
412: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
413: for (j = jmin; j < n; j++) {
414: /* (strict lower triangular part of A)*x */
415: cval = ib[j] * 4;
416: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
417: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
418: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
419: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
420: /* (strict upper triangular part of A)*x */
421: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
422: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
423: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
424: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
425: v += 16;
426: }
427: xb += 4;
428: ai++;
429: }
431: PetscCall(VecRestoreArrayRead(xx, &x));
432: PetscCall(VecRestoreArray(zz, &z));
433: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
434: PetscFunctionReturn(PETSC_SUCCESS);
435: }
437: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
438: {
439: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
440: PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
441: const PetscScalar *x, *xb;
442: const MatScalar *v;
443: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
444: const PetscInt *aj = a->j, *ai = a->i, *ib;
445: PetscInt nonzerorow = 0;
447: PetscFunctionBegin;
448: PetscCall(VecSet(zz, zero));
449: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
450: PetscCall(VecGetArrayRead(xx, &x));
451: PetscCall(VecGetArray(zz, &z));
453: v = a->a;
454: xb = x;
456: for (i = 0; i < mbs; i++) {
457: n = ai[1] - ai[0]; /* length of i_th block row of A */
458: x1 = xb[0];
459: x2 = xb[1];
460: x3 = xb[2];
461: x4 = xb[3];
462: x5 = xb[4];
463: ib = aj + *ai;
464: jmin = 0;
465: nonzerorow += (n > 0);
466: if (*ib == i) { /* (diag of A)*x */
467: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
468: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
469: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
470: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
471: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
472: v += 25;
473: jmin++;
474: }
475: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
476: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
477: for (j = jmin; j < n; j++) {
478: /* (strict lower triangular part of A)*x */
479: cval = ib[j] * 5;
480: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
481: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
482: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
483: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
484: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
485: /* (strict upper triangular part of A)*x */
486: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
487: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
488: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
489: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
490: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
491: v += 25;
492: }
493: xb += 5;
494: ai++;
495: }
497: PetscCall(VecRestoreArrayRead(xx, &x));
498: PetscCall(VecRestoreArray(zz, &z));
499: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
500: PetscFunctionReturn(PETSC_SUCCESS);
501: }
503: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
504: {
505: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
506: PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
507: const PetscScalar *x, *xb;
508: const MatScalar *v;
509: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
510: const PetscInt *aj = a->j, *ai = a->i, *ib;
511: PetscInt nonzerorow = 0;
513: PetscFunctionBegin;
514: PetscCall(VecSet(zz, zero));
515: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
516: PetscCall(VecGetArrayRead(xx, &x));
517: PetscCall(VecGetArray(zz, &z));
519: v = a->a;
520: xb = x;
522: for (i = 0; i < mbs; i++) {
523: n = ai[1] - ai[0]; /* length of i_th block row of A */
524: x1 = xb[0];
525: x2 = xb[1];
526: x3 = xb[2];
527: x4 = xb[3];
528: x5 = xb[4];
529: x6 = xb[5];
530: ib = aj + *ai;
531: jmin = 0;
532: nonzerorow += (n > 0);
533: if (*ib == i) { /* (diag of A)*x */
534: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
535: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
536: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
537: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
538: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
539: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
540: v += 36;
541: jmin++;
542: }
543: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
544: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545: for (j = jmin; j < n; j++) {
546: /* (strict lower triangular part of A)*x */
547: cval = ib[j] * 6;
548: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
549: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
550: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
551: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
552: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
553: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
554: /* (strict upper triangular part of A)*x */
555: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
556: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
557: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
558: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
559: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
560: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
561: v += 36;
562: }
563: xb += 6;
564: ai++;
565: }
567: PetscCall(VecRestoreArrayRead(xx, &x));
568: PetscCall(VecRestoreArray(zz, &z));
569: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
570: PetscFunctionReturn(PETSC_SUCCESS);
571: }
573: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
574: {
575: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
576: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
577: const PetscScalar *x, *xb;
578: const MatScalar *v;
579: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
580: const PetscInt *aj = a->j, *ai = a->i, *ib;
581: PetscInt nonzerorow = 0;
583: PetscFunctionBegin;
584: PetscCall(VecSet(zz, zero));
585: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
586: PetscCall(VecGetArrayRead(xx, &x));
587: PetscCall(VecGetArray(zz, &z));
589: v = a->a;
590: xb = x;
592: for (i = 0; i < mbs; i++) {
593: n = ai[1] - ai[0]; /* length of i_th block row of A */
594: x1 = xb[0];
595: x2 = xb[1];
596: x3 = xb[2];
597: x4 = xb[3];
598: x5 = xb[4];
599: x6 = xb[5];
600: x7 = xb[6];
601: ib = aj + *ai;
602: jmin = 0;
603: nonzerorow += (n > 0);
604: if (*ib == i) { /* (diag of A)*x */
605: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
606: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
607: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
608: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
609: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
610: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
611: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
612: v += 49;
613: jmin++;
614: }
615: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
616: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
617: for (j = jmin; j < n; j++) {
618: /* (strict lower triangular part of A)*x */
619: cval = ib[j] * 7;
620: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
621: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
622: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
623: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
624: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
625: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
626: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
627: /* (strict upper triangular part of A)*x */
628: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
629: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
630: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
631: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
632: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
633: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
634: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
635: v += 49;
636: }
637: xb += 7;
638: ai++;
639: }
640: PetscCall(VecRestoreArrayRead(xx, &x));
641: PetscCall(VecRestoreArray(zz, &z));
642: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
643: PetscFunctionReturn(PETSC_SUCCESS);
644: }
646: /*
647: This will not work with MatScalar == float because it calls the BLAS
648: */
649: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
650: {
651: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
652: PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
653: const PetscScalar *x, *x_ptr, *xb;
654: const MatScalar *v;
655: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
656: const PetscInt *idx, *aj, *ii;
657: PetscInt nonzerorow = 0;
659: PetscFunctionBegin;
660: PetscCall(VecSet(zz, zero));
661: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
662: PetscCall(VecGetArrayRead(xx, &x));
663: PetscCall(VecGetArray(zz, &z));
665: x_ptr = x;
666: z_ptr = z;
668: aj = a->j;
669: v = a->a;
670: ii = a->i;
672: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
673: work = a->mult_work;
675: for (i = 0; i < mbs; i++) {
676: n = ii[1] - ii[0];
677: ncols = n * bs;
678: workt = work;
679: idx = aj + ii[0];
680: nonzerorow += (n > 0);
682: /* upper triangular part */
683: for (j = 0; j < n; j++) {
684: xb = x_ptr + bs * (*idx++);
685: for (k = 0; k < bs; k++) workt[k] = xb[k];
686: workt += bs;
687: }
688: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
689: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
691: /* strict lower triangular part */
692: idx = aj + ii[0];
693: if (n && *idx == i) {
694: ncols -= bs;
695: v += bs2;
696: idx++;
697: n--;
698: }
700: if (ncols > 0) {
701: workt = work;
702: PetscCall(PetscArrayzero(workt, ncols));
703: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
704: for (j = 0; j < n; j++) {
705: zb = z_ptr + bs * (*idx++);
706: for (k = 0; k < bs; k++) zb[k] += workt[k];
707: workt += bs;
708: }
709: }
710: x += bs;
711: v += n * bs2;
712: z += bs;
713: ii++;
714: }
716: PetscCall(VecRestoreArrayRead(xx, &x));
717: PetscCall(VecRestoreArray(zz, &z));
718: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
719: PetscFunctionReturn(PETSC_SUCCESS);
720: }
722: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
723: {
724: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
725: PetscScalar *z, x1;
726: const PetscScalar *x, *xb;
727: const MatScalar *v;
728: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
729: const PetscInt *aj = a->j, *ai = a->i, *ib;
730: PetscInt nonzerorow = 0;
731: #if defined(PETSC_USE_COMPLEX)
732: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
733: #else
734: const int aconj = 0;
735: #endif
737: PetscFunctionBegin;
738: PetscCall(VecCopy(yy, zz));
739: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
740: PetscCall(VecGetArrayRead(xx, &x));
741: PetscCall(VecGetArray(zz, &z));
742: v = a->a;
743: xb = x;
745: for (i = 0; i < mbs; i++) {
746: n = ai[1] - ai[0]; /* length of i_th row of A */
747: x1 = xb[0];
748: ib = aj + *ai;
749: jmin = 0;
750: nonzerorow += (n > 0);
751: if (n && *ib == i) { /* (diag of A)*x */
752: z[i] += *v++ * x[*ib++];
753: jmin++;
754: }
755: if (aconj) {
756: for (j = jmin; j < n; j++) {
757: cval = *ib;
758: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
759: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
760: }
761: } else {
762: for (j = jmin; j < n; j++) {
763: cval = *ib;
764: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
765: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
766: }
767: }
768: xb++;
769: ai++;
770: }
772: PetscCall(VecRestoreArrayRead(xx, &x));
773: PetscCall(VecRestoreArray(zz, &z));
775: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
776: PetscFunctionReturn(PETSC_SUCCESS);
777: }
779: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
780: {
781: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
782: PetscScalar *z, x1, x2;
783: const PetscScalar *x, *xb;
784: const MatScalar *v;
785: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
786: const PetscInt *aj = a->j, *ai = a->i, *ib;
787: PetscInt nonzerorow = 0;
789: PetscFunctionBegin;
790: PetscCall(VecCopy(yy, zz));
791: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
792: PetscCall(VecGetArrayRead(xx, &x));
793: PetscCall(VecGetArray(zz, &z));
795: v = a->a;
796: xb = x;
798: for (i = 0; i < mbs; i++) {
799: n = ai[1] - ai[0]; /* length of i_th block row of A */
800: x1 = xb[0];
801: x2 = xb[1];
802: ib = aj + *ai;
803: jmin = 0;
804: nonzerorow += (n > 0);
805: if (n && *ib == i) { /* (diag of A)*x */
806: z[2 * i] += v[0] * x1 + v[2] * x2;
807: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
808: v += 4;
809: jmin++;
810: }
811: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
812: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
813: for (j = jmin; j < n; j++) {
814: /* (strict lower triangular part of A)*x */
815: cval = ib[j] * 2;
816: z[cval] += v[0] * x1 + v[1] * x2;
817: z[cval + 1] += v[2] * x1 + v[3] * x2;
818: /* (strict upper triangular part of A)*x */
819: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
820: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
821: v += 4;
822: }
823: xb += 2;
824: ai++;
825: }
826: PetscCall(VecRestoreArrayRead(xx, &x));
827: PetscCall(VecRestoreArray(zz, &z));
829: PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
830: PetscFunctionReturn(PETSC_SUCCESS);
831: }
833: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
834: {
835: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
836: PetscScalar *z, x1, x2, x3;
837: const PetscScalar *x, *xb;
838: const MatScalar *v;
839: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
840: const PetscInt *aj = a->j, *ai = a->i, *ib;
841: PetscInt nonzerorow = 0;
843: PetscFunctionBegin;
844: PetscCall(VecCopy(yy, zz));
845: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
846: PetscCall(VecGetArrayRead(xx, &x));
847: PetscCall(VecGetArray(zz, &z));
849: v = a->a;
850: xb = x;
852: for (i = 0; i < mbs; i++) {
853: n = ai[1] - ai[0]; /* length of i_th block row of A */
854: x1 = xb[0];
855: x2 = xb[1];
856: x3 = xb[2];
857: ib = aj + *ai;
858: jmin = 0;
859: nonzerorow += (n > 0);
860: if (n && *ib == i) { /* (diag of A)*x */
861: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
862: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
863: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
864: v += 9;
865: jmin++;
866: }
867: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
868: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
869: for (j = jmin; j < n; j++) {
870: /* (strict lower triangular part of A)*x */
871: cval = ib[j] * 3;
872: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
873: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
874: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
875: /* (strict upper triangular part of A)*x */
876: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
877: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
878: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
879: v += 9;
880: }
881: xb += 3;
882: ai++;
883: }
885: PetscCall(VecRestoreArrayRead(xx, &x));
886: PetscCall(VecRestoreArray(zz, &z));
888: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
889: PetscFunctionReturn(PETSC_SUCCESS);
890: }
892: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
893: {
894: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
895: PetscScalar *z, x1, x2, x3, x4;
896: const PetscScalar *x, *xb;
897: const MatScalar *v;
898: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
899: const PetscInt *aj = a->j, *ai = a->i, *ib;
900: PetscInt nonzerorow = 0;
902: PetscFunctionBegin;
903: PetscCall(VecCopy(yy, zz));
904: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
905: PetscCall(VecGetArrayRead(xx, &x));
906: PetscCall(VecGetArray(zz, &z));
908: v = a->a;
909: xb = x;
911: for (i = 0; i < mbs; i++) {
912: n = ai[1] - ai[0]; /* length of i_th block row of A */
913: x1 = xb[0];
914: x2 = xb[1];
915: x3 = xb[2];
916: x4 = xb[3];
917: ib = aj + *ai;
918: jmin = 0;
919: nonzerorow += (n > 0);
920: if (n && *ib == i) { /* (diag of A)*x */
921: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
922: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
923: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
924: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
925: v += 16;
926: jmin++;
927: }
928: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
929: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
930: for (j = jmin; j < n; j++) {
931: /* (strict lower triangular part of A)*x */
932: cval = ib[j] * 4;
933: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
934: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
935: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
936: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
937: /* (strict upper triangular part of A)*x */
938: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
939: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
940: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
941: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
942: v += 16;
943: }
944: xb += 4;
945: ai++;
946: }
948: PetscCall(VecRestoreArrayRead(xx, &x));
949: PetscCall(VecRestoreArray(zz, &z));
951: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
952: PetscFunctionReturn(PETSC_SUCCESS);
953: }
/*
  MatMultAdd_SeqSBAIJ_5 - Computes zz = yy + A*xx for a SeqSBAIJ matrix, hand-unrolled for block size 5.

  zz is first overwritten with yy (VecCopy) and the product is accumulated into it.
  If the matrix has no stored blocks (a->nz == 0) the routine returns right after the copy.

  For every stored off-diagonal block two updates are made: one into the entries of z
  addressed by the block's column index (the transposed block applied to x1..x5) and one
  into the entries of z of the current block row (the block as stored applied to
  x[cval..cval+4]). The index pattern v[r + 5*c] treats each 5x5 block as column-major.
*/
955: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
956: {
957: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
958: PetscScalar *z, x1, x2, x3, x4, x5;
959: const PetscScalar *x, *xb;
960: const MatScalar *v;
961: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
962: const PetscInt *aj = a->j, *ai = a->i, *ib;
963: PetscInt nonzerorow = 0;
965: PetscFunctionBegin;
966: PetscCall(VecCopy(yy, zz));
967: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
968: PetscCall(VecGetArrayRead(xx, &x));
969: PetscCall(VecGetArray(zz, &z));
/* v walks the block values sequentially; xb points at the 5 entries of x for block row i */
971: v = a->a;
972: xb = x;
974: for (i = 0; i < mbs; i++) {
975: n = ai[1] - ai[0]; /* length of i_th block row of A */
976: x1 = xb[0];
977: x2 = xb[1];
978: x3 = xb[2];
979: x4 = xb[3];
980: x5 = xb[4];
981: ib = aj + *ai;
982: jmin = 0;
/* count block rows that hold at least one block; used only for the flop estimate */
983: nonzerorow += (n > 0);
/* The diagonal block is applied using only entries v[r + 5*c] with r <= c; the mirrored
   entries supply the lower half. */
984: if (n && *ib == i) { /* (diag of A)*x */
985: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
986: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
987: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
988: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
989: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
990: v += 25;
991: jmin++;
992: }
993: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
994: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
/* remaining (off-diagonal) blocks of this block row; cval is the scalar offset of the block column */
995: for (j = jmin; j < n; j++) {
996: /* (strict lower triangular part of A)*x */
997: cval = ib[j] * 5;
998: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
999: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1000: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1001: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1002: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1003: /* (strict upper triangular part of A)*x */
1004: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1005: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1006: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1007: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1008: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1009: v += 25;
1010: }
1011: xb += 5;
1012: ai++;
1013: }
1015: PetscCall(VecRestoreArrayRead(xx, &x));
1016: PetscCall(VecRestoreArray(zz, &z));
1018: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1019: PetscFunctionReturn(PETSC_SUCCESS);
1020: }
/*
  MatMultAdd_SeqSBAIJ_6 - Computes zz = yy + A*xx for a SeqSBAIJ matrix, hand-unrolled for block size 6.

  zz is first overwritten with yy (VecCopy) and the product is accumulated into it.
  If the matrix has no stored blocks (a->nz == 0) the routine returns right after the copy.

  For every stored off-diagonal block two updates are made: one into the entries of z
  addressed by the block's column index (the transposed block applied to x1..x6) and one
  into the entries of z of the current block row (the block as stored applied to
  x[cval..cval+5]). The index pattern v[r + 6*c] treats each 6x6 block as column-major.
*/
1022: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1023: {
1024: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1025: PetscScalar *z, x1, x2, x3, x4, x5, x6;
1026: const PetscScalar *x, *xb;
1027: const MatScalar *v;
1028: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1029: const PetscInt *aj = a->j, *ai = a->i, *ib;
1030: PetscInt nonzerorow = 0;
1032: PetscFunctionBegin;
1033: PetscCall(VecCopy(yy, zz));
1034: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1035: PetscCall(VecGetArrayRead(xx, &x));
1036: PetscCall(VecGetArray(zz, &z));
/* v walks the block values sequentially; xb points at the 6 entries of x for block row i */
1038: v = a->a;
1039: xb = x;
1041: for (i = 0; i < mbs; i++) {
1042: n = ai[1] - ai[0]; /* length of i_th block row of A */
1043: x1 = xb[0];
1044: x2 = xb[1];
1045: x3 = xb[2];
1046: x4 = xb[3];
1047: x5 = xb[4];
1048: x6 = xb[5];
1049: ib = aj + *ai;
1050: jmin = 0;
/* count block rows that hold at least one block; used only for the flop estimate */
1051: nonzerorow += (n > 0);
/* The diagonal block is applied using only entries v[r + 6*c] with r <= c; the mirrored
   entries supply the lower half. */
1052: if (n && *ib == i) { /* (diag of A)*x */
1053: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1054: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1055: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1056: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1057: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1058: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1059: v += 36;
1060: jmin++;
1061: }
1062: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1063: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
/* remaining (off-diagonal) blocks of this block row; cval is the scalar offset of the block column */
1064: for (j = jmin; j < n; j++) {
1065: /* (strict lower triangular part of A)*x */
1066: cval = ib[j] * 6;
1067: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1068: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1069: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1070: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1071: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1072: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1073: /* (strict upper triangular part of A)*x */
1074: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1075: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1076: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1077: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1078: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1079: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1080: v += 36;
1081: }
1082: xb += 6;
1083: ai++;
1084: }
1086: PetscCall(VecRestoreArrayRead(xx, &x));
1087: PetscCall(VecRestoreArray(zz, &z));
1089: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1090: PetscFunctionReturn(PETSC_SUCCESS);
1091: }
/*
  MatMultAdd_SeqSBAIJ_7 - Computes zz = yy + A*xx for a SeqSBAIJ matrix, hand-unrolled for block size 7.

  zz is first overwritten with yy (VecCopy) and the product is accumulated into it.
  If the matrix has no stored blocks (a->nz == 0) the routine returns right after the copy.

  For every stored off-diagonal block two updates are made: one into the entries of z
  addressed by the block's column index (the transposed block applied to x1..x7) and one
  into the entries of z of the current block row (the block as stored applied to
  x[cval..cval+6]). The index pattern v[r + 7*c] treats each 7x7 block as column-major.
*/
1093: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1094: {
1095: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1096: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1097: const PetscScalar *x, *xb;
1098: const MatScalar *v;
1099: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1100: const PetscInt *aj = a->j, *ai = a->i, *ib;
1101: PetscInt nonzerorow = 0;
1103: PetscFunctionBegin;
1104: PetscCall(VecCopy(yy, zz));
1105: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1106: PetscCall(VecGetArrayRead(xx, &x));
1107: PetscCall(VecGetArray(zz, &z));
/* v walks the block values sequentially; xb points at the 7 entries of x for block row i */
1109: v = a->a;
1110: xb = x;
1112: for (i = 0; i < mbs; i++) {
1113: n = ai[1] - ai[0]; /* length of i_th block row of A */
1114: x1 = xb[0];
1115: x2 = xb[1];
1116: x3 = xb[2];
1117: x4 = xb[3];
1118: x5 = xb[4];
1119: x6 = xb[5];
1120: x7 = xb[6];
1121: ib = aj + *ai;
1122: jmin = 0;
/* count block rows that hold at least one block; used only for the flop estimate */
1123: nonzerorow += (n > 0);
/* The diagonal block is applied using only entries v[r + 7*c] with r <= c; the mirrored
   entries supply the lower half. */
1124: if (n && *ib == i) { /* (diag of A)*x */
1125: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1126: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1127: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1128: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1129: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1130: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1131: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1132: v += 49;
1133: jmin++;
1134: }
1135: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1136: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
/* remaining (off-diagonal) blocks of this block row; cval is the scalar offset of the block column */
1137: for (j = jmin; j < n; j++) {
1138: /* (strict lower triangular part of A)*x */
1139: cval = ib[j] * 7;
1140: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1141: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1142: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1143: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1144: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1145: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1146: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1147: /* (strict upper triangular part of A)*x */
1148: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1149: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1150: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1151: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1152: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1153: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1154: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1155: v += 49;
1156: }
1157: xb += 7;
1158: ai++;
1159: }
1161: PetscCall(VecRestoreArrayRead(xx, &x));
1162: PetscCall(VecRestoreArray(zz, &z));
1164: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1165: PetscFunctionReturn(PETSC_SUCCESS);
1166: }
1168: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1169: {
1170: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1171: PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1172: const PetscScalar *x, *x_ptr, *xb;
1173: const MatScalar *v;
1174: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1175: const PetscInt *idx, *aj, *ii;
1176: PetscInt nonzerorow = 0;
1178: PetscFunctionBegin;
1179: PetscCall(VecCopy(yy, zz));
1180: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1181: PetscCall(VecGetArrayRead(xx, &x));
1182: x_ptr = x;
1183: PetscCall(VecGetArray(zz, &z));
1184: z_ptr = z;
1186: aj = a->j;
1187: v = a->a;
1188: ii = a->i;
1190: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1191: work = a->mult_work;
1193: for (i = 0; i < mbs; i++) {
1194: n = ii[1] - ii[0];
1195: ncols = n * bs;
1196: workt = work;
1197: idx = aj + ii[0];
1198: nonzerorow += (n > 0);
1200: /* upper triangular part */
1201: for (j = 0; j < n; j++) {
1202: xb = x_ptr + bs * (*idx++);
1203: for (k = 0; k < bs; k++) workt[k] = xb[k];
1204: workt += bs;
1205: }
1206: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1207: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1209: /* strict lower triangular part */
1210: idx = aj + ii[0];
1211: if (n && *idx == i) {
1212: ncols -= bs;
1213: v += bs2;
1214: idx++;
1215: n--;
1216: }
1217: if (ncols > 0) {
1218: workt = work;
1219: PetscCall(PetscArrayzero(workt, ncols));
1220: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1221: for (j = 0; j < n; j++) {
1222: zb = z_ptr + bs * (*idx++);
1223: for (k = 0; k < bs; k++) zb[k] += workt[k];
1224: workt += bs;
1225: }
1226: }
1228: x += bs;
1229: v += n * bs2;
1230: z += bs;
1231: ii++;
1232: }
1234: PetscCall(VecRestoreArrayRead(xx, &x));
1235: PetscCall(VecRestoreArray(zz, &z));
1237: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1238: PetscFunctionReturn(PETSC_SUCCESS);
1239: }
1241: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1242: {
1243: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1244: PetscScalar oalpha = alpha;
1245: PetscBLASInt one = 1, totalnz;
1247: PetscFunctionBegin;
1248: PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1249: PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1250: PetscCall(PetscLogFlops(totalnz));
1251: PetscFunctionReturn(PETSC_SUCCESS);
1252: }
/*
  MatNorm_SeqSBAIJ - Computes a norm of a SeqSBAIJ matrix from its stored upper triangular blocks.

  Input:
    A    - the matrix
    type - NORM_FROBENIUS, NORM_1 or NORM_INFINITY (1- and infinity-norm share one code path)
  Output:
    norm - the computed norm; 0.0 when the matrix has no stored blocks (a->nz == 0)

  Any other norm type raises PETSC_ERR_SUP.
*/
1254: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1255: {
1256: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1257: const MatScalar *v = a->a;
1258: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1259: PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1260: const PetscInt *aj = a->j, *col;
1262: PetscFunctionBegin;
1263: if (!a->nz) {
1264: *norm = 0.0;
1265: PetscFunctionReturn(PETSC_SUCCESS);
1266: }
1267: if (type == NORM_FROBENIUS) {
/* v walks all stored values once; squares of diagonal-block entries and of
   off-diagonal-block entries are accumulated separately */
1268: for (k = 0; k < mbs; k++) {
1269: jmin = a->i[k];
1270: jmax = a->i[k + 1];
1271: col = aj + jmin;
1272: if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1273: for (i = 0; i < bs2; i++) {
1274: sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1275: v++;
1276: }
1277: jmin++;
1278: }
1279: for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1280: for (i = 0; i < bs2; i++) {
1281: sum_off += PetscRealPart(PetscConj(*v) * (*v));
1282: v++;
1283: }
1284: }
1285: }
/* off-diagonal blocks are stored once, so their contribution is counted twice */
1286: *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1287: PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1288: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
/* sum: bs partial sums for the current block row/column;
   il[i]: block index (into a->a / a->j) of the next block of row i still to be visited;
   jl:    linked list of block rows, terminated by the value mbs */
1289: PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1290: for (i = 0; i < mbs; i++) jl[i] = mbs;
1291: il[0] = 0;
1293: *norm = 0.0;
1294: for (k = 0; k < mbs; k++) { /* k_th block row */
1295: for (j = 0; j < bs; j++) sum[j] = 0.0;
1296: /*-- col sum --*/
1297: i = jl[k]; /* first |A(i,k)| to be added */
1298: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1299: at step k */
1300: while (i < mbs) {
1301: nexti = jl[i]; /* next block row to be added */
1302: ik = il[i]; /* block index of A(i,k) in the array a */
/* add the absolute values of block ik, bs consecutive values per sum[j] */
1303: for (j = 0; j < bs; j++) {
1304: v = a->a + ik * bs2 + j * bs;
1305: for (k1 = 0; k1 < bs; k1++) {
1306: sum[j] += PetscAbsScalar(*v);
1307: v++;
1308: }
1309: }
1310: /* update il, jl */
1311: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1312: jmax = a->i[i + 1];
/* if row i has another block, link row i into the list headed at that block's column */
1313: if (jmin < jmax) {
1314: il[i] = jmin;
1315: j = a->j[jmin];
1316: jl[i] = jl[j];
1317: jl[j] = i;
1318: }
1319: i = nexti;
1320: }
1321: /*-- row sum --*/
1322: jmin = a->i[k];
1323: jmax = a->i[k + 1];
/* add the absolute values of every stored block of row k, values taken with stride bs per sum[j] */
1324: for (i = jmin; i < jmax; i++) {
1325: for (j = 0; j < bs; j++) {
1326: v = a->a + i * bs2 + j;
1327: for (k1 = 0; k1 < bs; k1++) {
1328: sum[j] += PetscAbsScalar(*v);
1329: v += bs;
1330: }
1331: }
1332: }
1333: /* add k_th block row to il, jl */
1334: col = aj + jmin;
/* skip the diagonal block so only off-diagonal blocks are linked */
1335: if (jmax - jmin > 0 && *col == k) jmin++;
1336: if (jmin < jmax) {
1337: il[k] = jmin;
1338: j = a->j[jmin];
1339: jl[k] = jl[j];
1340: jl[j] = k;
1341: }
/* the norm is the largest of the accumulated sums seen so far */
1342: for (j = 0; j < bs; j++) {
1343: if (sum[j] > *norm) *norm = sum[j];
1344: }
1345: }
1346: PetscCall(PetscFree3(sum, il, jl));
1347: PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1348: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1349: PetscFunctionReturn(PETSC_SUCCESS);
1350: }
1352: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1353: {
1354: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1356: PetscFunctionBegin;
1357: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1358: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1359: *flg = PETSC_FALSE;
1360: PetscFunctionReturn(PETSC_SUCCESS);
1361: }
1363: /* if the a->i are the same */
1364: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1365: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1367: /* if a->j are the same */
1368: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1369: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1371: /* if a->a are the same */
1372: PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg));
1373: PetscFunctionReturn(PETSC_SUCCESS);
1374: }
1376: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1377: {
1378: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1379: PetscInt i, j, k, row, bs, ambs, bs2;
1380: const PetscInt *ai, *aj;
1381: PetscScalar *x, zero = 0.0;
1382: const MatScalar *aa, *aa_j;
1384: PetscFunctionBegin;
1385: bs = A->rmap->bs;
1386: PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1388: aa = a->a;
1389: ambs = a->mbs;
1391: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1392: PetscInt *diag = a->diag;
1393: aa = a->a;
1394: ambs = a->mbs;
1395: PetscCall(VecGetArray(v, &x));
1396: for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1397: PetscCall(VecRestoreArray(v, &x));
1398: PetscFunctionReturn(PETSC_SUCCESS);
1399: }
1401: ai = a->i;
1402: aj = a->j;
1403: bs2 = a->bs2;
1404: PetscCall(VecSet(v, zero));
1405: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1406: PetscCall(VecGetArray(v, &x));
1407: for (i = 0; i < ambs; i++) {
1408: j = ai[i];
1409: if (aj[j] == i) { /* if this is a diagonal element */
1410: row = i * bs;
1411: aa_j = aa + j * bs2;
1412: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1413: }
1414: }
1415: PetscCall(VecRestoreArray(v, &x));
1416: PetscFunctionReturn(PETSC_SUCCESS);
1417: }
1419: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1420: {
1421: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1422: PetscScalar x;
1423: const PetscScalar *l, *li, *ri;
1424: MatScalar *aa, *v;
1425: PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1426: const PetscInt *ai, *aj;
1427: PetscBool flg;
1429: PetscFunctionBegin;
1430: if (ll != rr) {
1431: PetscCall(VecEqual(ll, rr, &flg));
1432: PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "For symmetric format, left and right scaling vectors must be same");
1433: }
1434: if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1435: ai = a->i;
1436: aj = a->j;
1437: aa = a->a;
1438: m = A->rmap->N;
1439: bs = A->rmap->bs;
1440: mbs = a->mbs;
1441: bs2 = a->bs2;
1443: PetscCall(VecGetArrayRead(ll, &l));
1444: PetscCall(VecGetLocalSize(ll, &lm));
1445: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1446: for (i = 0; i < mbs; i++) { /* for each block row */
1447: M = ai[i + 1] - ai[i];
1448: li = l + i * bs;
1449: v = aa + bs2 * ai[i];
1450: for (j = 0; j < M; j++) { /* for each block */
1451: ri = l + bs * aj[ai[i] + j];
1452: for (k = 0; k < bs; k++) {
1453: x = ri[k];
1454: for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1455: }
1456: }
1457: }
1458: PetscCall(VecRestoreArrayRead(ll, &l));
1459: PetscCall(PetscLogFlops(2.0 * a->nz));
1460: PetscFunctionReturn(PETSC_SUCCESS);
1461: }
1463: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1464: {
1465: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1467: PetscFunctionBegin;
1468: info->block_size = a->bs2;
1469: info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1470: info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1471: info->nz_unneeded = info->nz_allocated - info->nz_used;
1472: info->assemblies = A->num_ass;
1473: info->mallocs = A->info.mallocs;
1474: info->memory = 0; /* REVIEW ME */
1475: if (A->factortype) {
1476: info->fill_ratio_given = A->info.fill_ratio_given;
1477: info->fill_ratio_needed = A->info.fill_ratio_needed;
1478: info->factor_mallocs = A->info.factor_mallocs;
1479: } else {
1480: info->fill_ratio_given = 0;
1481: info->fill_ratio_needed = 0;
1482: info->factor_mallocs = 0;
1483: }
1484: PetscFunctionReturn(PETSC_SUCCESS);
1485: }
1487: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1488: {
1489: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1491: PetscFunctionBegin;
1492: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1493: PetscFunctionReturn(PETSC_SUCCESS);
1494: }
1496: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1497: {
1498: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1499: PetscInt i, j, n, row, col, bs, mbs;
1500: const PetscInt *ai, *aj;
1501: PetscReal atmp;
1502: const MatScalar *aa;
1503: PetscScalar *x;
1504: PetscInt ncols, brow, bcol, krow, kcol;
1506: PetscFunctionBegin;
1507: PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1508: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1509: bs = A->rmap->bs;
1510: aa = a->a;
1511: ai = a->i;
1512: aj = a->j;
1513: mbs = a->mbs;
1515: PetscCall(VecSet(v, 0.0));
1516: PetscCall(VecGetArray(v, &x));
1517: PetscCall(VecGetLocalSize(v, &n));
1518: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1519: for (i = 0; i < mbs; i++) {
1520: ncols = ai[1] - ai[0];
1521: ai++;
1522: brow = bs * i;
1523: for (j = 0; j < ncols; j++) {
1524: bcol = bs * (*aj);
1525: for (kcol = 0; kcol < bs; kcol++) {
1526: col = bcol + kcol; /* col index */
1527: for (krow = 0; krow < bs; krow++) {
1528: atmp = PetscAbsScalar(*aa);
1529: aa++;
1530: row = brow + krow; /* row index */
1531: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1532: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1533: }
1534: }
1535: aj++;
1536: }
1537: }
1538: PetscCall(VecRestoreArray(v, &x));
1539: PetscFunctionReturn(PETSC_SUCCESS);
1540: }
1542: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1543: {
1544: PetscFunctionBegin;
1545: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1546: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1547: PetscFunctionReturn(PETSC_SUCCESS);
1548: }
1550: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1551: {
1552: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1553: PetscScalar *z = c;
1554: const PetscScalar *xb;
1555: PetscScalar x1;
1556: const MatScalar *v = a->a, *vv;
1557: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1558: #if defined(PETSC_USE_COMPLEX)
1559: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1560: #else
1561: const int aconj = 0;
1562: #endif
1564: PetscFunctionBegin;
1565: for (i = 0; i < mbs; i++) {
1566: n = ii[1] - ii[0];
1567: ii++;
1568: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1569: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1570: jj = idx;
1571: vv = v;
1572: for (k = 0; k < cn; k++) {
1573: idx = jj;
1574: v = vv;
1575: for (j = 0; j < n; j++) {
1576: xb = b + (*idx);
1577: x1 = xb[0 + k * bm];
1578: z[0 + k * cm] += v[0] * x1;
1579: if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1580: v += 1;
1581: ++idx;
1582: }
1583: }
1584: z += 1;
1585: }
1586: PetscFunctionReturn(PETSC_SUCCESS);
1587: }
1589: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1590: {
1591: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1592: PetscScalar *z = c;
1593: const PetscScalar *xb;
1594: PetscScalar x1, x2;
1595: const MatScalar *v = a->a, *vv;
1596: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1598: PetscFunctionBegin;
1599: for (i = 0; i < mbs; i++) {
1600: n = ii[1] - ii[0];
1601: ii++;
1602: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1603: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1604: jj = idx;
1605: vv = v;
1606: for (k = 0; k < cn; k++) {
1607: idx = jj;
1608: v = vv;
1609: for (j = 0; j < n; j++) {
1610: xb = b + 2 * (*idx);
1611: x1 = xb[0 + k * bm];
1612: x2 = xb[1 + k * bm];
1613: z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1614: z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1615: if (*idx != i) {
1616: c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1617: c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1618: }
1619: v += 4;
1620: ++idx;
1621: }
1622: }
1623: z += 2;
1624: }
1625: PetscFunctionReturn(PETSC_SUCCESS);
1626: }
1628: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1629: {
1630: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1631: PetscScalar *z = c;
1632: const PetscScalar *xb;
1633: PetscScalar x1, x2, x3;
1634: const MatScalar *v = a->a, *vv;
1635: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1637: PetscFunctionBegin;
1638: for (i = 0; i < mbs; i++) {
1639: n = ii[1] - ii[0];
1640: ii++;
1641: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1642: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1643: jj = idx;
1644: vv = v;
1645: for (k = 0; k < cn; k++) {
1646: idx = jj;
1647: v = vv;
1648: for (j = 0; j < n; j++) {
1649: xb = b + 3 * (*idx);
1650: x1 = xb[0 + k * bm];
1651: x2 = xb[1 + k * bm];
1652: x3 = xb[2 + k * bm];
1653: z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1654: z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1655: z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1656: if (*idx != i) {
1657: c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1658: c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1659: c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1660: }
1661: v += 9;
1662: ++idx;
1663: }
1664: }
1665: z += 3;
1666: }
1667: PetscFunctionReturn(PETSC_SUCCESS);
1668: }
1670: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1671: {
1672: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1673: PetscScalar *z = c;
1674: const PetscScalar *xb;
1675: PetscScalar x1, x2, x3, x4;
1676: const MatScalar *v = a->a, *vv;
1677: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1679: PetscFunctionBegin;
1680: for (i = 0; i < mbs; i++) {
1681: n = ii[1] - ii[0];
1682: ii++;
1683: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1684: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1685: jj = idx;
1686: vv = v;
1687: for (k = 0; k < cn; k++) {
1688: idx = jj;
1689: v = vv;
1690: for (j = 0; j < n; j++) {
1691: xb = b + 4 * (*idx);
1692: x1 = xb[0 + k * bm];
1693: x2 = xb[1 + k * bm];
1694: x3 = xb[2 + k * bm];
1695: x4 = xb[3 + k * bm];
1696: z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1697: z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1698: z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1699: z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1700: if (*idx != i) {
1701: c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1702: c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1703: c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1704: c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1705: }
1706: v += 16;
1707: ++idx;
1708: }
1709: }
1710: z += 4;
1711: }
1712: PetscFunctionReturn(PETSC_SUCCESS);
1713: }
1715: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1716: {
1717: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1718: PetscScalar *z = c;
1719: const PetscScalar *xb;
1720: PetscScalar x1, x2, x3, x4, x5;
1721: const MatScalar *v = a->a, *vv;
1722: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1724: PetscFunctionBegin;
1725: for (i = 0; i < mbs; i++) {
1726: n = ii[1] - ii[0];
1727: ii++;
1728: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1729: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1730: jj = idx;
1731: vv = v;
1732: for (k = 0; k < cn; k++) {
1733: idx = jj;
1734: v = vv;
1735: for (j = 0; j < n; j++) {
1736: xb = b + 5 * (*idx);
1737: x1 = xb[0 + k * bm];
1738: x2 = xb[1 + k * bm];
1739: x3 = xb[2 + k * bm];
1740: x4 = xb[3 + k * bm];
1741: x5 = xb[4 + k * cm];
1742: z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1743: z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1744: z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1745: z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1746: z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1747: if (*idx != i) {
1748: c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1749: c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1750: c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1751: c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1752: c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1753: }
1754: v += 25;
1755: ++idx;
1756: }
1757: }
1758: z += 5;
1759: }
1760: PetscFunctionReturn(PETSC_SUCCESS);
1761: }
1763: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1764: {
1765: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1766: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1767: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1768: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1769: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1770: PetscBLASInt bbs, bcn, bbm, bcm;
1771: PetscScalar *z = NULL;
1772: PetscScalar *c, *b;
1773: const MatScalar *v;
1774: const PetscInt *idx, *ii;
1775: PetscScalar _DOne = 1.0;
1777: PetscFunctionBegin;
1778: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1779: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1780: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1781: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1782: b = bd->v;
1783: PetscCall(MatZeroEntries(C));
1784: PetscCall(MatDenseGetArray(C, &c));
1785: switch (bs) {
1786: case 1:
1787: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1788: break;
1789: case 2:
1790: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1791: break;
1792: case 3:
1793: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1794: break;
1795: case 4:
1796: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1797: break;
1798: case 5:
1799: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1800: break;
1801: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1802: PetscCall(PetscBLASIntCast(bs, &bbs));
1803: PetscCall(PetscBLASIntCast(cn, &bcn));
1804: PetscCall(PetscBLASIntCast(bm, &bbm));
1805: PetscCall(PetscBLASIntCast(cm, &bcm));
1806: idx = a->j;
1807: v = a->a;
1808: mbs = a->mbs;
1809: ii = a->i;
1810: z = c;
1811: for (i = 0; i < mbs; i++) {
1812: n = ii[1] - ii[0];
1813: ii++;
1814: for (j = 0; j < n; j++) {
1815: if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1816: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1817: v += bs2;
1818: }
1819: z += bs;
1820: }
1821: }
1822: PetscCall(MatDenseRestoreArray(C, &c));
1823: PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1824: PetscFunctionReturn(PETSC_SUCCESS);
1825: }