Actual source code: sell.c
2: /*
3: Defines the basic matrix operations for the SELL matrix storage format.
4: */
5: #include <../src/mat/impls/sell/seq/sell.h>
6: #include <petscblaslapack.h>
7: #include <petsc/private/kernels/blocktranspose.h>
9: static PetscBool cited = PETSC_FALSE;
10: static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11: " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12: " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13: " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14: " year = 2018\n"
15: "}\n";
17: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19: #include <immintrin.h>
21: #if !defined(_MM_SCALE_8)
22: #define _MM_SCALE_8 8
23: #endif
25: #if defined(__AVX512F__)
26: /* these do not work
27: vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28: vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29: */
30: #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31: /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32: vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
33: vec_vals = _mm512_loadu_pd(aval); \
34: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36: #elif defined(__AVX2__) && defined(__FMA__)
37: #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38: vec_vals = _mm256_loadu_pd(aval); \
39: vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40: vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41: vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42: #endif
43: #endif /* PETSC_HAVE_IMMINTRIN_H */
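/*
  Illustrative scalar sketch (not compiled): one invocation of the AVX512_Mult_Private or
  AVX2_Mult_Private macro above gathers W entries of x at the column indices in acolidx,
  multiplies them by the matching W values in aval, and accumulates into the running sum
  vec_y, where W = 8 for AVX-512 and W = 4 for AVX2. In scalar form:

    for (int lane = 0; lane < W; lane++) y_accum[lane] += aval[lane] * x[acolidx[lane]];

  The caller then advances acolidx and aval by W before the next invocation.
*/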
45: /*@C
46: MatSeqSELLSetPreallocation - For good matrix assembly performance
47: the user should preallocate the matrix storage by setting the parameter `rlenmax`
48: (or the array `rlen`). By setting these parameters accurately, performance
49: during matrix assembly can be increased significantly.
51: Collective
53: Input Parameters:
54: + B - The `MATSEQSELL` matrix
55: . rlenmax - number of nonzeros per row (same for all rows)
56: - rlen - array containing the number of nonzeros in the various rows
57: (possibly different for each row) or `NULL`
59: Level: intermediate
61: Notes:
62: If `rlen` is given then `rlenmax` is ignored.
64: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
65: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
66: allocation. For large problems you MUST preallocate memory or you
67: will get TERRIBLE performance, see the users' manual chapter on matrices.
69: You can call `MatGetInfo()` to get information on how effective the preallocation was,
70: for example the fields mallocs, nz_allocated, nz_used, and nz_unneeded.
71: You can also run with the option `-info` and look for messages with the string
72: malloc in them to see if additional memory allocation was needed.
74: Developer Note:
75: Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
76: entries or column indices.
78: The maximum number of nonzeros in any row should be as accurate as possible.
79: If it is underestimated, you will get bad performance due to reallocation
80: (`MatSeqXSELLReallocateSELL()`).
82: .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
83: @*/
84: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
85: {
86: PetscFunctionBegin;
89: PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
90: PetscFunctionReturn(PETSC_SUCCESS);
91: }
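/*
  Illustrative usage sketch (assumes a square matrix of local size n): preallocate a MATSEQSELL
  matrix with an estimate of at most 5 nonzeros per row before inserting values. Pass an exact
  per-row array as the third argument instead of NULL when the row lengths are known.

    Mat A;
    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, n, n, n, n));
    PetscCall(MatSetType(A, MATSEQSELL));
    PetscCall(MatSeqSELLSetPreallocation(A, 5, NULL));
    ... then MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd() ...
*/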
93: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
94: {
95: Mat_SeqSELL *b;
96: PetscInt i, j, totalslices;
97: PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
99: PetscFunctionBegin;
100: if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
101: if (maxallocrow == MAT_SKIP_ALLOCATION) {
102: skipallocation = PETSC_TRUE;
103: maxallocrow = 0;
104: }
106: PetscCall(PetscLayoutSetUp(B->rmap));
107: PetscCall(PetscLayoutSetUp(B->cmap));
109: /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
110: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
111: PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
112: if (rlen) {
113: for (i = 0; i < B->rmap->n; i++) {
114: PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
115: PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
116: }
117: }
119: B->preallocated = PETSC_TRUE;
121: b = (Mat_SeqSELL *)B->data;
123: totalslices = PetscCeilInt(B->rmap->n, 8);
124: b->totalslices = totalslices;
125: if (!skipallocation) {
126: if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not a multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
128: if (!b->sliidx) { /* sliidx gives the starting index of each slice; the last element is the total space allocated */
129: PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
130: }
131: if (!rlen) { /* if rlen is not provided, allocate the same space for all slices */
132: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
133: else if (maxallocrow < 0) maxallocrow = 1;
134: for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
135: } else {
136: maxallocrow = 0;
137: b->sliidx[0] = 0;
138: for (i = 1; i < totalslices; i++) {
139: b->sliidx[i] = 0;
140: for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
141: maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
142: PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
143: }
144: /* last slice */
145: b->sliidx[totalslices] = 0;
146: for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
147: maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow);
148: b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
149: }
151: /* allocate space for val, colidx, rlen */
152: /* FIXME: should B's old memory be unlogged? */
153: PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
154: /* FIXME: assuming an element of the bit array takes 8 bits */
155: PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
156: /* b->rlen will count the nonzeros in each row so far. We don't copy rlen to b->rlen because the matrix values have not been set yet. */
157: PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
159: b->singlemalloc = PETSC_TRUE;
160: b->free_val = PETSC_TRUE;
161: b->free_colidx = PETSC_TRUE;
162: } else {
163: b->free_val = PETSC_FALSE;
164: b->free_colidx = PETSC_FALSE;
165: }
167: b->nz = 0;
168: b->maxallocrow = maxallocrow;
169: b->rlenmax = maxallocrow;
170: b->maxallocmat = b->sliidx[totalslices];
171: B->info.nz_unneeded = (double)b->maxallocmat;
172: if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
173: PetscFunctionReturn(PETSC_SUCCESS);
174: }
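/*
  Informal layout summary (as used by the routines below): rows are grouped into slices of 8;
  slice s occupies entries [sliidx[s], sliidx[s+1]) of val and colidx, stored column by column
  within the slice. The k-th stored entry of row r therefore lives at

    sliidx[r >> 3] + (r & 0x07) + 8 * k

  e.g. the second stored entry (k = 1) of row 10 (slice 1, lane 2) sits at sliidx[1] + 2 + 8.
*/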
176: PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
177: {
178: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
179: PetscInt shift;
181: PetscFunctionBegin;
182: PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
183: if (nz) *nz = a->rlen[row];
184: shift = a->sliidx[row >> 3] + (row & 0x07);
185: if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
186: if (idx) {
187: PetscInt j;
188: for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
189: *idx = a->getrowcols;
190: }
191: if (v) {
192: PetscInt j;
193: for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
194: *v = a->getrowvals;
195: }
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
199: PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
200: {
201: PetscFunctionBegin;
202: PetscFunctionReturn(PETSC_SUCCESS);
203: }
205: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
206: {
207: Mat B;
208: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
209: PetscInt i;
211: PetscFunctionBegin;
212: if (reuse == MAT_REUSE_MATRIX) {
213: B = *newmat;
214: PetscCall(MatZeroEntries(B));
215: } else {
216: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
217: PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
218: PetscCall(MatSetType(B, MATSEQAIJ));
219: PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
220: }
222: for (i = 0; i < A->rmap->n; i++) {
223: PetscInt nz = 0, *cols = NULL;
224: PetscScalar *vals = NULL;
226: PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
227: PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
228: PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
229: }
231: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
232: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
233: B->rmap->bs = A->rmap->bs;
235: if (reuse == MAT_INPLACE_MATRIX) {
236: PetscCall(MatHeaderReplace(A, &B));
237: } else {
238: *newmat = B;
239: }
240: PetscFunctionReturn(PETSC_SUCCESS);
241: }
243: #include <../src/mat/impls/aij/seq/aij.h>
245: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
246: {
247: Mat B;
248: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
249: PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
250: const PetscInt *cols;
251: const PetscScalar *vals;
253: PetscFunctionBegin;
255: if (reuse == MAT_REUSE_MATRIX) {
256: B = *newmat;
257: } else {
258: if (PetscDefined(USE_DEBUG) || !a->ilen) {
259: PetscCall(PetscMalloc1(m, &rowlengths));
260: for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
261: }
262: if (PetscDefined(USE_DEBUG) && a->ilen) {
263: PetscBool eq;
264: PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
265: PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
266: PetscCall(PetscFree(rowlengths));
267: rowlengths = a->ilen;
268: } else if (a->ilen) rowlengths = a->ilen;
269: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
270: PetscCall(MatSetSizes(B, m, n, m, n));
271: PetscCall(MatSetType(B, MATSEQSELL));
272: PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
273: if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
274: }
276: for (row = 0; row < m; row++) {
277: PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
278: PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
279: PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
280: }
281: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
282: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
283: B->rmap->bs = A->rmap->bs;
285: if (reuse == MAT_INPLACE_MATRIX) {
286: PetscCall(MatHeaderReplace(A, &B));
287: } else {
288: *newmat = B;
289: }
290: PetscFunctionReturn(PETSC_SUCCESS);
291: }
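/*
  Illustrative usage sketch (assumes A is an assembled MATSEQAIJ matrix): the conversion above is
  normally reached through MatConvert(), e.g.

    Mat B;
    PetscCall(MatConvert(A, MATSEQSELL, MAT_INITIAL_MATRIX, &B));
    ... use B, then ...
    PetscCall(MatDestroy(&B));
*/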
293: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
294: {
295: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
296: PetscScalar *y;
297: const PetscScalar *x;
298: const MatScalar *aval = a->val;
299: PetscInt totalslices = a->totalslices;
300: const PetscInt *acolidx = a->colidx;
301: PetscInt i, j;
302: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
303: __m512d vec_x, vec_y, vec_vals;
304: __m256i vec_idx;
305: __mmask8 mask;
306: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
307: __m256i vec_idx2, vec_idx3, vec_idx4;
308: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309: __m128i vec_idx;
310: __m256d vec_x, vec_y, vec_y2, vec_vals;
311: MatScalar yval;
312: PetscInt r, rows_left, row, nnz_in_row;
313: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
314: __m128d vec_x_tmp;
315: __m256d vec_x, vec_y, vec_y2, vec_vals;
316: MatScalar yval;
317: PetscInt r, rows_left, row, nnz_in_row;
318: #else
319: PetscScalar sum[8];
320: #endif
322: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
323: #pragma disjoint(*x, *y, *aval)
324: #endif
326: PetscFunctionBegin;
327: PetscCall(VecGetArrayRead(xx, &x));
328: PetscCall(VecGetArray(yy, &y));
329: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330: for (i = 0; i < totalslices; i++) { /* loop over slices */
331: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
332: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
334: vec_y = _mm512_setzero_pd();
335: vec_y2 = _mm512_setzero_pd();
336: vec_y3 = _mm512_setzero_pd();
337: vec_y4 = _mm512_setzero_pd();
339: j = a->sliidx[i] >> 3; /* convert the entry index to a slice-column index: each slice column holds 8 entries */
340: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
341: case 3:
342: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
343: acolidx += 8;
344: aval += 8;
345: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
346: acolidx += 8;
347: aval += 8;
348: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
349: acolidx += 8;
350: aval += 8;
351: j += 3;
352: break;
353: case 2:
354: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
355: acolidx += 8;
356: aval += 8;
357: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
358: acolidx += 8;
359: aval += 8;
360: j += 2;
361: break;
362: case 1:
363: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
364: acolidx += 8;
365: aval += 8;
366: j += 1;
367: break;
368: }
369: #pragma novector
370: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
371: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372: acolidx += 8;
373: aval += 8;
374: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375: acolidx += 8;
376: aval += 8;
377: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378: acolidx += 8;
379: aval += 8;
380: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
381: acolidx += 8;
382: aval += 8;
383: }
385: vec_y = _mm512_add_pd(vec_y, vec_y2);
386: vec_y = _mm512_add_pd(vec_y, vec_y3);
387: vec_y = _mm512_add_pd(vec_y, vec_y4);
388: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
389: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
390: _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
391: } else {
392: _mm512_storeu_pd(&y[8 * i], vec_y);
393: }
394: }
395: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
396: for (i = 0; i < totalslices; i++) { /* loop over full slices */
397: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
398: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
400: /* last slice may have padding rows. Don't use vectorization. */
401: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
402: rows_left = A->rmap->n - 8 * i;
403: for (r = 0; r < rows_left; ++r) {
404: yval = (MatScalar)0;
405: row = 8 * i + r;
406: nnz_in_row = a->rlen[row];
407: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
408: y[row] = yval;
409: }
410: break;
411: }
413: vec_y = _mm256_setzero_pd();
414: vec_y2 = _mm256_setzero_pd();
416: /* Process a slice of height 8 (512 bits) as two subslices of height 4 (256 bits) using AVX */
417: #pragma novector
418: #pragma unroll(2)
419: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
420: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
421: aval += 4;
422: acolidx += 4;
423: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
424: aval += 4;
425: acolidx += 4;
426: }
428: _mm256_storeu_pd(y + i * 8, vec_y);
429: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
430: }
431: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
432: for (i = 0; i < totalslices; i++) { /* loop over full slices */
433: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
434: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
436: vec_y = _mm256_setzero_pd();
437: vec_y2 = _mm256_setzero_pd();
439: /* last slice may have padding rows. Don't use vectorization. */
440: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
441: rows_left = A->rmap->n - 8 * i;
442: for (r = 0; r < rows_left; ++r) {
443: yval = (MatScalar)0;
444: row = 8 * i + r;
445: nnz_in_row = a->rlen[row];
446: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
447: y[row] = yval;
448: }
449: break;
450: }
452: /* Process a slice of height 8 (512 bits) as two subslices of height 4 (256 bits) using AVX */
453: #pragma novector
454: #pragma unroll(2)
455: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
456: vec_vals = _mm256_loadu_pd(aval);
457: vec_x_tmp = _mm_setzero_pd();
458: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
461: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
464: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
465: aval += 4;
467: vec_vals = _mm256_loadu_pd(aval);
468: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
469: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
470: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
471: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
472: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
473: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
474: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
475: aval += 4;
476: }
478: _mm256_storeu_pd(y + i * 8, vec_y);
479: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
480: }
481: #else
482: for (i = 0; i < totalslices; i++) { /* loop over slices */
483: for (j = 0; j < 8; j++) sum[j] = 0.0;
484: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
485: sum[0] += aval[j] * x[acolidx[j]];
486: sum[1] += aval[j + 1] * x[acolidx[j + 1]];
487: sum[2] += aval[j + 2] * x[acolidx[j + 2]];
488: sum[3] += aval[j + 3] * x[acolidx[j + 3]];
489: sum[4] += aval[j + 4] * x[acolidx[j + 4]];
490: sum[5] += aval[j + 5] * x[acolidx[j + 5]];
491: sum[6] += aval[j + 6] * x[acolidx[j + 6]];
492: sum[7] += aval[j + 7] * x[acolidx[j + 7]];
493: }
494: if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
495: for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
496: } else {
497: for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
498: }
499: }
500: #endif
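  /* Flop count: one multiply per stored nonzero plus one add per nonzero beyond the first in each
     nonempty row, i.e. nz multiplies and nz - nonzerorowcnt adds; padded zeros are not counted. */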
502: PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
503: PetscCall(VecRestoreArrayRead(xx, &x));
504: PetscCall(VecRestoreArray(yy, &y));
505: PetscFunctionReturn(PETSC_SUCCESS);
506: }
508: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
509: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
510: {
511: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
512: PetscScalar *y, *z;
513: const PetscScalar *x;
514: const MatScalar *aval = a->val;
515: PetscInt totalslices = a->totalslices;
516: const PetscInt *acolidx = a->colidx;
517: PetscInt i, j;
518: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
519: __m512d vec_x, vec_y, vec_vals;
520: __m256i vec_idx;
521: __mmask8 mask;
522: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
523: __m256i vec_idx2, vec_idx3, vec_idx4;
524: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
525: __m128d vec_x_tmp;
526: __m256d vec_x, vec_y, vec_y2, vec_vals;
527: MatScalar yval;
528: PetscInt r, row, nnz_in_row;
529: #else
530: PetscScalar sum[8];
531: #endif
533: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
534: #pragma disjoint(*x, *y, *aval)
535: #endif
537: PetscFunctionBegin;
538: PetscCall(VecGetArrayRead(xx, &x));
539: PetscCall(VecGetArrayPair(yy, zz, &y, &z));
540: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
541: for (i = 0; i < totalslices; i++) { /* loop over slices */
542: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
543: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
545: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
546: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
547: vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
548: } else {
549: vec_y = _mm512_loadu_pd(&y[8 * i]);
550: }
551: vec_y2 = _mm512_setzero_pd();
552: vec_y3 = _mm512_setzero_pd();
553: vec_y4 = _mm512_setzero_pd();
555: j = a->sliidx[i] >> 3; /* convert the entry index to a slice-column index: each slice column holds 8 entries */
556: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
557: case 3:
558: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
559: acolidx += 8;
560: aval += 8;
561: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
562: acolidx += 8;
563: aval += 8;
564: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
565: acolidx += 8;
566: aval += 8;
567: j += 3;
568: break;
569: case 2:
570: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
571: acolidx += 8;
572: aval += 8;
573: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
574: acolidx += 8;
575: aval += 8;
576: j += 2;
577: break;
578: case 1:
579: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
580: acolidx += 8;
581: aval += 8;
582: j += 1;
583: break;
584: }
585: #pragma novector
586: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
587: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
588: acolidx += 8;
589: aval += 8;
590: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
591: acolidx += 8;
592: aval += 8;
593: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
594: acolidx += 8;
595: aval += 8;
596: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
597: acolidx += 8;
598: aval += 8;
599: }
601: vec_y = _mm512_add_pd(vec_y, vec_y2);
602: vec_y = _mm512_add_pd(vec_y, vec_y3);
603: vec_y = _mm512_add_pd(vec_y, vec_y4);
604: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
605: _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
606: } else {
607: _mm512_storeu_pd(&z[8 * i], vec_y);
608: }
609: }
610: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
611: for (i = 0; i < totalslices; i++) { /* loop over full slices */
612: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
613: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
615: /* last slice may have padding rows. Don't use vectorization. */
616: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
617: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
618: row = 8 * i + r;
619: yval = (MatScalar)0.0;
620: nnz_in_row = a->rlen[row];
621: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
622: z[row] = y[row] + yval;
623: }
624: break;
625: }
627: vec_y = _mm256_loadu_pd(y + 8 * i);
628: vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
630: /* Process a slice of height 8 (512 bits) as two subslices of height 4 (256 bits) using AVX */
631: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
632: vec_vals = _mm256_loadu_pd(aval);
633: vec_x_tmp = _mm_setzero_pd();
634: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
635: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
636: vec_x = _mm256_setzero_pd();
637: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
638: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
639: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
640: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
641: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
642: aval += 4;
644: vec_vals = _mm256_loadu_pd(aval);
645: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
646: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
647: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
648: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
649: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
650: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
651: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
652: aval += 4;
653: }
655: _mm256_storeu_pd(z + i * 8, vec_y);
656: _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
657: }
658: #else
659: for (i = 0; i < totalslices; i++) { /* loop over slices */
660: for (j = 0; j < 8; j++) sum[j] = 0.0;
661: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
662: sum[0] += aval[j] * x[acolidx[j]];
663: sum[1] += aval[j + 1] * x[acolidx[j + 1]];
664: sum[2] += aval[j + 2] * x[acolidx[j + 2]];
665: sum[3] += aval[j + 3] * x[acolidx[j + 3]];
666: sum[4] += aval[j + 4] * x[acolidx[j + 4]];
667: sum[5] += aval[j + 5] * x[acolidx[j + 5]];
668: sum[6] += aval[j + 6] * x[acolidx[j + 6]];
669: sum[7] += aval[j + 7] * x[acolidx[j + 7]];
670: }
671: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
672: for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
673: } else {
674: for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
675: }
676: }
677: #endif
679: PetscCall(PetscLogFlops(2.0 * a->nz));
680: PetscCall(VecRestoreArrayRead(xx, &x));
681: PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
682: PetscFunctionReturn(PETSC_SUCCESS);
683: }
685: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
686: {
687: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
688: PetscScalar *y;
689: const PetscScalar *x;
690: const MatScalar *aval = a->val;
691: const PetscInt *acolidx = a->colidx;
692: PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices;
694: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
695: #pragma disjoint(*x, *y, *aval)
696: #endif
698: PetscFunctionBegin;
699: if (A->symmetric == PETSC_BOOL3_TRUE) {
700: PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
701: PetscFunctionReturn(PETSC_SUCCESS);
702: }
703: if (zz != yy) PetscCall(VecCopy(zz, yy));
704: PetscCall(VecGetArrayRead(xx, &x));
705: PetscCall(VecGetArray(yy, &y));
706: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
707: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
708: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
709: row = 8 * i + r;
710: nnz_in_row = a->rlen[row];
711: for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
712: }
713: break;
714: }
715: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
716: y[acolidx[j]] += aval[j] * x[8 * i];
717: y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
718: y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
719: y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
720: y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
721: y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
722: y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
723: y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
724: }
725: }
726: PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
727: PetscCall(VecRestoreArrayRead(xx, &x));
728: PetscCall(VecRestoreArray(yy, &y));
729: PetscFunctionReturn(PETSC_SUCCESS);
730: }
732: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
733: {
734: PetscFunctionBegin;
735: if (A->symmetric == PETSC_BOOL3_TRUE) {
736: PetscCall(MatMult_SeqSELL(A, xx, yy));
737: } else {
738: PetscCall(VecSet(yy, 0.0));
739: PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
740: }
741: PetscFunctionReturn(PETSC_SUCCESS);
742: }
744: /*
745: Checks for missing diagonals
746: */
747: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
748: {
749: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
750: PetscInt *diag, i;
752: PetscFunctionBegin;
753: *missing = PETSC_FALSE;
754: if (A->rmap->n > 0 && !(a->colidx)) {
755: *missing = PETSC_TRUE;
756: if (d) *d = 0;
757: PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
758: } else {
759: diag = a->diag;
760: for (i = 0; i < A->rmap->n; i++) {
761: if (diag[i] == -1) {
762: *missing = PETSC_TRUE;
763: if (d) *d = i;
764: PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
765: break;
766: }
767: }
768: }
769: PetscFunctionReturn(PETSC_SUCCESS);
770: }
772: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
773: {
774: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
775: PetscInt i, j, m = A->rmap->n, shift;
777: PetscFunctionBegin;
778: if (!a->diag) {
779: PetscCall(PetscMalloc1(m, &a->diag));
780: a->free_diag = PETSC_TRUE;
781: }
782: for (i = 0; i < m; i++) { /* loop over rows */
783: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
784: a->diag[i] = -1;
785: for (j = 0; j < a->rlen[i]; j++) {
786: if (a->colidx[shift + j * 8] == i) {
787: a->diag[i] = shift + j * 8;
788: break;
789: }
790: }
791: }
792: PetscFunctionReturn(PETSC_SUCCESS);
793: }
795: /*
796: A negative shift indicates that a zero diagonal should not generate an error; just invert it anyway
797: */
798: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
799: {
800: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
801: PetscInt i, *diag, m = A->rmap->n;
802: MatScalar *val = a->val;
803: PetscScalar *idiag, *mdiag;
805: PetscFunctionBegin;
806: if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
807: PetscCall(MatMarkDiagonal_SeqSELL(A));
808: diag = a->diag;
809: if (!a->idiag) {
810: PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
811: val = a->val;
812: }
813: mdiag = a->mdiag;
814: idiag = a->idiag;
816: if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
817: for (i = 0; i < m; i++) {
818: mdiag[i] = val[diag[i]];
819: if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
820: PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
821: PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
822: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
823: A->factorerror_zeropivot_value = 0.0;
824: A->factorerror_zeropivot_row = i;
825: }
826: idiag[i] = 1.0 / val[diag[i]];
827: }
828: PetscCall(PetscLogFlops(m));
829: } else {
830: for (i = 0; i < m; i++) {
831: mdiag[i] = val[diag[i]];
832: idiag[i] = omega / (fshift + val[diag[i]]);
833: }
834: PetscCall(PetscLogFlops(2.0 * m));
835: }
836: a->idiagvalid = PETSC_TRUE;
837: PetscFunctionReturn(PETSC_SUCCESS);
838: }
840: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
841: {
842: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
844: PetscFunctionBegin;
845: PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
846: PetscCall(MatSeqSELLInvalidateDiagonal(A));
847: PetscFunctionReturn(PETSC_SUCCESS);
848: }
850: PetscErrorCode MatDestroy_SeqSELL(Mat A)
851: {
852: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
854: PetscFunctionBegin;
855: #if defined(PETSC_USE_LOG)
856: PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
857: #endif
858: PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
859: PetscCall(ISDestroy(&a->row));
860: PetscCall(ISDestroy(&a->col));
861: PetscCall(PetscFree(a->diag));
862: PetscCall(PetscFree(a->rlen));
863: PetscCall(PetscFree(a->sliidx));
864: PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
865: PetscCall(PetscFree(a->solve_work));
866: PetscCall(ISDestroy(&a->icol));
867: PetscCall(PetscFree(a->saved_values));
868: PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
870: PetscCall(PetscFree(A->data));
872: PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
873: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
874: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
875: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
876: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
877: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
878: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
879: PetscFunctionReturn(PETSC_SUCCESS);
880: }
882: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
883: {
884: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
886: PetscFunctionBegin;
887: switch (op) {
888: case MAT_ROW_ORIENTED:
889: a->roworiented = flg;
890: break;
891: case MAT_KEEP_NONZERO_PATTERN:
892: a->keepnonzeropattern = flg;
893: break;
894: case MAT_NEW_NONZERO_LOCATIONS:
895: a->nonew = (flg ? 0 : 1);
896: break;
897: case MAT_NEW_NONZERO_LOCATION_ERR:
898: a->nonew = (flg ? -1 : 0);
899: break;
900: case MAT_NEW_NONZERO_ALLOCATION_ERR:
901: a->nonew = (flg ? -2 : 0);
902: break;
903: case MAT_UNUSED_NONZERO_LOCATION_ERR:
904: a->nounused = (flg ? -1 : 0);
905: break;
906: case MAT_FORCE_DIAGONAL_ENTRIES:
907: case MAT_IGNORE_OFF_PROC_ENTRIES:
908: case MAT_USE_HASH_TABLE:
909: case MAT_SORTED_FULL:
910: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
911: break;
912: case MAT_SPD:
913: case MAT_SYMMETRIC:
914: case MAT_STRUCTURALLY_SYMMETRIC:
915: case MAT_HERMITIAN:
916: case MAT_SYMMETRY_ETERNAL:
917: case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
918: case MAT_SPD_ETERNAL:
919: /* These options are handled directly by MatSetOption() */
920: break;
921: default:
922: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
923: }
924: PetscFunctionReturn(PETSC_SUCCESS);
925: }
927: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
928: {
929: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
930: PetscInt i, j, n, shift;
931: PetscScalar *x, zero = 0.0;
933: PetscFunctionBegin;
934: PetscCall(VecGetLocalSize(v, &n));
935: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
937: if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
938: PetscInt *diag = a->diag;
939: PetscCall(VecGetArray(v, &x));
940: for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
941: PetscCall(VecRestoreArray(v, &x));
942: PetscFunctionReturn(PETSC_SUCCESS);
943: }
945: PetscCall(VecSet(v, zero));
946: PetscCall(VecGetArray(v, &x));
947: for (i = 0; i < n; i++) { /* loop over rows */
948: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
949: x[i] = 0;
950: for (j = 0; j < a->rlen[i]; j++) {
951: if (a->colidx[shift + j * 8] == i) {
952: x[i] = a->val[shift + j * 8];
953: break;
954: }
955: }
956: }
957: PetscCall(VecRestoreArray(v, &x));
958: PetscFunctionReturn(PETSC_SUCCESS);
959: }
961: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
962: {
963: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
964: const PetscScalar *l, *r;
965: PetscInt i, j, m, n, row;
967: PetscFunctionBegin;
968: if (ll) {
969: /* The local size is used so that VecMPI can be passed to this routine
970: by MatDiagonalScale_MPISELL */
971: PetscCall(VecGetLocalSize(ll, &m));
972: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
973: PetscCall(VecGetArrayRead(ll, &l));
974: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
975: if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
976: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
977: if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
978: }
979: } else {
980: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
981: }
982: }
983: PetscCall(VecRestoreArrayRead(ll, &l));
984: PetscCall(PetscLogFlops(a->nz));
985: }
986: if (rr) {
987: PetscCall(VecGetLocalSize(rr, &n));
988: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
989: PetscCall(VecGetArrayRead(rr, &r));
990: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
991: if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
992: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
993: if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
994: }
995: } else {
996: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
997: }
998: }
999: PetscCall(VecRestoreArrayRead(rr, &r));
1000: PetscCall(PetscLogFlops(a->nz));
1001: }
1002: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1003: PetscFunctionReturn(PETSC_SUCCESS);
1004: }
1006: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1007: {
1008: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1009: PetscInt *cp, i, k, low, high, t, row, col, l;
1010: PetscInt shift;
1011: MatScalar *vp;
1013: PetscFunctionBegin;
1014: for (k = 0; k < m; k++) { /* loop over requested rows */
1015: row = im[k];
1016: if (row < 0) continue;
1017: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1018: shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1019: cp = a->colidx + shift; /* pointer to the row */
1020: vp = a->val + shift; /* pointer to the row */
1021: for (l = 0; l < n; l++) { /* loop over requested columns */
1022: col = in[l];
1023: if (col < 0) continue;
1024: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1025: high = a->rlen[row];
1026: low = 0; /* assume unsorted */
1027: while (high - low > 5) {
1028: t = (low + high) / 2;
1029: if (*(cp + t * 8) > col) high = t;
1030: else low = t;
1031: }
1032: for (i = low; i < high; i++) {
1033: if (*(cp + 8 * i) > col) break;
1034: if (*(cp + 8 * i) == col) {
1035: *v++ = *(vp + 8 * i);
1036: goto finished;
1037: }
1038: }
1039: *v++ = 0.0;
1040: finished:;
1041: }
1042: }
1043: PetscFunctionReturn(PETSC_SUCCESS);
1044: }
1046: PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1047: {
1048: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1049: PetscInt i, j, m = A->rmap->n, shift;
1050: const char *name;
1051: PetscViewerFormat format;
1053: PetscFunctionBegin;
1054: PetscCall(PetscViewerGetFormat(viewer, &format));
1055: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1056: PetscInt nofinalvalue = 0;
1057: /*
1058: if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1059: nofinalvalue = 1;
1060: }
1061: */
1062: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1063: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1064: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1065: #if defined(PETSC_USE_COMPLEX)
1066: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1067: #else
1068: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1069: #endif
1070: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1072: for (i = 0; i < m; i++) {
1073: shift = a->sliidx[i >> 3] + (i & 0x07);
1074: for (j = 0; j < a->rlen[i]; j++) {
1075: #if defined(PETSC_USE_COMPLEX)
1076: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1077: #else
1078: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1079: #endif
1080: }
1081: }
1082: /*
1083: if (nofinalvalue) {
1084: #if defined(PETSC_USE_COMPLEX)
1085: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1086: #else
1087: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0));
1088: #endif
1089: }
1090: */
1091: PetscCall(PetscObjectGetName((PetscObject)A, &name));
1092: PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1093: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1094: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1095: PetscFunctionReturn(PETSC_SUCCESS);
1096: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1097: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1098: for (i = 0; i < m; i++) {
1099: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1100: shift = a->sliidx[i >> 3] + (i & 0x07);
1101: for (j = 0; j < a->rlen[i]; j++) {
1102: #if defined(PETSC_USE_COMPLEX)
1103: if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1104: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1105: } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1106: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1107: } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1108: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1109: }
1110: #else
1111: if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1112: #endif
1113: }
1114: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1115: }
1116: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1117: } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1118: PetscInt jcnt;
1119: PetscScalar value;
1120: #if defined(PETSC_USE_COMPLEX)
1121: PetscBool realonly = PETSC_TRUE;
1122: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1123: if (PetscImaginaryPart(a->val[i]) != 0.0) {
1124: realonly = PETSC_FALSE;
1125: break;
1126: }
1127: }
1128: #endif
1130: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1131: for (i = 0; i < m; i++) {
1132: jcnt = 0;
1133: shift = a->sliidx[i >> 3] + (i & 0x07);
1134: for (j = 0; j < A->cmap->n; j++) {
1135: if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * jcnt]) {
1136: value = a->val[shift + 8 * jcnt];
1137: jcnt++;
1138: } else {
1139: value = 0.0;
1140: }
1141: #if defined(PETSC_USE_COMPLEX)
1142: if (realonly) {
1143: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1144: } else {
1145: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1146: }
1147: #else
1148: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1149: #endif
1150: }
1151: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1152: }
1153: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1154: } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1155: PetscInt fshift = 1;
1156: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1157: #if defined(PETSC_USE_COMPLEX)
1158: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1159: #else
1160: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1161: #endif
1162: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1163: for (i = 0; i < m; i++) {
1164: shift = a->sliidx[i >> 3] + (i & 0x07);
1165: for (j = 0; j < a->rlen[i]; j++) {
1166: #if defined(PETSC_USE_COMPLEX)
1167: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1168: #else
1169: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1170: #endif
1171: }
1172: }
1173: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1174: } else if (format == PETSC_VIEWER_NATIVE) {
1175: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1176: PetscInt row;
1177: PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1178: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1179: #if defined(PETSC_USE_COMPLEX)
1180: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1181: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1182: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1183: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1184: } else {
1185: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1186: }
1187: #else
1188: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1189: #endif
1190: }
1191: }
1192: } else {
1193: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1194: if (A->factortype) {
1195: for (i = 0; i < m; i++) {
1196: shift = a->sliidx[i >> 3] + (i & 0x07);
1197: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1198: /* L part */
1199: for (j = shift; j < a->diag[i]; j += 8) {
1200: #if defined(PETSC_USE_COMPLEX)
1201: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1202: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1203: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1204: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1205: } else {
1206: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1207: }
1208: #else
1209: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1210: #endif
1211: }
1212: /* diagonal */
1213: j = a->diag[i];
1214: #if defined(PETSC_USE_COMPLEX)
1215: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1216: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1217: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1218: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1219: } else {
1220: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1221: }
1222: #else
1223: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1224: #endif
1226: /* U part */
1227: for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1228: #if defined(PETSC_USE_COMPLEX)
1229: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1230: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1231: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1232: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1233: } else {
1234: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1235: }
1236: #else
1237: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1238: #endif
1239: }
1240: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1241: }
1242: } else {
1243: for (i = 0; i < m; i++) {
1244: shift = a->sliidx[i >> 3] + (i & 0x07);
1245: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1246: for (j = 0; j < a->rlen[i]; j++) {
1247: #if defined(PETSC_USE_COMPLEX)
1248: if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1249: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1250: } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1251: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1252: } else {
1253: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1254: }
1255: #else
1256: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1257: #endif
1258: }
1259: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1260: }
1261: }
1262: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1263: }
1264: PetscCall(PetscViewerFlush(viewer));
1265: PetscFunctionReturn(PETSC_SUCCESS);
1266: }
1268: #include <petscdraw.h>
1269: PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1270: {
1271: Mat A = (Mat)Aa;
1272: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1273: PetscInt i, j, m = A->rmap->n, shift;
1274: int color;
1275: PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1276: PetscViewer viewer;
1277: PetscViewerFormat format;
1279: PetscFunctionBegin;
1280: PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1281: PetscCall(PetscViewerGetFormat(viewer, &format));
1282: PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1284: /* loop over matrix elements drawing boxes */
1286: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1287: PetscDrawCollectiveBegin(draw);
1288: /* Blue for negative, Cyan for zero and Red for positive */
1289: color = PETSC_DRAW_BLUE;
1290: for (i = 0; i < m; i++) {
1291: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1292: y_l = m - i - 1.0;
1293: y_r = y_l + 1.0;
1294: for (j = 0; j < a->rlen[i]; j++) {
1295: x_l = a->colidx[shift + j * 8];
1296: x_r = x_l + 1.0;
1297: if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1298: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1299: }
1300: }
1301: color = PETSC_DRAW_CYAN;
1302: for (i = 0; i < m; i++) {
1303: shift = a->sliidx[i >> 3] + (i & 0x07);
1304: y_l = m - i - 1.0;
1305: y_r = y_l + 1.0;
1306: for (j = 0; j < a->rlen[i]; j++) {
1307: x_l = a->colidx[shift + j * 8];
1308: x_r = x_l + 1.0;
1309: if (a->val[shift + 8 * j] != 0.) continue;
1310: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1311: }
1312: }
1313: color = PETSC_DRAW_RED;
1314: for (i = 0; i < m; i++) {
1315: shift = a->sliidx[i >> 3] + (i & 0x07);
1316: y_l = m - i - 1.0;
1317: y_r = y_l + 1.0;
1318: for (j = 0; j < a->rlen[i]; j++) {
1319: x_l = a->colidx[shift + j * 8];
1320: x_r = x_l + 1.0;
1321: if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1322: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1323: }
1324: }
1325: PetscDrawCollectiveEnd(draw);
1326: } else {
1327: /* use contour shading to indicate magnitude of values */
1328: /* first determine max of all nonzero values */
1329: PetscReal minv = 0.0, maxv = 0.0;
1331: PetscDraw popup;
1332: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1333: if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1334: }
1335: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1336: PetscCall(PetscDrawGetPopup(draw, &popup));
1337: PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1339: PetscDrawCollectiveBegin(draw);
1340: for (i = 0; i < m; i++) {
1341: shift = a->sliidx[i >> 3] + (i & 0x07);
1342: y_l = m - i - 1.0;
1343: y_r = y_l + 1.0;
1344: for (j = 0; j < a->rlen[i]; j++) {
1345: x_l = a->colidx[shift + j * 8];
1346: x_r = x_l + 1.0;
1347: color = PetscDrawRealToColor(PetscAbsScalar(a->val[shift + 8 * j]), minv, maxv);
1348: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1350: }
1351: }
1352: PetscDrawCollectiveEnd(draw);
1353: }
1354: PetscFunctionReturn(PETSC_SUCCESS);
1355: }
1357: #include <petscdraw.h>
1358: PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1359: {
1360: PetscDraw draw;
1361: PetscReal xr, yr, xl, yl, h, w;
1362: PetscBool isnull;
1364: PetscFunctionBegin;
1365: PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1366: PetscCall(PetscDrawIsNull(draw, &isnull));
1367: if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1369: xr = A->cmap->n;
1370: yr = A->rmap->n;
1371: h = yr / 10.0;
1372: w = xr / 10.0;
1373: xr += w;
1374: yr += h;
1375: xl = -w;
1376: yl = -h;
1377: PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1378: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1379: PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1380: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1381: PetscCall(PetscDrawSave(draw));
1382: PetscFunctionReturn(PETSC_SUCCESS);
1383: }
1385: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1386: {
1387: PetscBool iascii, isbinary, isdraw;
1389: PetscFunctionBegin;
1390: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1391: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1392: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1393: if (iascii) {
1394: PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1395: } else if (isbinary) {
1396: /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1397: } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1398: PetscFunctionReturn(PETSC_SUCCESS);
1399: }
1401: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1402: {
1403: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1404: PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1405: MatScalar *vp;
1407: PetscFunctionBegin;
1408: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1409: /* To do: compress out the unused elements */
1410: PetscCall(MatMarkDiagonal_SeqSELL(A));
1411: PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1412: PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1413: PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1414: /* Set unused slots for column indices to the last valid column index. Set unused slots for values to zero. This allows the use of unmasked intrinsics -> higher performance */
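/*
   Editorial illustration (not part of the original source), using a slice height of 2 instead of 8 for
   brevity: a slice whose rows have nonzeros (a00, a02, a03) and (a10, a12) is stored column-major and
   padded as

      colidx = 0 0 2 2 3 2      val = a00 a10 a02 a12 a03 0

   i.e. the short row repeats its last valid column index (2) in the unused colidx slot and stores an
   explicit zero value, so vector gathers through colidx stay in bounds and the extra multiply-add
   contributes nothing.
*/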
1415: for (i = 0; i < a->totalslices; ++i) {
1416: shift = a->sliidx[i]; /* starting index of the slice */
1417: cp = a->colidx + shift; /* pointer to the column indices of the slice */
1418: vp = a->val + shift; /* pointer to the nonzero values of the slice */
1419: for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1420: row = 8 * i + row_in_slice;
1421: nrow = a->rlen[row]; /* number of nonzeros in row */
1422: /*
1423: Search for the nearest nonzero. Setting the padded index to zero would normally risk extra communication.
1424: But if the entire slice is empty, it is fine to use 0 since the index will never be loaded.
1425: */
1426: lastcol = 0;
1427: if (nrow > 0) { /* nonempty row */
1428: lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index of the last nonzero in the current row */
1429: } else if (!row_in_slice) { /* first row of the current slice is empty */
1430: for (j = 1; j < 8; j++) {
1431: if (a->rlen[8 * i + j]) {
1432: lastcol = cp[j];
1433: break;
1434: }
1435: }
1436: } else {
1437: if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1438: }
1440: for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1441: cp[8 * k + row_in_slice] = lastcol;
1442: vp[8 * k + row_in_slice] = (MatScalar)0;
1443: }
1444: }
1445: }
1447: A->info.mallocs += a->reallocs;
1448: a->reallocs = 0;
1450: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1451: PetscFunctionReturn(PETSC_SUCCESS);
1452: }
1454: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1455: {
1456: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1458: PetscFunctionBegin;
1459: info->block_size = 1.0;
1460: info->nz_allocated = a->maxallocmat;
1461: info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
1462: info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
1463: info->assemblies = A->num_ass;
1464: info->mallocs = A->info.mallocs;
1465: info->memory = 0; /* REVIEW ME */
1466: if (A->factortype) {
1467: info->fill_ratio_given = A->info.fill_ratio_given;
1468: info->fill_ratio_needed = A->info.fill_ratio_needed;
1469: info->factor_mallocs = A->info.factor_mallocs;
1470: } else {
1471: info->fill_ratio_given = 0;
1472: info->fill_ratio_needed = 0;
1473: info->factor_mallocs = 0;
1474: }
1475: PetscFunctionReturn(PETSC_SUCCESS);
1476: }
1478: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1479: {
1480: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1481: PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
1482: PetscInt *cp, nonew = a->nonew, lastcol = -1;
1483: MatScalar *vp, value;
1485: PetscFunctionBegin;
1486: for (k = 0; k < m; k++) { /* loop over added rows */
1487: row = im[k];
1488: if (row < 0) continue;
1489: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1490: shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1491: cp = a->colidx + shift; /* pointer to the row */
1492: vp = a->val + shift; /* pointer to the row */
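/*
   Editorial addressing note (the numbers below are hypothetical): within a slice of height 8 the storage is
   column-major, so the j-th stored entry of this row lives at index shift + 8*j in both colidx[] and val[].
   For example, with sliidx[] = {0, 40, ...}, entry j = 2 of global row 10 (slice 1, lane 2) sits at
   40 + 2 + 8*2 = 58.
*/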
1493: nrow = a->rlen[row];
1494: low = 0;
1495: high = nrow;
1497: for (l = 0; l < n; l++) { /* loop over added columns */
1498: col = in[l];
1499: if (col < 0) continue;
1500: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: col %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1501: if (a->roworiented) {
1502: value = v[l + k * n];
1503: } else {
1504: value = v[k + l * m];
1505: }
1506: if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1508: /* search in this row for the specified column; i indicates the slot to be set */
1509: if (col <= lastcol) low = 0;
1510: else high = nrow;
1511: lastcol = col;
1512: while (high - low > 5) {
1513: t = (low + high) / 2;
1514: if (*(cp + t * 8) > col) high = t;
1515: else low = t;
1516: }
1517: for (i = low; i < high; i++) {
1518: if (*(cp + i * 8) > col) break;
1519: if (*(cp + i * 8) == col) {
1520: if (is == ADD_VALUES) *(vp + i * 8) += value;
1521: else *(vp + i * 8) = value;
1522: low = i + 1;
1523: goto noinsert;
1524: }
1525: }
1526: if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1527: if (nonew == 1) goto noinsert;
1528: PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1529: /* If the current row length reaches the slice width (i.e. nrow == slice_width), allocate new space; otherwise do nothing */
1530: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1531: /* add the new nonzero at slot i, shifting the remaining elements in the current row to the right by one slot */
1532: for (ii = nrow - 1; ii >= i; ii--) {
1533: *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1534: *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1535: }
1536: a->rlen[row]++;
1537: *(cp + i * 8) = col;
1538: *(vp + i * 8) = value;
1539: a->nz++;
1540: A->nonzerostate++;
1541: low = i + 1;
1542: high++;
1543: nrow++;
1544: noinsert:;
1545: }
1546: a->rlen[row] = nrow;
1547: }
1548: PetscFunctionReturn(PETSC_SUCCESS);
1549: }
1551: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1552: {
1553: PetscFunctionBegin;
1554: /* If the two matrices have the same copy implementation, use fast copy. */
1555: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1556: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1557: Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1559: PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in the two matrices is different");
1560: PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1561: } else {
1562: PetscCall(MatCopy_Basic(A, B, str));
1563: }
1564: PetscFunctionReturn(PETSC_SUCCESS);
1565: }
1567: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1568: {
1569: PetscFunctionBegin;
1570: PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1571: PetscFunctionReturn(PETSC_SUCCESS);
1572: }
1574: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1575: {
1576: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1578: PetscFunctionBegin;
1579: *array = a->val;
1580: PetscFunctionReturn(PETSC_SUCCESS);
1581: }
1583: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1584: {
1585: PetscFunctionBegin;
1586: PetscFunctionReturn(PETSC_SUCCESS);
1587: }
1589: PetscErrorCode MatRealPart_SeqSELL(Mat A)
1590: {
1591: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1592: PetscInt i;
1593: MatScalar *aval = a->val;
1595: PetscFunctionBegin;
1596: for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1597: PetscFunctionReturn(PETSC_SUCCESS);
1598: }
1600: PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1601: {
1602: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1603: PetscInt i;
1604: MatScalar *aval = a->val;
1606: PetscFunctionBegin;
1607: for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1608: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1609: PetscFunctionReturn(PETSC_SUCCESS);
1610: }
1612: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1613: {
1614: Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
1615: MatScalar *aval = a->val;
1616: PetscScalar oalpha = alpha;
1617: PetscBLASInt one = 1, size;
1619: PetscFunctionBegin;
1620: PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1621: PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1622: PetscCall(PetscLogFlops(a->nz));
1623: PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1624: PetscFunctionReturn(PETSC_SUCCESS);
1625: }
1627: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1628: {
1629: Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1631: PetscFunctionBegin;
1632: if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1633: PetscCall(MatShift_Basic(Y, a));
1634: PetscFunctionReturn(PETSC_SUCCESS);
1635: }
1637: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1638: {
1639: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1640: PetscScalar *x, sum, *t;
1641: const MatScalar *idiag = NULL, *mdiag;
1642: const PetscScalar *b, *xb;
1643: PetscInt n, m = A->rmap->n, i, j, shift;
1644: const PetscInt *diag;
1646: PetscFunctionBegin;
1647: its = its * lits;
1649: if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1650: if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1651: a->fshift = fshift;
1652: a->omega = omega;
1654: diag = a->diag;
1655: t = a->ssor_work;
1656: idiag = a->idiag;
1657: mdiag = a->mdiag;
1659: PetscCall(VecGetArray(xx, &x));
1660: PetscCall(VecGetArrayRead(bb, &b));
1661: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1662: PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1663: PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1664: PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
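/*
   Editorial note on the sweeps below, assuming idiag[i] holds omega/(a_ii + fshift) as suggested by the
   "omega in idiag" comments: a forward sweep performs
       x_i <- (1 - omega) x_i + idiag[i] * (b_i - sum_{j<i} a_ij x_j - sum_{j>i} a_ij x_j),
   while the SOR_ZERO_INITIAL_GUESS branch starts from x = 0 and so drops the (1 - omega) x_i and
   upper-triangular terms on its first pass. The lower-triangular partial sums are cached in t[] so that a
   backward sweep immediately following a forward sweep only needs to apply the upper-triangular part.
*/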
1666: if (flag & SOR_ZERO_INITIAL_GUESS) {
1667: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1668: for (i = 0; i < m; i++) {
1669: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1670: sum = b[i];
1671: n = (diag[i] - shift) / 8;
1672: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1673: t[i] = sum;
1674: x[i] = sum * idiag[i];
1675: }
1676: xb = t;
1677: PetscCall(PetscLogFlops(a->nz));
1678: } else xb = b;
1679: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1680: for (i = m - 1; i >= 0; i--) {
1681: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1682: sum = xb[i];
1683: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1684: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1685: if (xb == b) {
1686: x[i] = sum * idiag[i];
1687: } else {
1688: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1689: }
1690: }
1691: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1692: }
1693: its--;
1694: }
1695: while (its--) {
1696: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1697: for (i = 0; i < m; i++) {
1698: /* lower */
1699: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1700: sum = b[i];
1701: n = (diag[i] - shift) / 8;
1702: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1703: t[i] = sum; /* save application of the lower-triangular part */
1704: /* upper */
1705: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1706: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1707: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1708: }
1709: xb = t;
1710: PetscCall(PetscLogFlops(2.0 * a->nz));
1711: } else xb = b;
1712: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1713: for (i = m - 1; i >= 0; i--) {
1714: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1715: sum = xb[i];
1716: if (xb == b) {
1717: /* whole matrix (no checkpointing available) */
1718: n = a->rlen[i];
1719: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1720: x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1721: } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1722: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1723: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1724: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1725: }
1726: }
1727: if (xb == b) {
1728: PetscCall(PetscLogFlops(2.0 * a->nz));
1729: } else {
1730: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1731: }
1732: }
1733: }
1734: PetscCall(VecRestoreArray(xx, &x));
1735: PetscCall(VecRestoreArrayRead(bb, &b));
1736: PetscFunctionReturn(PETSC_SUCCESS);
1737: }
1739: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1740: MatGetRow_SeqSELL,
1741: MatRestoreRow_SeqSELL,
1742: MatMult_SeqSELL,
1743: /* 4*/ MatMultAdd_SeqSELL,
1744: MatMultTranspose_SeqSELL,
1745: MatMultTransposeAdd_SeqSELL,
1746: NULL,
1747: NULL,
1748: NULL,
1749: /* 10*/ NULL,
1750: NULL,
1751: NULL,
1752: MatSOR_SeqSELL,
1753: NULL,
1754: /* 15*/ MatGetInfo_SeqSELL,
1755: MatEqual_SeqSELL,
1756: MatGetDiagonal_SeqSELL,
1757: MatDiagonalScale_SeqSELL,
1758: NULL,
1759: /* 20*/ NULL,
1760: MatAssemblyEnd_SeqSELL,
1761: MatSetOption_SeqSELL,
1762: MatZeroEntries_SeqSELL,
1763: /* 24*/ NULL,
1764: NULL,
1765: NULL,
1766: NULL,
1767: NULL,
1768: /* 29*/ MatSetUp_SeqSELL,
1769: NULL,
1770: NULL,
1771: NULL,
1772: NULL,
1773: /* 34*/ MatDuplicate_SeqSELL,
1774: NULL,
1775: NULL,
1776: NULL,
1777: NULL,
1778: /* 39*/ NULL,
1779: NULL,
1780: NULL,
1781: MatGetValues_SeqSELL,
1782: MatCopy_SeqSELL,
1783: /* 44*/ NULL,
1784: MatScale_SeqSELL,
1785: MatShift_SeqSELL,
1786: NULL,
1787: NULL,
1788: /* 49*/ NULL,
1789: NULL,
1790: NULL,
1791: NULL,
1792: NULL,
1793: /* 54*/ MatFDColoringCreate_SeqXAIJ,
1794: NULL,
1795: NULL,
1796: NULL,
1797: NULL,
1798: /* 59*/ NULL,
1799: MatDestroy_SeqSELL,
1800: MatView_SeqSELL,
1801: NULL,
1802: NULL,
1803: /* 64*/ NULL,
1804: NULL,
1805: NULL,
1806: NULL,
1807: NULL,
1808: /* 69*/ NULL,
1809: NULL,
1810: NULL,
1811: NULL,
1812: NULL,
1813: /* 74*/ NULL,
1814: MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1815: NULL,
1816: NULL,
1817: NULL,
1818: /* 79*/ NULL,
1819: NULL,
1820: NULL,
1821: NULL,
1822: NULL,
1823: /* 84*/ NULL,
1824: NULL,
1825: NULL,
1826: NULL,
1827: NULL,
1828: /* 89*/ NULL,
1829: NULL,
1830: NULL,
1831: NULL,
1832: NULL,
1833: /* 94*/ NULL,
1834: NULL,
1835: NULL,
1836: NULL,
1837: NULL,
1838: /* 99*/ NULL,
1839: NULL,
1840: NULL,
1841: MatConjugate_SeqSELL,
1842: NULL,
1843: /*104*/ NULL,
1844: NULL,
1845: NULL,
1846: NULL,
1847: NULL,
1848: /*109*/ NULL,
1849: NULL,
1850: NULL,
1851: NULL,
1852: MatMissingDiagonal_SeqSELL,
1853: /*114*/ NULL,
1854: NULL,
1855: NULL,
1856: NULL,
1857: NULL,
1858: /*119*/ NULL,
1859: NULL,
1860: NULL,
1861: NULL,
1862: NULL,
1863: /*124*/ NULL,
1864: NULL,
1865: NULL,
1866: NULL,
1867: NULL,
1868: /*129*/ NULL,
1869: NULL,
1870: NULL,
1871: NULL,
1872: NULL,
1873: /*134*/ NULL,
1874: NULL,
1875: NULL,
1876: NULL,
1877: NULL,
1878: /*139*/ NULL,
1879: NULL,
1880: NULL,
1881: MatFDColoringSetUp_SeqXAIJ,
1882: NULL,
1883: /*144*/ NULL,
1884: NULL,
1885: NULL,
1886: NULL,
1887: NULL,
1888: NULL,
1889: /*150*/ NULL,
1890: NULL};
1892: PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1893: {
1894: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1896: PetscFunctionBegin;
1897: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE); first");
1899: /* allocate space for values if not already there */
1900: if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1902: /* copy values over */
1903: PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1904: PetscFunctionReturn(PETSC_SUCCESS);
1905: }
1907: PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1908: {
1909: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1911: PetscFunctionBegin;
1912: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE); first");
1913: PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A); first");
1914: PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1915: PetscFunctionReturn(PETSC_SUCCESS);
1916: }
1918: /*@C
1919: MatSeqSELLRestoreArray - restores access to the array, obtained with `MatSeqSELLGetArray()`, in which the data for a `MATSEQSELL` matrix is stored
1921: Not Collective
1923: Input Parameters:
1924: + mat - a `MATSEQSELL` matrix
1925: - array - pointer to the data
1927: Level: intermediate
1929: .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1930: @*/
1931: PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1932: {
1933: PetscFunctionBegin;
1934: PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1935: PetscFunctionReturn(PETSC_SUCCESS);
1936: }
1938: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1939: {
1940: Mat_SeqSELL *b;
1941: PetscMPIInt size;
1943: PetscFunctionBegin;
1944: PetscCall(PetscCitationsRegister(citation, &cited));
1945: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1946: PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1948: PetscCall(PetscNew(&b));
1950: B->data = (void *)b;
1952: PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1954: b->row = NULL;
1955: b->col = NULL;
1956: b->icol = NULL;
1957: b->reallocs = 0;
1958: b->ignorezeroentries = PETSC_FALSE;
1959: b->roworiented = PETSC_TRUE;
1960: b->nonew = 0;
1961: b->diag = NULL;
1962: b->solve_work = NULL;
1963: B->spptr = NULL;
1964: b->saved_values = NULL;
1965: b->idiag = NULL;
1966: b->mdiag = NULL;
1967: b->ssor_work = NULL;
1968: b->omega = 1.0;
1969: b->fshift = 0.0;
1970: b->idiagvalid = PETSC_FALSE;
1971: b->keepnonzeropattern = PETSC_FALSE;
1973: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1974: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1975: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1976: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1977: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1978: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1979: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1980: PetscFunctionReturn(PETSC_SUCCESS);
1981: }
1983: /*
1984: Given a matrix generated with MatGetFactor(), duplicates all the information in A into C
1985: */
1986: PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1987: {
1988: Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1989: PetscInt i, m = A->rmap->n;
1990: PetscInt totalslices = a->totalslices;
1992: PetscFunctionBegin;
1993: C->factortype = A->factortype;
1994: c->row = NULL;
1995: c->col = NULL;
1996: c->icol = NULL;
1997: c->reallocs = 0;
1998: C->assembled = PETSC_TRUE;
2000: PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2001: PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2003: PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
2004: PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2006: for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2007: for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2009: /* allocate the matrix space */
2010: if (mallocmatspace) {
2011: PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2013: c->singlemalloc = PETSC_TRUE;
2015: if (m > 0) {
2016: PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2017: if (cpvalues == MAT_COPY_VALUES) {
2018: PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2019: } else {
2020: PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2021: }
2022: }
2023: }
2025: c->ignorezeroentries = a->ignorezeroentries;
2026: c->roworiented = a->roworiented;
2027: c->nonew = a->nonew;
2028: if (a->diag) {
2029: PetscCall(PetscMalloc1(m, &c->diag));
2030: for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2031: } else c->diag = NULL;
2033: c->solve_work = NULL;
2034: c->saved_values = NULL;
2035: c->idiag = NULL;
2036: c->ssor_work = NULL;
2037: c->keepnonzeropattern = a->keepnonzeropattern;
2038: c->free_val = PETSC_TRUE;
2039: c->free_colidx = PETSC_TRUE;
2041: c->maxallocmat = a->maxallocmat;
2042: c->maxallocrow = a->maxallocrow;
2043: c->rlenmax = a->rlenmax;
2044: c->nz = a->nz;
2045: C->preallocated = PETSC_TRUE;
2047: c->nonzerorowcnt = a->nonzerorowcnt;
2048: C->nonzerostate = A->nonzerostate;
2050: PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2051: PetscFunctionReturn(PETSC_SUCCESS);
2052: }
2054: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2055: {
2056: PetscFunctionBegin;
2057: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2058: PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2059: if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2060: PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2061: PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2062: PetscFunctionReturn(PETSC_SUCCESS);
2063: }
2065: /*MC
2066: MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2067: based on the sliced Ellpack format
2069: Options Database Keys:
2070: . -mat_type seqsell - sets the matrix type to "seqsell" during a call to `MatSetFromOptions()`
2072: Level: beginner
2074: .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2075: M*/
2077: /*MC
2078: MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2080: This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2081: and `MATMPISELL` otherwise. As a result, for single process communicators,
2082: `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2083: for communicators controlling multiple processes. It is recommended that you call both of
2084: the above preallocation routines for simplicity.
2086: Options Database Keys:
2087: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2089: Level: beginner
2091: Notes:
2092: This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2094: It can provide better performance on Intel and AMD processors with AVX2 or AVX512 support for matrices that have a similar number of
2095: nonzeros in contiguous groups of rows. However, if the computation is memory-bandwidth limited, it may not provide much improvement.
2097: Developer Notes:
2098: On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2100: The sparse matrix format is as follows. For simplicity we assume a slice size of 2; the actual slice size is 8.
2101: .vb
2102:                            (2 0 3 4)
2103:    Consider the matrix A = (5 0 6 0)
2104:                            (0 0 7 8)
2105:                            (0 0 9 9)
2107:    symbolically the Ellpack format can be written as
2109:        (2 3 4 |)            (0 2 3 |)
2110:    v = (5 6 0 |)   colidx = (0 2 2 |)
2111:        ---------            ---------
2112:        (7 8 |)              (2 3 |)
2113:        (9 9 |)              (2 3 |)
2115: The data for 2 contiguous rows of the matrix is stored together in column-major format (with any left-over rows handled as a special case).
2116: Any row in a slice with fewer nonzeros than the rest of the slice (row 1 above) is padded with a previous valid column index in its "extra" colidx[] locations and
2117: zeros in its "extra" v locations, so that the matrix operations do not need special code to handle rows of different length within a slice.
2119: The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
2121: .ve
2123: See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
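   A scalar sketch of y = A*x on this layout (an editorial illustration only; the variable names are made up,
   and the real MatMult_SeqSELL() kernel uses AVX gathers and fused multiply-adds and handles a partially
   filled final slice separately):
.vb
   for (slice = 0; slice < totalslices; slice++) {
     PetscScalar yv[8] = {0};
     for (idx = sliidx[slice]; idx < sliidx[slice + 1]; idx += 8)      /* walk the slice column by column */
       for (lane = 0; lane < 8; lane++) yv[lane] += val[idx + lane] * x[colidx[idx + lane]];
     for (lane = 0; lane < 8; lane++) y[8 * slice + lane] = yv[lane];  /* rows 8*slice .. 8*slice+7 */
   }
.ve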
2125: References:
2126: . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2127: Proceedings of the 47th International Conference on Parallel Processing, 2018.
2129: .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2130: M*/
2132: /*@C
2133: MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2135: Collective
2137: Input Parameters:
2138: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2139: . m - number of rows
2140: . n - number of columns
2141: . maxallocrow - maximum number of nonzeros in a row
2142: - rlen - array containing the number of nonzeros in the various rows
2143: (possibly different for each row) or NULL
2145: Output Parameter:
2146: . A - the matrix
2148: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2149: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2150: [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2152: Notes:
2153: If `rlen` is given then `maxallocrow` is ignored.
2155: Specify the preallocated storage with either `maxallocrow` or `rlen` (not both).
2156: Set `maxallocrow` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2157: allocation. For large problems you MUST preallocate memory or you
2158: will get TERRIBLE performance; see the users' manual chapter on matrices.
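   For example, an illustrative creation sequence for a 100 x 100 matrix with at most 5 nonzeros per row is
.vb
   Mat A;
   MatCreateSeqSELL(PETSC_COMM_SELF, 100, 100, 5, NULL, &A);
   /* fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd(), use the matrix, ... */
   MatDestroy(&A);
.ve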
2160: Level: intermediate
2162: .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2163: @*/
2164: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
2165: {
2166: PetscFunctionBegin;
2167: PetscCall(MatCreate(comm, A));
2168: PetscCall(MatSetSizes(*A, m, n, m, n));
2169: PetscCall(MatSetType(*A, MATSEQSELL));
2170: PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
2171: PetscFunctionReturn(PETSC_SUCCESS);
2172: }
2174: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2175: {
2176: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2177: PetscInt totalslices = a->totalslices;
2179: PetscFunctionBegin;
2180: /* If the matrix dimensions, numbers of nonzeros, or maximum row lengths differ, the matrices are not equal */
2181: if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2182: *flg = PETSC_FALSE;
2183: PetscFunctionReturn(PETSC_SUCCESS);
2184: }
2185: /* check whether the column indices are the same */
2186: PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2187: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2188: /* check whether the values are the same */
2189: PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2190: PetscFunctionReturn(PETSC_SUCCESS);
2191: }
2193: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2194: {
2195: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2197: PetscFunctionBegin;
2198: a->idiagvalid = PETSC_FALSE;
2199: PetscFunctionReturn(PETSC_SUCCESS);
2200: }
2202: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2203: {
2204: #if defined(PETSC_USE_COMPLEX)
2205: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2206: PetscInt i;
2207: PetscScalar *val = a->val;
2209: PetscFunctionBegin;
2210: for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2211: #else
2212: PetscFunctionBegin;
2213: #endif
2214: PetscFunctionReturn(PETSC_SUCCESS);
2215: }