Actual source code: aijkok.kokkos.cxx
1: #include <petscvec_kokkos.hpp>
2: #include <petscpkg_version.h>
3: #include <petsc/private/petscimpl.h>
4: #include <petsc/private/sfimpl.h>
5: #include <petscsystypes.h>
6: #include <petscerror.h>
8: #include <Kokkos_Core.hpp>
9: #include <KokkosBlas.hpp>
10: #include <KokkosSparse_CrsMatrix.hpp>
11: #include <KokkosSparse_spmv.hpp>
12: #include <KokkosSparse_spiluk.hpp>
13: #include <KokkosSparse_sptrsv.hpp>
14: #include <KokkosSparse_spgemm.hpp>
15: #include <KokkosSparse_spadd.hpp>
17: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99)
20: #include <KokkosSparse_Utils.hpp>
21: using KokkosSparse::sort_crs_matrix;
22: using KokkosSparse::Impl::transpose_matrix;
23: #else
24: #include <KokkosKernels_Sorting.hpp>
25: using KokkosKernels::sort_crs_matrix;
26: using KokkosKernels::Impl::transpose_matrix;
27: #endif
29: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
31: /* MatAssemblyEnd_SeqAIJKokkos() is called once the nonzeros of the matrix have been finalized, either after
32: we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
33: In the latter case, it is important to set a_dual's sync state correctly.
34: */
35: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36: {
37: Mat_SeqAIJ *aijseq;
38: Mat_SeqAIJKokkos *aijkok;
40: PetscFunctionBegin;
41: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
42: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
44: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
47: /* If aijkok does not exist, we just copy i, j to device.
48: If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
49: In both cases, we build a new aijkok structure.
50: */
51: if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52: delete aijkok;
53: aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54: A->spptr = aijkok;
55: }
57: if (aijkok->device_mat_d.data()) {
58: A->offloadmask = PETSC_OFFLOAD_GPU; // once in GPU mode, there is no going back; MatSetValues() checks this
59: }
60: PetscFunctionReturn(PETSC_SUCCESS);
61: }
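/* Usage sketch (illustrative only; sizes, values, and variable names are made up): a typical host-side
   assembly that ends in MatAssemblyEnd_SeqAIJKokkos(), which then mirrors the finalized i/j structure
   to the device:

     Mat      A;
     PetscInt n = 10;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
     PetscCall(MatSeqAIJSetPreallocation(A, 1, NULL));
     for (PetscInt i = 0; i < n; i++) PetscCall(MatSetValue(A, i, i, 2.0, INSERT_VALUES));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); // builds (or rebuilds) the Mat_SeqAIJKokkos mirror
*/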
63: /* Sync CSR data to device if not yet */
64: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65: {
66: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
68: PetscFunctionBegin;
69: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cannot sync factorized matrix from host to device");
70: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71: if (aijkok->a_dual.need_sync_device()) {
72: aijkok->a_dual.sync_device();
73: aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out-of-date */
74: aijkok->hermitian_updated = PETSC_FALSE;
75: }
76: PetscFunctionReturn(PETSC_SUCCESS);
77: }
79: /* Mark the CSR data on device as modified */
80: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81: {
82: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
84: PetscFunctionBegin;
85: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
86: aijkok->a_dual.clear_sync_state();
87: aijkok->a_dual.modify_device();
88: aijkok->transpose_updated = PETSC_FALSE;
89: aijkok->hermitian_updated = PETSC_FALSE;
90: PetscCall(MatSeqAIJInvalidateDiagonal(A));
91: PetscCall(PetscObjectStateIncrease((PetscObject)A));
92: PetscFunctionReturn(PETSC_SUCCESS);
93: }
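/* The Sync/Modify helpers above follow the standard Kokkos::DualView protocol. A minimal,
   PETSc-independent sketch of that protocol (variable names are illustrative):

     Kokkos::DualView<PetscScalar *> a("a", 100);
     a.view_host()(0) = 1.0; // write on host ...
     a.modify_host();        // ... and mark the host copy as modified
     a.sync_device();        // deep-copies host -> device only if the device copy is stale
     // after a device kernel writes through a.view_device(), call a.modify_device(),
     // then a.sync_host() before reading the values on host again
*/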
95: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96: {
97: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
99: PetscFunctionBegin;
100: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
101: /* We do not expect one needs factors on host */
102: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cannot sync factorized matrix from device to host");
103: PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
104: aijkok->a_dual.sync_host();
105: PetscFunctionReturn(PETSC_SUCCESS);
106: }
108: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
109: {
110: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
112: PetscFunctionBegin;
113: /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
114: Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
115: reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
116: must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117: */
118: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119: aijkok->a_dual.sync_host();
120: *array = aijkok->a_dual.view_host().data();
121: } else { /* Happens when calling MatSetValues on a newly created matrix */
122: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
123: }
124: PetscFunctionReturn(PETSC_SUCCESS);
125: }
127: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
128: {
129: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
131: PetscFunctionBegin;
132: if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133: PetscFunctionReturn(PETSC_SUCCESS);
134: }
136: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
137: {
138: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
140: PetscFunctionBegin;
141: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142: aijkok->a_dual.sync_host();
143: *array = aijkok->a_dual.view_host().data();
144: } else {
145: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
146: }
147: PetscFunctionReturn(PETSC_SUCCESS);
148: }
150: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
151: {
152: PetscFunctionBegin;
153: *array = NULL;
154: PetscFunctionReturn(PETSC_SUCCESS);
155: }
157: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
158: {
159: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
161: PetscFunctionBegin;
162: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163: *array = aijkok->a_dual.view_host().data();
164: } else { /* E.g., happens with MatZeroEntries on a preallocated but unassembled matrix */
165: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166: }
167: PetscFunctionReturn(PETSC_SUCCESS);
168: }
170: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
171: {
172: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
174: PetscFunctionBegin;
175: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176: aijkok->a_dual.clear_sync_state();
177: aijkok->a_dual.modify_host();
178: }
179: PetscFunctionReturn(PETSC_SUCCESS);
180: }
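/* Usage sketch (illustrative only): the host-array accessors above are reached through the
   generic MatSeqAIJGetArray()/MatSeqAIJRestoreArray() interface, which keeps the DualView
   sync flags consistent:

     PetscScalar *a;
     PetscCall(MatSeqAIJGetArray(A, &a));     // syncs values to host if needed
     a[0] *= 2.0;                             // modify on host
     PetscCall(MatSeqAIJRestoreArray(A, &a)); // marks the host copy as modified
*/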
182: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
183: {
184: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
186: PetscFunctionBegin;
187: PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
189: if (i) *i = aijkok->i_device_data();
190: if (j) *j = aijkok->j_device_data();
191: if (a) {
192: aijkok->a_dual.sync_device();
193: *a = aijkok->a_device_data();
194: }
195: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
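/* Usage sketch (illustrative only): raw device CSR pointers can be obtained through the
   interface routine backed by the function above:

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
     // mtype is PETSC_MEMTYPE_KOKKOS and i/j/a point to device memory
*/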
199: // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note: the "deep_copy" here is effectively a shallow copy, since only the struct (not the arrays it points to) is copied
200: PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201: {
202: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203: Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
205: PetscFunctionBegin;
206: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207: aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
208: Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
209: PetscFunctionReturn(PETSC_SUCCESS);
210: }
212: // MatSeqAIJKokkosGetDeviceMat gets the device matrix if it is present; otherwise it creates a place for it and returns NULL
213: PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214: {
215: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
217: PetscFunctionBegin;
218: if (aijkok && aijkok->device_mat_d.data()) {
219: *d_mat = aijkok->device_mat_d.data();
220: } else {
221: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222: *d_mat = NULL;
223: }
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
227: /* Generate the transpose on device and cache it internally */
228: static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
229: {
230: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
232: PetscFunctionBegin;
233: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
234: if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
235: /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
236: PetscCallCXX(aijkok->a_dual.sync_device());
237: PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat));
238: PetscCallCXX(sort_crs_matrix(aijkok->csrmatT));
239: aijkok->transpose_updated = PETSC_TRUE;
240: }
241: *csrmatT = &aijkok->csrmatT;
242: PetscFunctionReturn(PETSC_SUCCESS);
243: }
245: /* Generate the Hermitian on device and cache it internally */
246: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
247: {
248: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
250: PetscFunctionBegin;
251: PetscCall(PetscLogGpuTimeBegin());
252: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
253: if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
254: PetscCallCXX(aijkok->a_dual.sync_device());
255: PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat));
256: PetscCallCXX(sort_crs_matrix(aijkok->csrmatH));
257: #if defined(PETSC_USE_COMPLEX)
258: const auto &a = aijkok->csrmatH.values;
259: Kokkos::parallel_for(
260: a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); });
261: #endif
262: aijkok->hermitian_updated = PETSC_TRUE;
263: }
264: *csrmatH = &aijkok->csrmatH;
265: PetscCall(PetscLogGpuTimeEnd());
266: PetscFunctionReturn(PETSC_SUCCESS);
267: }
269: /* y = A x */
270: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
271: {
272: Mat_SeqAIJKokkos *aijkok;
273: ConstPetscScalarKokkosView xv;
274: PetscScalarKokkosView yv;
276: PetscFunctionBegin;
277: PetscCall(PetscLogGpuTimeBegin());
278: PetscCall(MatSeqAIJKokkosSyncDevice(A));
279: PetscCall(VecGetKokkosView(xx, &xv));
280: PetscCall(VecGetKokkosViewWrite(yy, &yv));
281: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
282: PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
283: PetscCall(VecRestoreKokkosView(xx, &xv));
284: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
285: /* 2.0*nnz - numRows would be more accurate, but it assumes there are no empty rows; so we are a little sloppy here. */
286: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
287: PetscCall(PetscLogGpuTimeEnd());
288: PetscFunctionReturn(PETSC_SUCCESS);
289: }
291: /* y = A^T x */
292: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
293: {
294: Mat_SeqAIJKokkos *aijkok;
295: const char *mode;
296: ConstPetscScalarKokkosView xv;
297: PetscScalarKokkosView yv;
298: KokkosCsrMatrix *csrmat;
300: PetscFunctionBegin;
301: PetscCall(PetscLogGpuTimeBegin());
302: PetscCall(MatSeqAIJKokkosSyncDevice(A));
303: PetscCall(VecGetKokkosView(xx, &xv));
304: PetscCall(VecGetKokkosViewWrite(yy, &yv));
305: if (A->form_explicit_transpose) {
306: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
307: mode = "N";
308: } else {
309: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
310: csrmat = &aijkok->csrmat;
311: mode = "T";
312: }
313: PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
314: PetscCall(VecRestoreKokkosView(xx, &xv));
315: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
316: PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
317: PetscCall(PetscLogGpuTimeEnd());
318: PetscFunctionReturn(PETSC_SUCCESS);
319: }
321: /* y = A^H x */
322: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
323: {
324: Mat_SeqAIJKokkos *aijkok;
325: const char *mode;
326: ConstPetscScalarKokkosView xv;
327: PetscScalarKokkosView yv;
328: KokkosCsrMatrix *csrmat;
330: PetscFunctionBegin;
331: PetscCall(PetscLogGpuTimeBegin());
332: PetscCall(MatSeqAIJKokkosSyncDevice(A));
333: PetscCall(VecGetKokkosView(xx, &xv));
334: PetscCall(VecGetKokkosViewWrite(yy, &yv));
335: if (A->form_explicit_transpose) {
336: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
337: mode = "N";
338: } else {
339: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
340: csrmat = &aijkok->csrmat;
341: mode = "C";
342: }
343: PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
344: PetscCall(VecRestoreKokkosView(xx, &xv));
345: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
346: PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
347: PetscCall(PetscLogGpuTimeEnd());
348: PetscFunctionReturn(PETSC_SUCCESS);
349: }
351: /* z = A x + y */
352: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
353: {
354: Mat_SeqAIJKokkos *aijkok;
355: ConstPetscScalarKokkosView xv, yv;
356: PetscScalarKokkosView zv;
358: PetscFunctionBegin;
359: PetscCall(PetscLogGpuTimeBegin());
360: PetscCall(MatSeqAIJKokkosSyncDevice(A));
361: PetscCall(VecGetKokkosView(xx, &xv));
362: PetscCall(VecGetKokkosView(yy, &yv));
363: PetscCall(VecGetKokkosViewWrite(zz, &zv));
364: if (zz != yy) Kokkos::deep_copy(zv, yv);
365: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
366: PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
367: PetscCall(VecRestoreKokkosView(xx, &xv));
368: PetscCall(VecRestoreKokkosView(yy, &yv));
369: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
370: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
371: PetscCall(PetscLogGpuTimeEnd());
372: PetscFunctionReturn(PETSC_SUCCESS);
373: }
375: /* z = A^T x + y */
376: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
377: {
378: Mat_SeqAIJKokkos *aijkok;
379: const char *mode;
380: ConstPetscScalarKokkosView xv, yv;
381: PetscScalarKokkosView zv;
382: KokkosCsrMatrix *csrmat;
384: PetscFunctionBegin;
385: PetscCall(PetscLogGpuTimeBegin());
386: PetscCall(MatSeqAIJKokkosSyncDevice(A));
387: PetscCall(VecGetKokkosView(xx, &xv));
388: PetscCall(VecGetKokkosView(yy, &yv));
389: PetscCall(VecGetKokkosViewWrite(zz, &zv));
390: if (zz != yy) Kokkos::deep_copy(zv, yv);
391: if (A->form_explicit_transpose) {
392: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
393: mode = "N";
394: } else {
395: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
396: csrmat = &aijkok->csrmat;
397: mode = "T";
398: }
399: PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
400: PetscCall(VecRestoreKokkosView(xx, &xv));
401: PetscCall(VecRestoreKokkosView(yy, &yv));
402: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
403: PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
404: PetscCall(PetscLogGpuTimeEnd());
405: PetscFunctionReturn(PETSC_SUCCESS);
406: }
408: /* z = A^H x + y */
409: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
410: {
411: Mat_SeqAIJKokkos *aijkok;
412: const char *mode;
413: ConstPetscScalarKokkosView xv, yv;
414: PetscScalarKokkosView zv;
415: KokkosCsrMatrix *csrmat;
417: PetscFunctionBegin;
418: PetscCall(PetscLogGpuTimeBegin());
419: PetscCall(MatSeqAIJKokkosSyncDevice(A));
420: PetscCall(VecGetKokkosView(xx, &xv));
421: PetscCall(VecGetKokkosView(yy, &yv));
422: PetscCall(VecGetKokkosViewWrite(zz, &zv));
423: if (zz != yy) Kokkos::deep_copy(zv, yv);
424: if (A->form_explicit_transpose) {
425: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
426: mode = "N";
427: } else {
428: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
429: csrmat = &aijkok->csrmat;
430: mode = "C";
431: }
432: PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
433: PetscCall(VecRestoreKokkosView(xx, &xv));
434: PetscCall(VecRestoreKokkosView(yy, &yv));
435: PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
436: PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
437: PetscCall(PetscLogGpuTimeEnd());
438: PetscFunctionReturn(PETSC_SUCCESS);
439: }
441: PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
442: {
443: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
445: PetscFunctionBegin;
446: switch (op) {
447: case MAT_FORM_EXPLICIT_TRANSPOSE:
448: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
449: if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
450: A->form_explicit_transpose = flg;
451: break;
452: default:
453: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
454: break;
455: }
456: PetscFunctionReturn(PETSC_SUCCESS);
457: }
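/* Usage sketch (illustrative only): whether the transpose products use the cached explicit
   transpose (mode "N") or a transposed spmv (mode "T"/"C") is controlled with this option:

     PetscCall(MatSetOption(A, MAT_FORM_EXPLICIT_TRANSPOSE, PETSC_TRUE)); // cache A^T on device
     PetscCall(MatMultTranspose(A, x, y));                                // uses the cached A^T
*/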
459: /* Depending on reuse, either build a new mat, or use the existing mat */
460: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
461: {
462: Mat_SeqAIJ *aseq;
464: PetscFunctionBegin;
465: PetscCall(PetscKokkosInitializeCheck());
466: if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
467: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */
468: } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
469: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
470: } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
471: PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
472: PetscCall(PetscFree(A->defaultvectype));
473: PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
474: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
475: PetscCall(MatSetOps_SeqAIJKokkos(A));
476: aseq = static_cast<Mat_SeqAIJ *>(A->data);
477: if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
478: PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
479: A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
480: }
481: }
482: PetscFunctionReturn(PETSC_SUCCESS);
483: }
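/* Usage sketch (illustrative only): converting an existing MATSEQAIJ matrix A:

     Mat B;
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &B)); // new Kokkos copy of A
     // or, in place:
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
*/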
485: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
486: an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
487: */
488: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
489: {
490: Mat_SeqAIJ *bseq;
491: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
492: Mat mat;
494: PetscFunctionBegin;
495: /* Do not copy values on host as A's latest values might be on device. We don't want to sync blindly */
496: PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
497: mat = *B;
498: if (A->assembled) {
499: bseq = static_cast<Mat_SeqAIJ *>(mat->data);
500: bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
501: bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
502: /* Now copy values to B if needed */
503: if (dupOption == MAT_COPY_VALUES) {
504: if (akok->a_dual.need_sync_device()) {
505: Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
506: bkok->a_dual.modify_host();
507: } else { /* If device has the latest data, we only copy data on device */
508: Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
509: bkok->a_dual.modify_device();
510: }
511: } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
512: /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
513: bkok->a_dual.modify_host();
514: }
515: mat->spptr = bkok;
516: }
518: PetscCall(PetscFree(mat->defaultvectype));
519: PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
520: PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
521: PetscCall(MatSetOps_SeqAIJKokkos(mat));
522: PetscFunctionReturn(PETSC_SUCCESS);
523: }
525: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
526: {
527: Mat At;
528: KokkosCsrMatrix *internT;
529: Mat_SeqAIJKokkos *atkok, *bkok;
531: PetscFunctionBegin;
532: if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
533: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
534: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
535: /* Deep copy internT, as we want to isolate the internal transpose */
536: PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT)));
537: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
538: if (reuse == MAT_INITIAL_MATRIX) *B = At;
539: else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
540: } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
541: if ((*B)->assembled) {
542: bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
543: PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values));
544: PetscCall(MatSeqAIJKokkosModifyDevice(*B));
545: } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
546: Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
547: MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */
548: MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz());
549: PetscCallCXX(Kokkos::deep_copy(a_h, internT->values));
550: PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries));
551: } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
552: }
553: PetscFunctionReturn(PETSC_SUCCESS);
554: }
556: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
557: {
558: Mat_SeqAIJKokkos *aijkok;
560: PetscFunctionBegin;
561: if (A->factortype == MAT_FACTOR_NONE) {
562: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
563: delete aijkok;
564: } else {
565: delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
566: }
567: A->spptr = NULL;
568: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
569: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
570: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
571: PetscCall(MatDestroy_SeqAIJ(A));
572: PetscFunctionReturn(PETSC_SUCCESS);
573: }
575: /*MC
576: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
578: A matrix type using the Kokkos-Kernels CrsMatrix type for portability across different device types
580: Options Database Key:
581: . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
583: Level: beginner
585: .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
586: M*/
587: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
588: {
589: PetscFunctionBegin;
590: PetscCall(PetscKokkosInitializeCheck());
591: PetscCall(MatCreate_SeqAIJ(A));
592: PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
593: PetscFunctionReturn(PETSC_SUCCESS);
594: }
596: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
597: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
598: {
599: Mat_SeqAIJ *a, *b;
600: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
601: MatScalarKokkosView aa, ba, ca;
602: MatRowMapKokkosView ai, bi, ci;
603: MatColIdxKokkosView aj, bj, cj;
604: PetscInt m, n, nnz, aN;
606: PetscFunctionBegin;
610: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
611: PetscCheckTypeName(B, MATSEQAIJKOKKOS);
612: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
613: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
615: PetscCall(MatSeqAIJKokkosSyncDevice(A));
616: PetscCall(MatSeqAIJKokkosSyncDevice(B));
617: a = static_cast<Mat_SeqAIJ *>(A->data);
618: b = static_cast<Mat_SeqAIJ *>(B->data);
619: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
620: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
621: aa = akok->a_dual.view_device();
622: ai = akok->i_dual.view_device();
623: ba = bkok->a_dual.view_device();
624: bi = bkok->i_dual.view_device();
625: m = A->rmap->n; /* M, N and nnz of C */
626: n = A->cmap->n + B->cmap->n;
627: nnz = a->nz + b->nz;
628: aN = A->cmap->n; /* N of A */
629: if (reuse == MAT_INITIAL_MATRIX) {
630: aj = akok->j_dual.view_device();
631: bj = bkok->j_dual.view_device();
632: auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
633: auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
634: auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
635: ca = ca_dual.view_device();
636: ci = ci_dual.view_device();
637: cj = cj_dual.view_device();
639: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
640: Kokkos::parallel_for(
641: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
642: PetscInt i = t.league_rank(); /* row i */
643: PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
645: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
646: ci(i) = coffset;
647: if (i == m - 1) ci(m) = ai(m) + bi(m);
648: });
650: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
651: if (k < alen) {
652: ca(coffset + k) = aa(ai(i) + k);
653: cj(coffset + k) = aj(ai(i) + k);
654: } else {
655: ca(coffset + k) = ba(bi(i) + k - alen);
656: cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
657: }
658: });
659: });
660: ca_dual.modify_device();
661: ci_dual.modify_device();
662: cj_dual.modify_device();
663: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
664: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
665: } else if (reuse == MAT_REUSE_MATRIX) {
667: PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
668: ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
669: ca = ckok->a_dual.view_device();
670: ci = ckok->i_dual.view_device();
672: Kokkos::parallel_for(
673: Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
674: PetscInt i = t.league_rank(); /* row i */
675: PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
676: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
677: if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
678: else ca(ci(i) + k) = ba(bi(i) + k - alen);
679: });
680: });
681: PetscCall(MatSeqAIJKokkosModifyDevice(*C));
682: }
683: PetscFunctionReturn(PETSC_SUCCESS);
684: }
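/* Worked example (illustrative only): with a 2x2 A and a 2x1 B,

     A = [a00 a01]   B = [b00]          C = [a00 a01 b00]
         [  0 a11]       [b10]    ==>       [  0 a11 b10]

   i.e., C keeps A's columns first and B's column indices are shifted by A->cmap->n, exactly as
   the concatenation kernels above do row by row.
*/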
686: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
687: {
688: PetscFunctionBegin;
689: delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
690: PetscFunctionReturn(PETSC_SUCCESS);
691: }
693: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
694: {
695: Mat_Product *product = C->product;
696: Mat A, B;
697: bool transA, transB; /* use bool, since KK needs this type */
698: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
699: Mat_SeqAIJ *c;
700: MatProductData_SeqAIJKokkos *pdata;
701: KokkosCsrMatrix *csrmatA, *csrmatB;
703: PetscFunctionBegin;
704: MatCheckProduct(C, 1);
705: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
706: pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
708: if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
709: pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */
710: PetscFunctionReturn(PETSC_SUCCESS);
711: }
713: switch (product->type) {
714: case MATPRODUCT_AB:
715: transA = false;
716: transB = false;
717: break;
718: case MATPRODUCT_AtB:
719: transA = true;
720: transB = false;
721: break;
722: case MATPRODUCT_ABt:
723: transA = false;
724: transB = true;
725: break;
726: default:
727: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
728: }
730: A = product->A;
731: B = product->B;
732: PetscCall(MatSeqAIJKokkosSyncDevice(A));
733: PetscCall(MatSeqAIJKokkosSyncDevice(B));
734: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
735: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
736: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
738: PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
740: csrmatA = &akok->csrmat;
741: csrmatB = &bkok->csrmat;
743: /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
744: if (transA) {
745: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
746: transA = false;
747: }
749: if (transB) {
750: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
751: transB = false;
752: }
753: PetscCall(PetscLogGpuTimeBegin());
754: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat));
755: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99)
756: auto spgemmHandle = pdata->kh.get_spgemm_handle();
757: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
758: #endif
760: PetscCall(PetscLogGpuTimeEnd());
761: PetscCall(MatSeqAIJKokkosModifyDevice(C));
762: /* shorter version of MatAssemblyEnd_SeqAIJ */
763: c = (Mat_SeqAIJ *)C->data;
764: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
765: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
766: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
767: c->reallocs = 0;
768: C->info.mallocs = 0;
769: C->info.nz_unneeded = 0;
770: C->assembled = C->was_assembled = PETSC_TRUE;
771: C->num_ass++;
772: PetscFunctionReturn(PETSC_SUCCESS);
773: }
775: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
776: {
777: Mat_Product *product = C->product;
778: MatProductType ptype;
779: Mat A, B;
780: bool transA, transB;
781: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
782: MatProductData_SeqAIJKokkos *pdata;
783: MPI_Comm comm;
784: KokkosCsrMatrix *csrmatA, *csrmatB, csrmatC;
786: PetscFunctionBegin;
787: MatCheckProduct(C, 1);
788: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
789: PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
790: A = product->A;
791: B = product->B;
792: PetscCall(MatSeqAIJKokkosSyncDevice(A));
793: PetscCall(MatSeqAIJKokkosSyncDevice(B));
794: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
795: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
796: csrmatA = &akok->csrmat;
797: csrmatB = &bkok->csrmat;
799: ptype = product->type;
800: switch (ptype) {
801: case MATPRODUCT_AB:
802: transA = false;
803: transB = false;
804: break;
805: case MATPRODUCT_AtB:
806: transA = true;
807: transB = false;
808: break;
809: case MATPRODUCT_ABt:
810: transA = false;
811: transB = true;
812: break;
813: default:
814: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
815: }
817: product->data = pdata = new MatProductData_SeqAIJKokkos();
818: pdata->kh.set_team_work_size(16);
819: pdata->kh.set_dynamic_scheduling(true);
820: pdata->reusesym = product->api_user;
822: /* TODO: add command line options to select spgemm algorithms */
823: auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
825: /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
826: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
827: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
828: spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
829: #endif
830: #endif
832: pdata->kh.create_spgemm_handle(spgemm_alg);
834: PetscCall(PetscLogGpuTimeBegin());
835: /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
836: if (transA) {
837: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
838: transA = false;
839: }
841: if (transB) {
842: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
843: transB = false;
844: }
846: PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
848: /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
849: So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
850: calling new Mat_SeqAIJKokkos().
851: TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
852: */
853: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
854: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99)
855: /* Query if KK outputs a sorted matrix. If not, we need to sort it */
856: auto spgemmHandle = pdata->kh.get_spgemm_handle();
857: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
858: #endif
859: PetscCall(PetscLogGpuTimeEnd());
861: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
862: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
863: C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
864: PetscFunctionReturn(PETSC_SUCCESS);
865: }
867: /* handles sparse matrix-matrix ops */
868: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
869: {
870: Mat_Product *product = mat->product;
871: PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
873: PetscFunctionBegin;
874: MatCheckProduct(mat, 1);
875: PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
876: if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
877: if (Biskok && Ciskok) {
878: switch (product->type) {
879: case MATPRODUCT_AB:
880: case MATPRODUCT_AtB:
881: case MATPRODUCT_ABt:
882: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
883: break;
884: case MATPRODUCT_PtAP:
885: case MATPRODUCT_RARt:
886: case MATPRODUCT_ABC:
887: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
888: break;
889: default:
890: break;
891: }
892: } else { /* fallback for AIJ */
893: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
894: }
895: PetscFunctionReturn(PETSC_SUCCESS);
896: }
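/* Usage sketch (illustrative only): the symbolic/numeric routines above are reached through the
   usual MatProduct interface, e.g.

     Mat C;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic + numeric on device
     PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // numeric only, reusing C's pattern
*/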
898: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
899: {
900: Mat_SeqAIJKokkos *aijkok;
902: PetscFunctionBegin;
903: PetscCall(PetscLogGpuTimeBegin());
904: PetscCall(MatSeqAIJKokkosSyncDevice(A));
905: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
906: KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
907: PetscCall(MatSeqAIJKokkosModifyDevice(A));
908: PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
909: PetscCall(PetscLogGpuTimeEnd());
910: PetscFunctionReturn(PETSC_SUCCESS);
911: }
913: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
914: {
915: Mat_SeqAIJKokkos *aijkok;
917: PetscFunctionBegin;
918: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
919: if (aijkok) { /* Only zero the device if data is already there */
920: KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
921: PetscCall(MatSeqAIJKokkosModifyDevice(A));
922: } else { /* Might be preallocated but not assembled */
923: PetscCall(MatZeroEntries_SeqAIJ(A));
924: }
925: PetscFunctionReturn(PETSC_SUCCESS);
926: }
928: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
929: {
930: Mat_SeqAIJ *aijseq;
931: Mat_SeqAIJKokkos *aijkok;
932: PetscInt n;
933: PetscScalarKokkosView xv;
935: PetscFunctionBegin;
936: PetscCall(VecGetLocalSize(x, &n));
937: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
938: PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
940: PetscCall(MatSeqAIJKokkosSyncDevice(A));
941: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
943: if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
944: PetscCall(MatMarkDiagonal_SeqAIJ(A));
945: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
946: aijkok->SetDiagonal(aijseq->diag);
947: }
949: const auto &Aa = aijkok->a_dual.view_device();
950: const auto &Ai = aijkok->i_dual.view_device();
951: const auto &Adiag = aijkok->diag_dual.view_device();
953: PetscCall(VecGetKokkosViewWrite(x, &xv));
954: Kokkos::parallel_for(
955: n, KOKKOS_LAMBDA(const PetscInt i) {
956: if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
957: else xv(i) = 0;
958: });
959: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
960: PetscFunctionReturn(PETSC_SUCCESS);
961: }
963: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
964: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
965: {
966: Mat_SeqAIJKokkos *aijkok;
968: PetscFunctionBegin;
971: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
972: PetscCall(MatSeqAIJKokkosSyncDevice(A));
973: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
974: *kv = aijkok->a_dual.view_device();
975: PetscFunctionReturn(PETSC_SUCCESS);
976: }
978: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
979: {
980: PetscFunctionBegin;
983: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
984: PetscFunctionReturn(PETSC_SUCCESS);
985: }
987: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
988: {
989: Mat_SeqAIJKokkos *aijkok;
991: PetscFunctionBegin;
994: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
995: PetscCall(MatSeqAIJKokkosSyncDevice(A));
996: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
997: *kv = aijkok->a_dual.view_device();
998: PetscFunctionReturn(PETSC_SUCCESS);
999: }
1001: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1002: {
1003: PetscFunctionBegin;
1006: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1007: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1008: PetscFunctionReturn(PETSC_SUCCESS);
1009: }
1011: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1012: {
1013: Mat_SeqAIJKokkos *aijkok;
1015: PetscFunctionBegin;
1018: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1019: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1020: *kv = aijkok->a_dual.view_device();
1021: PetscFunctionReturn(PETSC_SUCCESS);
1022: }
1024: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1025: {
1026: PetscFunctionBegin;
1029: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1030: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1031: PetscFunctionReturn(PETSC_SUCCESS);
1032: }
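/* Usage sketch (illustrative only): operating on the device values directly through the view
   accessors above:

     MatScalarKokkosView a;
     PetscCall(MatSeqAIJGetKokkosView(A, &a)); // values synced to device
     Kokkos::parallel_for(a.extent(0), KOKKOS_LAMBDA(const PetscInt k) { a(k) *= 2.0; });
     PetscCall(MatSeqAIJRestoreKokkosView(A, &a)); // marks device values as modified
*/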
1034: /* Computes Y += alpha X */
1035: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1036: {
1037: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1038: Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
1039: ConstMatScalarKokkosView Xa;
1040: MatScalarKokkosView Ya;
1042: PetscFunctionBegin;
1043: PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1044: PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1045: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1046: PetscCall(MatSeqAIJKokkosSyncDevice(X));
1047: PetscCall(PetscLogGpuTimeBegin());
1049: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1050: /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1051: PetscBool e;
1052: PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1053: if (e) {
1054: PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1055: if (e) pattern = SAME_NONZERO_PATTERN;
1056: }
1057: }
1059: /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
1060: cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1061: If X is a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1062: KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, whereas MatAXPY does not.
1063: */
1064: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1065: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1066: Xa = xkok->a_dual.view_device();
1067: Ya = ykok->a_dual.view_device();
1069: if (pattern == SAME_NONZERO_PATTERN) {
1070: KokkosBlas::axpy(alpha, Xa, Ya);
1071: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1072: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1073: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1074: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1076: Kokkos::parallel_for(
1077: Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1078: PetscInt i = t.league_rank(); /* row i */
1079: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
1080: PetscInt p, q = Yi(i);
1081: for (p = Xi(i); p < Xi(i + 1); p++) { /* For each nonzero on row i of X */
1082: while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */
1083: if (Xj(p) == Yj(q)) { /* Found it */
1084: Ya(q) += alpha * Xa(p);
1085: q++;
1086: } else {
1087: /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1088: Just insert a NaN at the beginning of row i if it is not empty, to make the result obviously wrong.
1089: */
1090: if (Yi(i) != Yi(i + 1))
1091: Ya(Yi(i)) =
1092: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99)
1093: Kokkos::ArithTraits<PetscScalar>::nan();
1094: #else
1095: Kokkos::Experimental::nan("1");
1096: #endif
1097: }
1098: }
1099: });
1100: });
1101: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1102: } else { /* different nonzero patterns */
1103: Mat Z;
1104: KokkosCsrMatrix zcsr;
1105: KernelHandle kh;
1106: kh.create_spadd_handle(false);
1107: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1108: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1109: zkok = new Mat_SeqAIJKokkos(zcsr);
1110: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1111: PetscCall(MatHeaderReplace(Y, &Z));
1112: kh.destroy_spadd_handle();
1113: }
1114: PetscCall(PetscLogGpuTimeEnd());
1115: PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */
1116: PetscFunctionReturn(PETSC_SUCCESS);
1117: }
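/* Usage sketch (illustrative only): MatAXPY(Y, alpha, X, pattern) selects one of the three
   branches above based on the pattern argument:

     PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN));      // device axpy on the values only
     PetscCall(MatAXPY(Y, 2.0, X, SUBSET_NONZERO_PATTERN));    // per-row merge kernel
     PetscCall(MatAXPY(Y, 2.0, X, DIFFERENT_NONZERO_PATTERN)); // spadd; Y's structure is replaced
*/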
1119: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1120: {
1121: Mat_SeqAIJKokkos *akok;
1122: Mat_SeqAIJ *aseq;
1124: PetscFunctionBegin;
1125: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1126: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1127: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1128: delete akok;
1129: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1130: PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1131: akok->SetUpCOO(aseq);
1132: PetscFunctionReturn(PETSC_SUCCESS);
1133: }
1135: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1136: {
1137: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1138: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1139: PetscCount Annz = aseq->nz;
1140: const PetscCountKokkosView &jmap = akok->jmap_d;
1141: const PetscCountKokkosView &perm = akok->perm_d;
1142: MatScalarKokkosView Aa;
1143: ConstMatScalarKokkosView kv;
1144: PetscMemType memtype;
1146: PetscFunctionBegin;
1147: PetscCall(PetscGetMemType(v, &memtype));
1148: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, we may need to copy it to device */
1149: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1150: } else {
1151: kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1152: }
1154: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1155: else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */
1157: Kokkos::parallel_for(
1158: Annz, KOKKOS_LAMBDA(const PetscCount i) {
1159: PetscScalar sum = 0.0;
1160: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1161: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1162: });
1164: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1165: else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1166: PetscFunctionReturn(PETSC_SUCCESS);
1167: }
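/* Usage sketch (illustrative only; indices and values are made up): COO assembly through the
   two routines above:

     PetscInt    coo_i[] = {0, 0, 1, 2};
     PetscInt    coo_j[] = {0, 1, 1, 2};
     PetscScalar v[]     = {1.0, 2.0, 3.0, 4.0};
     PetscCall(MatSetPreallocationCOO(A, 4, coo_i, coo_j)); // builds the CSR pattern plus jmap/perm
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES));       // v may live in host or device memory
*/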
1169: PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1170: {
1171: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1172: MatScalarKokkosView Aa;
1173: const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1174: PetscInt m = A->rmap->n;
1175: ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */
1177: PetscFunctionBegin;
1178: PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1179: Kokkos::parallel_for(
1180: m, KOKKOS_LAMBDA(const PetscInt i) {
1181: PetscScalar tmp;
1182: if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1183: tmp = Aa(Ai(i));
1184: Aa(Ai(i)) = Aa(Adiag(i));
1185: Aa(Adiag(i)) = tmp;
1186: }
1187: });
1188: PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1189: PetscFunctionReturn(PETSC_SUCCESS);
1190: }
1192: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1193: {
1194: PetscFunctionBegin;
1195: PetscCall(MatSeqAIJKokkosSyncHost(A));
1196: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1197: B->offloadmask = PETSC_OFFLOAD_CPU;
1198: PetscFunctionReturn(PETSC_SUCCESS);
1199: }
1201: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1202: {
1203: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1205: PetscFunctionBegin;
1206: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1207: A->boundtocpu = PETSC_FALSE;
1209: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1210: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1211: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1212: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1213: A->ops->scale = MatScale_SeqAIJKokkos;
1214: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1215: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1216: A->ops->mult = MatMult_SeqAIJKokkos;
1217: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1218: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1219: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1220: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1221: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1222: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1223: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1224: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1225: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1226: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1227: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1228: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1229: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1230: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1231: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1232: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1234: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1235: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1236: PetscFunctionReturn(PETSC_SUCCESS);
1237: }
1239: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1240: {
1241: Mat_SeqAIJ *aseq;
1242: PetscInt i, m, n;
1244: PetscFunctionBegin;
1245: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1247: m = akok->nrows();
1248: n = akok->ncols();
1249: PetscCall(MatSetSizes(A, m, n, m, n));
1250: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1252: /* Set up data structures of A as a MATSEQAIJ */
1253: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1254: aseq = (Mat_SeqAIJ *)(A)->data;
1256: akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1257: akok->j_dual.sync_host();
1259: aseq->i = akok->i_host_data();
1260: aseq->j = akok->j_host_data();
1261: aseq->a = akok->a_host_data();
1262: aseq->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1263: aseq->singlemalloc = PETSC_FALSE;
1264: aseq->free_a = PETSC_FALSE;
1265: aseq->free_ij = PETSC_FALSE;
1266: aseq->nz = akok->nnz();
1267: aseq->maxnz = aseq->nz;
1269: PetscCall(PetscMalloc1(m, &aseq->imax));
1270: PetscCall(PetscMalloc1(m, &aseq->ilen));
1271: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1273: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1274: akok->nonzerostate = A->nonzerostate;
1275: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1276: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1277: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1278: PetscFunctionReturn(PETSC_SUCCESS);
1279: }
1281: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1283: Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1284: */
1285: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1286: {
1287: PetscFunctionBegin;
1288: PetscCall(MatCreate(comm, A));
1289: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1290: PetscFunctionReturn(PETSC_SUCCESS);
1291: }
1293: /*@C
1294: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1295: (a Kokkos-backed version of the default AIJ format). This matrix will ultimately be handled by
1296: Kokkos for calculations. For good matrix
1297: assembly performance the user should preallocate the matrix storage by setting
1298: the parameter nz (or the array nnz). By setting these parameters accurately,
1299: performance during matrix assembly can be increased by more than a factor of 50.
1301: Collective
1303: Input Parameters:
1304: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1305: . m - number of rows
1306: . n - number of columns
1307: . nz - number of nonzeros per row (same for all rows)
1308: - nnz - array containing the number of nonzeros in the various rows
1309: (possibly different for each row) or `NULL`
1311: Output Parameter:
1312: . A - the matrix
1314: Level: intermediate
1316: Notes:
1317: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1318: MatXXXXSetPreallocation() paradigm instead of this routine directly.
1319: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1321: If `nnz` is given then `nz` is ignored
1323: The AIJ format, also called
1324: compressed row storage, is fully compatible with standard Fortran
1325: storage. That is, the stored row and column indices can begin at
1326: either one (as in Fortran) or zero. See the users' manual for details.
1328: Specify the preallocated storage with either `nz` or `nnz` (not both).
1329: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1330: allocation.
1332: By default, this format uses inodes (identical nodes) when possible, to
1333: improve numerical efficiency of matrix-vector products and solves. We
1334: search for consecutive rows with the same nonzero structure, thereby
1335: reusing matrix information to achieve increased efficiency.
1337: .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1338: @*/
1339: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1340: {
1341: PetscFunctionBegin;
1342: PetscCall(PetscKokkosInitializeCheck());
1343: PetscCall(MatCreate(comm, A));
1344: PetscCall(MatSetSizes(*A, m, n, m, n));
1345: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1346: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1347: PetscFunctionReturn(PETSC_SUCCESS);
1348: }
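/* Usage sketch (illustrative only; sizes are made up):

     Mat A;
     PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, 100, 100, 5, NULL, &A)); // ~5 nonzeros per row
     // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as usual ...
     PetscCall(MatDestroy(&A));
*/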
1350: typedef Kokkos::TeamPolicy<>::member_type team_member;
1351: //
1352: // This factorization exploits block diagonal matrices with "Nf" (not used).
1353: // Use -pc_factor_mat_ordering_type rcm to order and decouple blocks of size N/Nf for this optimization
1354: //
1355: static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1356: {
1357: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1358: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1359: IS isrow = b->row, isicol = b->icol;
1360: const PetscInt *r_h, *ic_h;
1361: const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1362: const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1363: PetscScalar *ba_d = baijkok->a_dual.view_device().data();
1364: PetscBool row_identity, col_identity;
1365: PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1367: PetscFunctionBegin;
1368: PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "only square matrices are supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n);
1369: PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1370: PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "only structurally symmetric matrices are supported");
1371: PetscCall(ISGetIndices(isrow, &r_h));
1372: PetscCall(ISGetIndices(isicol, &ic_h));
1373: PetscCall(ISGetSize(isicol, &nc));
1374: PetscCall(PetscLogGpuTimeBegin());
1375: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1376: {
1377: #define KOKKOS_SHARED_LEVEL 1
1378: using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1379: using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1380: using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1381: const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1382: Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n);
1383: const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1384: Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc);
1385:     size_t             flops_h = 0;
1386: Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h);
1387: Kokkos::View<size_t> d_flops_k("flops");
1388:     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // team_size * nVec = 16*32 = 512 work items per team
1389:     const int nloc = n / Nf, Ni = 1; // TODO: use multiple teams (SMs) per field, which requires a league barrier
1390: Kokkos::deep_copy(d_flops_k, h_flops_k);
1391: Kokkos::deep_copy(d_r_k, h_r_k);
1392: Kokkos::deep_copy(d_ic_k, h_ic_k);
1393: // Fill A --> fact
1394: Kokkos::parallel_for(
1395: Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1396: const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1397: const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1398: const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1399: // zero rows of B
1400: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1401: PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1402: PetscScalar *baL = ba_d + bi_d[rowb];
1403: PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1404: /* zero (unfactored row) */
1405: for (int j = 0; j < nzbL; j++) baL[j] = 0;
1406: for (int j = 0; j < nzbU; j++) baU[j] = 0;
1407: });
1408: // copy A into B
1409: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1410: PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1411: const PetscScalar *av = aa_d + ai_d[rowa];
1412: const PetscInt *ajtmp = aj_d + ai_d[rowa];
1413: /* load in initial (unfactored row) */
1414: for (int j = 0; j < nza; j++) {
1415: PetscInt colb = ic[ajtmp[j]];
1416: PetscScalar vala = av[j];
1417: if (colb == rowb) {
1418: *(ba_d + bdiag_d[rowb]) = vala;
1419: } else {
1420: const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1421: PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1422: PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1423:             for (int k = 0; k < nz; k++) { /* k avoids shadowing the outer loop index j */
1424:               if (pbj[k] == colb) {
1425:                 pba[k] = vala;
1426: set++;
1427: break;
1428: }
1429: }
1430: #if !defined(PETSC_HAVE_SYCL)
1431:             if (set != 1) printf("ERROR: MatLUFactorNumeric_SeqAIJKOKKOSDEVICE failed to find B(%d,%d) while copying A into B\n", (int)rowb, (int)colb);
1432: #endif
1433: }
1434: }
1435: });
1436: });
1437: Kokkos::fence();
1439: Kokkos::parallel_for(
1440: Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1441: sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1442: scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1443: sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1444: const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1445: const PetscInt start = field * nloc, end = start + nloc;
1446: Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1447: // A22 panel update for each row A(1,:) and col A(:,1)
1448: for (int ii = start; ii < end - 1; ii++) {
1449: const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1450: const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end)
1451: const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni);
1452: const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1453: Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1454: PetscInt kIdx = j * Ni + field_block_idx;
1455: if (kIdx >= nzUi) /* void */
1456: ;
1457: else {
1458:               const PetscInt    myk = bjUi[kIdx]; // assumes a symmetric structure; transposed metadata would be needed in general
1459: const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1460: const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1461: size_t st_idx;
1462:               // find L(k,i) and compute L(k,i) = A(k,i) / A(i,i)
1463:               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1464:               // locate column ii within row k; there should be a better way to do this
1465: Kokkos::parallel_reduce(
1466: Kokkos::ThreadVectorRange(team, nzL),
1467: [&](const int &j, size_t &idx) {
1468: if (pjL[j] == ii) {
1469: PetscScalar *pLki = ba_d + bi_d[myk] + j;
1470: idx = j; // output
1471: *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i)
1472: }
1473: },
1474: st_idx);
1475: Kokkos::single(Kokkos::PerThread(team), [=]() {
1476: colkIdx() = st_idx;
1477: L_ki() = *(ba_d + bi_d[myk] + st_idx);
1478: });
1479: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1480: if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1481: #endif
1482:               // active row k: A_kj -= L_ki * U_ij, for j in U(i,:), j != i
1483: // U(i+1,:end)
1484: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1485: PetscScalar Uij = baUi[uiIdx];
1486: PetscInt col = bjUi[uiIdx];
1487: if (col == myk) {
1488: // A_kk = A_kk - L_ki * U_ij(k)
1489: PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1490: *Akkv = *Akkv - L_ki() * Uij; // UiK
1491: } else {
1492: PetscScalar *start, *end, *pAkjv = NULL;
1493: PetscInt high, low;
1494: const PetscInt *startj;
1495: if (col < myk) { // L
1496: PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1497: PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1498: start = pLki + 1; // start at pLki+1, A22(myk,1)
1499: startj = bj_d + bi_d[myk] + idx;
1500: end = ba_d + bi_d[myk + 1];
1501: } else {
1502: PetscInt idx = bdiag_d[myk + 1] + 1;
1503: start = ba_d + idx;
1504: startj = bj_d + idx;
1505: end = ba_d + bdiag_d[myk];
1506: }
1507:                   // search for 'col': bisect to narrow the range, then finish with a short linear scan (TODO: use a full binary search)
1508: low = 0;
1509: high = (PetscInt)(end - start);
1510: while (high - low > 5) {
1511: int t = (low + high) / 2;
1512: if (startj[t] > col) high = t;
1513: else low = t;
1514: }
1515: for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1516: if (startj[pAkjv - start] == col) break;
1517: }
1518: #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1519: if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1520: #endif
1521: *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1522: }
1523: });
1524: }
1525: });
1526:           team.team_barrier(); // this needs to be a league barrier to use more than one SM per block
1527: if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1528:         } /* end of for (ii = start; ii < end - 1; ii++) */
1529: Kokkos::single(Kokkos::PerTeam(team), [=]() {
1530: Kokkos::atomic_add(&d_flops_k(), flops());
1531: flops() = 0;
1532: });
1533: });
1534: Kokkos::fence();
1535: Kokkos::deep_copy(h_flops_k, d_flops_k);
1536: PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1537: Kokkos::parallel_for(
1538: Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1539: const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni;
1540: const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1541: /* Invert diagonal for simpler triangular solves */
1542: Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1543: int i = start + outer_index * Ni + lg_rank % Ni;
1544: if (i < end) {
1545: PetscScalar *pv = ba_d + bdiag_d[i];
1546: *pv = 1.0 / (*pv);
1547: }
1548: });
1549: });
1550: }
1551: PetscCall(PetscLogGpuTimeEnd());
1552: PetscCall(ISRestoreIndices(isicol, &ic_h));
1553: PetscCall(ISRestoreIndices(isrow, &r_h));
1555: PetscCall(ISIdentity(isrow, &row_identity));
1556: PetscCall(ISIdentity(isicol, &col_identity));
1557: if (b->inode.size) {
1558: B->ops->solve = MatSolve_SeqAIJ_Inode;
1559: } else if (row_identity && col_identity) {
1560: B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1561: } else {
1562: B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1563: }
1564: B->offloadmask = PETSC_OFFLOAD_GPU;
1565: PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1566: B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this
1567: B->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1568: B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1569: B->ops->matsolve = MatMatSolve_SeqAIJ;
1570: B->assembled = PETSC_TRUE;
1571: B->preallocated = PETSC_TRUE;
1573: PetscFunctionReturn(PETSC_SUCCESS);
1574: }
1576: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1577: {
1578: PetscFunctionBegin;
1579: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1580: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1581: PetscFunctionReturn(PETSC_SUCCESS);
1582: }
1584: static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1585: {
1586: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1588: PetscFunctionBegin;
1589: if (!factors->sptrsv_symbolic_completed) {
1590: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1591: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1592: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1593: }
1594: PetscFunctionReturn(PETSC_SUCCESS);
1595: }
1597: /* Check if we need to update factors etc for transpose solve */
1598: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1599: {
1600: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1601: MatColIdxType n = A->rmap->n;
1603: PetscFunctionBegin;
1604: if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1605: /* Update L^T and do sptrsv symbolic */
1606: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1607:     Kokkos::deep_copy(factors->iLt_d, 0); /* transpose_matrix() requires a zero-initialized row map */
1608: factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1609: factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1611: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1612: factors->iLt_d, factors->jLt_d, factors->aLt_d);
1614:     /* TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices.
1615:        We have to sort the indices ourselves until KK provides finer control options.
1616:     */
1617: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1619: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1621: /* Update U^T and do sptrsv symbolic */
1622: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1623:     Kokkos::deep_copy(factors->iUt_d, 0); /* transpose_matrix() requires a zero-initialized row map */
1624: factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1625: factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1627: transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1628: factors->iUt_d, factors->jUt_d, factors->aUt_d);
1630: /* Sort indices. See comments above */
1631: sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1633: KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1634: factors->transpose_updated = PETSC_TRUE;
1635: }
1636: PetscFunctionReturn(PETSC_SUCCESS);
1637: }
1639: /* Solve Ax = b, with A = LU */
1640: static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1641: {
1642: ConstPetscScalarKokkosView bv;
1643: PetscScalarKokkosView xv;
1644: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1646: PetscFunctionBegin;
1647: PetscCall(PetscLogGpuTimeBegin());
1648: PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1649: PetscCall(VecGetKokkosView(b, &bv));
1650: PetscCall(VecGetKokkosViewWrite(x, &xv));
1651: /* Solve L tmpv = b */
1652: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1653: /* Solve Ux = tmpv */
1654: PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1655: PetscCall(VecRestoreKokkosView(b, &bv));
1656: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1657: PetscCall(PetscLogGpuTimeEnd());
1658: PetscFunctionReturn(PETSC_SUCCESS);
1659: }
1661: /* Solve A^T x = b, where A^T = U^T L^T */
1662: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1663: {
1664: ConstPetscScalarKokkosView bv;
1665: PetscScalarKokkosView xv;
1666: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1668: PetscFunctionBegin;
1669: PetscCall(PetscLogGpuTimeBegin());
1670: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1671: PetscCall(VecGetKokkosView(b, &bv));
1672: PetscCall(VecGetKokkosViewWrite(x, &xv));
1673: /* Solve U^T tmpv = b */
1674:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector));
1676:   /* Solve L^T x = tmpv */
1677:   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv));
1678: PetscCall(VecRestoreKokkosView(b, &bv));
1679: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1680: PetscCall(PetscLogGpuTimeEnd());
1681: PetscFunctionReturn(PETSC_SUCCESS);
1682: }
1684: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1685: {
1686: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1687: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1688: PetscInt fill_lev = info->levels;
1690: PetscFunctionBegin;
1691: PetscCall(PetscLogGpuTimeBegin());
1692: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1694: auto a_d = aijkok->a_dual.view_device();
1695: auto i_d = aijkok->i_dual.view_device();
1696: auto j_d = aijkok->j_dual.view_device();
1698: KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1700: B->assembled = PETSC_TRUE;
1701: B->preallocated = PETSC_TRUE;
1702: B->ops->solve = MatSolve_SeqAIJKokkos;
1703: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos;
1704: B->ops->matsolve = NULL;
1705: B->ops->matsolvetranspose = NULL;
1706: B->offloadmask = PETSC_OFFLOAD_GPU;
1708: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1709: factors->transpose_updated = PETSC_FALSE;
1710: factors->sptrsv_symbolic_completed = PETSC_FALSE;
1711:   /* TODO: log GPU flops; the flop count performed by spiluk_numeric() is not readily available */
1712: PetscCall(PetscLogGpuTimeEnd());
1713: PetscFunctionReturn(PETSC_SUCCESS);
1714: }
1716: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1717: {
1718: Mat_SeqAIJKokkos *aijkok;
1719: Mat_SeqAIJ *b;
1720: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1721: PetscInt fill_lev = info->levels;
1722: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1723: PetscInt n = A->rmap->n;
1725: PetscFunctionBegin;
1726: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1727: /* Rebuild factors */
1728:   if (factors) { /* Destroy the old factors if they exist */
1729:     factors->Destroy();
1730:   } else {
1732:     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1733:   }
1735: /* Create a spiluk handle and then do symbolic factorization */
1736: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1737: factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1739: auto spiluk_handle = factors->kh.get_spiluk_handle();
1741: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1742: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1743: Kokkos::realloc(factors->iU_d, n + 1);
1744: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1746: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
1747: auto i_d = aijkok->i_dual.view_device();
1748: auto j_d = aijkok->j_dual.view_device();
1749: KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1750: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1752: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1753: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1754: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1755: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1757: /* TODO: add options to select sptrsv algorithms */
1758: /* Create sptrsv handles for L, U and their transpose */
1759: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1760: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1761: #else
1762: auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1763: #endif
1765: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1766: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1767: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1768: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1770: /* Fill fields of the factor matrix B */
1771: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1772: b = (Mat_SeqAIJ *)B->data;
1773: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1774: B->info.fill_ratio_given = info->fill;
1775: B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);
1777: B->offloadmask = PETSC_OFFLOAD_GPU;
1778: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1779: PetscFunctionReturn(PETSC_SUCCESS);
1780: }
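/*
   Hedged command-line sketch of exercising the ILU(k) path above; the solver name "kokkos" is assumed
   from the MATSOLVERKOKKOS registration below, and -pc_factor_levels feeds info->levels (fill_lev).

     ./app -mat_type aijkokkos -ksp_type gmres \
           -pc_type ilu -pc_factor_levels 1 \
           -pc_factor_mat_solver_type kokkos
*/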
1782: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1783: {
1784: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1785: const PetscInt nrows = A->rmap->n;
1787: PetscFunctionBegin;
1788: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1789: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1790: // move B data into Kokkos
1791: PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1792: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1793: {
1794: Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1795: if (!baijkok->diag_d.extent(0)) {
1796: const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1797: baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1798: Kokkos::deep_copy(baijkok->diag_d, h_diag);
1799: }
1800: }
1801: PetscFunctionReturn(PETSC_SUCCESS);
1802: }
1804: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1805: {
1806: PetscFunctionBegin;
1807: *type = MATSOLVERKOKKOS;
1808: PetscFunctionReturn(PETSC_SUCCESS);
1809: }
1811: static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1812: {
1813: PetscFunctionBegin;
1814: *type = MATSOLVERKOKKOSDEVICE;
1815: PetscFunctionReturn(PETSC_SUCCESS);
1816: }
1818: /*MC
1819:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1820:   of type `MATSEQAIJKOKKOS` or `MATAIJKOKKOS` on a single GPU.
1822: Level: beginner
1824: .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1825: M*/
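/*
   Minimal illustrative sketch (assumed typical usage, not taken from this file) of selecting
   MATSOLVERKOKKOS programmatically for an LU factorization inside a KSP/PC:

     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCLU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));
*/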
1826: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1827: {
1828: PetscInt n = A->rmap->n;
1830: PetscFunctionBegin;
1831: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1832: PetscCall(MatSetSizes(*B, n, n, n, n));
1833: (*B)->factortype = ftype;
1834: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1835: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1837: if (ftype == MAT_FACTOR_LU) {
1838: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1839: (*B)->canuseordering = PETSC_TRUE;
1840: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1841: } else if (ftype == MAT_FACTOR_ILU) {
1842: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1843: (*B)->canuseordering = PETSC_FALSE;
1844: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1845: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1847: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1848: PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1849: PetscFunctionReturn(PETSC_SUCCESS);
1850: }
1852: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1853: {
1854: PetscInt n = A->rmap->n;
1856: PetscFunctionBegin;
1857: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1858: PetscCall(MatSetSizes(*B, n, n, n, n));
1859: (*B)->factortype = ftype;
1860: (*B)->canuseordering = PETSC_TRUE;
1861: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1862: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1864: if (ftype == MAT_FACTOR_LU) {
1865: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1866: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1867:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by the KOKKOS device solver", MatFactorTypes[ftype]);
1869: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1870: PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1871: PetscFunctionReturn(PETSC_SUCCESS);
1872: }
1874: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1875: {
1876: PetscFunctionBegin;
1877: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1878: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1879: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1880: PetscFunctionReturn(PETSC_SUCCESS);
1881: }
1883: /* Utility to print out a KokkosCsrMatrix for debugging */
1884: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1885: {
1886: const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1887: const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1888: const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1889: const PetscInt *i = iv.data();
1890: const PetscInt *j = jv.data();
1891: const PetscScalar *a = av.data();
1892: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1894: PetscFunctionBegin;
1895: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1896: for (PetscInt k = 0; k < m; k++) {
1897: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1898: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1899: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1900: }
1901: PetscFunctionReturn(PETSC_SUCCESS);
1902: }