Actual source code: densehip.hip.cpp
1: /*
2: Defines the matrix operations for sequential dense with HIP
3: Portions of this code are under:
4: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
5: */
6: #include <petscpkg_version.h>
7: #include <../src/mat/impls/dense/seq/dense.h>
8: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
9: #include <petsc/private/petsclegacycupmblas.h>
10: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
11: #include <hipsolver/hipsolver.h>
12: #else
13: #include <hipsolver.h>
14: #endif
15: #include <thrust/device_ptr.h>
16: #include <thrust/functional.h>
17: #include <thrust/iterator/counting_iterator.h>
18: #include <thrust/iterator/transform_iterator.h>
19: #include <thrust/iterator/permutation_iterator.h>
20: #include <thrust/transform.h>
21: #include <thrust/device_vector.h>
23: using VecSeq_HIP = Petsc::vec::cupm::impl::VecSeq_CUPM<Petsc::device::cupm::DeviceType::HIP>;
25: typedef struct {
26: PetscScalar *d_v; /* pointer to the matrix on the GPU */
27: PetscBool user_alloc;
28: PetscScalar *unplacedarray; /* if one called MatHIPDensePlaceArray(), this is where it stashed the original */
29: PetscBool unplaced_user_alloc;
30: /* factorization support */
31: PetscHipBLASInt *d_fact_ipiv; /* device pivots */
32: PetscScalar *d_fact_tau; /* device QR tau vector */
33: PetscScalar *d_fact_work; /* device workspace */
34: PetscHipBLASInt fact_lwork;
35: PetscHipBLASInt *d_fact_info; /* device info */
36: /* workspace */
37: Vec workvec;
38: } Mat_SeqDenseHIP;
40: PetscErrorCode MatSeqDenseHIPSetPreallocation(Mat A, PetscScalar *d_data)
41: {
42: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
43: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
44: PetscBool iship;
46: PetscFunctionBegin;
47: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &iship));
48: if (!iship) PetscFunctionReturn(PETSC_SUCCESS);
49: /* it may happen CPU preallocation has not been performed */
50: PetscCall(PetscLayoutSetUp(A->rmap));
51: PetscCall(PetscLayoutSetUp(A->cmap));
52: if (cA->lda <= 0) cA->lda = A->rmap->n;
53: if (!dA->user_alloc) PetscCallHIP(hipFree(dA->d_v));
54: if (!d_data) { /* petsc-allocated storage */
55: size_t sz;
56: PetscCall(PetscIntMultError(cA->lda, A->cmap->n, NULL));
57: sz = cA->lda * A->cmap->n * sizeof(PetscScalar);
58: PetscCallHIP(hipMalloc((void **)&dA->d_v, sz));
59: PetscCallHIP(hipMemset(dA->d_v, 0, sz));
60: dA->user_alloc = PETSC_FALSE;
61: } else { /* user-allocated storage */
62: dA->d_v = d_data;
63: dA->user_alloc = PETSC_TRUE;
64: }
65: A->offloadmask = PETSC_OFFLOAD_GPU;
66: A->preallocated = PETSC_TRUE;
67: A->assembled = PETSC_TRUE;
68: PetscFunctionReturn(PETSC_SUCCESS);
69: }
71: PetscErrorCode MatSeqDenseHIPCopyFromGPU(Mat A)
72: {
73: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
74: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
76: PetscFunctionBegin;
77: PetscCheckTypeName(A, MATSEQDENSEHIP);
78: PetscCall(PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing", A->rmap->n, A->cmap->n));
79: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
80: if (!cA->v) { /* MatCreateSeqDenseHIP may not allocate CPU memory. Allocate if needed */
81: PetscCall(MatSeqDenseSetPreallocation(A, NULL));
82: }
83: PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, A, 0, 0, 0));
84: if (cA->lda > A->rmap->n) {
85: PetscCallHIP(hipMemcpy2D(cA->v, cA->lda * sizeof(PetscScalar), dA->d_v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, hipMemcpyDeviceToHost));
86: } else {
87: PetscCallHIP(hipMemcpy(cA->v, dA->d_v, cA->lda * sizeof(PetscScalar) * A->cmap->n, hipMemcpyDeviceToHost));
88: }
89: PetscCall(PetscLogGpuToCpu(cA->lda * sizeof(PetscScalar) * A->cmap->n));
90: PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, A, 0, 0, 0));
92: A->offloadmask = PETSC_OFFLOAD_BOTH;
93: }
94: PetscFunctionReturn(PETSC_SUCCESS);
95: }
97: PetscErrorCode MatSeqDenseHIPCopyToGPU(Mat A)
98: {
99: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
100: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
101: PetscBool copy;
103: PetscFunctionBegin;
104: PetscCheckTypeName(A, MATSEQDENSEHIP);
105: if (A->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
106: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
107: PetscCall(PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", A->rmap->n, A->cmap->n));
108: if (copy) {
109: if (!dA->d_v) { /* Allocate GPU memory if not present */
110: PetscCall(MatSeqDenseHIPSetPreallocation(A, NULL));
111: }
112: PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, A, 0, 0, 0));
113: if (cA->lda > A->rmap->n) {
114: PetscCallHIP(hipMemcpy2D(dA->d_v, cA->lda * sizeof(PetscScalar), cA->v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, hipMemcpyHostToDevice));
115: } else {
116: PetscCallHIP(hipMemcpy(dA->d_v, cA->v, cA->lda * sizeof(PetscScalar) * A->cmap->n, hipMemcpyHostToDevice));
117: }
118: PetscCall(PetscLogCpuToGpu(cA->lda * sizeof(PetscScalar) * A->cmap->n));
119: PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, A, 0, 0, 0));
121: A->offloadmask = PETSC_OFFLOAD_BOTH;
122: }
123: PetscFunctionReturn(PETSC_SUCCESS);
124: }
126: static PetscErrorCode MatCopy_SeqDenseHIP(Mat A, Mat B, MatStructure str)
127: {
128: const PetscScalar *va;
129: PetscScalar *vb;
130: PetscInt lda1, lda2, m = A->rmap->n, n = A->cmap->n;
132: PetscFunctionBegin;
133: /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
134: if (A->ops->copy != B->ops->copy) {
135: PetscCall(MatCopy_Basic(A, B, str));
136: PetscFunctionReturn(PETSC_SUCCESS);
137: }
138: PetscCheck(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
139: PetscCall(MatDenseHIPGetArrayRead(A, &va));
140: PetscCall(MatDenseHIPGetArrayWrite(B, &vb));
141: PetscCall(MatDenseGetLDA(A, &lda1));
142: PetscCall(MatDenseGetLDA(B, &lda2));
143: PetscCall(PetscLogGpuTimeBegin());
144: if (lda1 > m || lda2 > m) {
145: PetscCallHIP(hipMemcpy2D(vb, lda2 * sizeof(PetscScalar), va, lda1 * sizeof(PetscScalar), m * sizeof(PetscScalar), n, hipMemcpyDeviceToDevice));
146: } else {
147: PetscCallHIP(hipMemcpy(vb, va, m * (n * sizeof(PetscScalar)), hipMemcpyDeviceToDevice));
148: }
149: PetscCall(PetscLogGpuTimeEnd());
150: PetscCall(MatDenseHIPRestoreArrayWrite(B, &vb));
151: PetscCall(MatDenseHIPRestoreArrayRead(A, &va));
152: PetscFunctionReturn(PETSC_SUCCESS);
153: }
155: static PetscErrorCode MatZeroEntries_SeqDenseHIP(Mat A)
156: {
157: PetscScalar *va;
158: PetscInt lda, m = A->rmap->n, n = A->cmap->n;
160: PetscFunctionBegin;
161: PetscCall(MatDenseHIPGetArrayWrite(A, &va));
162: PetscCall(MatDenseGetLDA(A, &lda));
163: PetscCall(PetscLogGpuTimeBegin());
164: if (lda > m) {
165: PetscCallHIP(hipMemset2D(va, lda * sizeof(PetscScalar), 0, m * sizeof(PetscScalar), n));
166: } else {
167: PetscCallHIP(hipMemset(va, 0, m * (n * sizeof(PetscScalar))));
168: }
169: PetscCallHIP(WaitForHIP());
170: PetscCall(PetscLogGpuTimeEnd());
171: PetscCall(MatDenseHIPRestoreArrayWrite(A, &va));
172: PetscFunctionReturn(PETSC_SUCCESS);
173: }
175: static PetscErrorCode MatDenseHIPPlaceArray_SeqDenseHIP(Mat A, const PetscScalar *a)
176: {
177: Mat_SeqDense *aa = (Mat_SeqDense *)A->data;
178: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
180: PetscFunctionBegin;
181: PetscCheck(!aa->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
182: PetscCheck(!aa->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
183: PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseHIPResetArray() must be called first");
184: if (aa->v) PetscCall(MatSeqDenseHIPCopyToGPU(A));
185: dA->unplacedarray = dA->d_v;
186: dA->unplaced_user_alloc = dA->user_alloc;
187: dA->d_v = (PetscScalar *)a;
188: dA->user_alloc = PETSC_TRUE;
189: PetscFunctionReturn(PETSC_SUCCESS);
190: }
192: static PetscErrorCode MatDenseHIPResetArray_SeqDenseHIP(Mat A)
193: {
194: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
195: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
197: PetscFunctionBegin;
198: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
199: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
200: if (a->v) PetscCall(MatSeqDenseHIPCopyToGPU(A));
201: dA->d_v = dA->unplacedarray;
202: dA->user_alloc = dA->unplaced_user_alloc;
203: dA->unplacedarray = NULL;
204: PetscFunctionReturn(PETSC_SUCCESS);
205: }
207: static PetscErrorCode MatDenseHIPReplaceArray_SeqDenseHIP(Mat A, const PetscScalar *a)
208: {
209: Mat_SeqDense *aa = (Mat_SeqDense *)A->data;
210: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
212: PetscFunctionBegin;
213: PetscCheck(!aa->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
214: PetscCheck(!aa->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
215: PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseHIPResetArray() must be called first");
216: if (!dA->user_alloc) PetscCallHIP(hipFree(dA->d_v));
217: dA->d_v = (PetscScalar *)a;
218: dA->user_alloc = PETSC_FALSE;
219: PetscFunctionReturn(PETSC_SUCCESS);
220: }
222: static PetscErrorCode MatDenseHIPGetArrayWrite_SeqDenseHIP(Mat A, PetscScalar **a)
223: {
224: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
226: PetscFunctionBegin;
227: if (!dA->d_v) PetscCall(MatSeqDenseHIPSetPreallocation(A, NULL));
228: *a = dA->d_v;
229: PetscFunctionReturn(PETSC_SUCCESS);
230: }
232: static PetscErrorCode MatDenseHIPRestoreArrayWrite_SeqDenseHIP(Mat A, PetscScalar **a)
233: {
234: PetscFunctionBegin;
235: if (a) *a = NULL;
236: PetscFunctionReturn(PETSC_SUCCESS);
237: }
239: static PetscErrorCode MatDenseHIPGetArrayRead_SeqDenseHIP(Mat A, const PetscScalar **a)
240: {
241: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
243: PetscFunctionBegin;
244: PetscCall(MatSeqDenseHIPCopyToGPU(A));
245: *a = dA->d_v;
246: PetscFunctionReturn(PETSC_SUCCESS);
247: }
249: static PetscErrorCode MatDenseHIPRestoreArrayRead_SeqDenseHIP(Mat A, const PetscScalar **a)
250: {
251: PetscFunctionBegin;
252: if (a) *a = NULL;
253: PetscFunctionReturn(PETSC_SUCCESS);
254: }
256: static PetscErrorCode MatDenseHIPGetArray_SeqDenseHIP(Mat A, PetscScalar **a)
257: {
258: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
260: PetscFunctionBegin;
261: PetscCall(MatSeqDenseHIPCopyToGPU(A));
262: *a = dA->d_v;
263: PetscFunctionReturn(PETSC_SUCCESS);
264: }
266: static PetscErrorCode MatDenseHIPRestoreArray_SeqDenseHIP(Mat A, PetscScalar **a)
267: {
268: PetscFunctionBegin;
269: if (a) *a = NULL;
270: PetscFunctionReturn(PETSC_SUCCESS);
271: }
273: PETSC_EXTERN PetscErrorCode MatSeqDenseHIPInvertFactors_Private(Mat A)
274: {
275: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
276: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
277: PetscScalar *da;
278: hipsolverHandle_t handle;
279: PetscHipBLASInt n, lda;
280: #if defined(PETSC_USE_DEBUG)
281: PetscHipBLASInt info;
282: #endif
284: PetscFunctionBegin;
285: if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
286: PetscCall(PetscHIPSOLVERGetHandle(&handle));
287: PetscCall(PetscHipBLASIntCast(A->cmap->n, &n));
288: PetscCall(PetscHipBLASIntCast(a->lda, &lda));
289: PetscCheck(A->factortype != MAT_FACTOR_LU, PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDngetri not implemented");
290: if (A->factortype == MAT_FACTOR_CHOLESKY) {
291: if (!dA->d_fact_ipiv) { /* spd */
292: PetscHipBLASInt il;
294: PetscCall(MatDenseHIPGetArray(A, &da));
295: PetscCallHIPSOLVER(hipsolverDnXpotri_bufferSize(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, &il));
296: if (il > dA->fact_lwork) {
297: dA->fact_lwork = il;
299: PetscCallHIP(hipFree(dA->d_fact_work));
300: PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
301: }
302: PetscCall(PetscLogGpuTimeBegin());
303: PetscCallHIPSOLVER(hipsolverDnXpotri(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
304: PetscCall(PetscLogGpuTimeEnd());
305: PetscCall(MatDenseHIPRestoreArray(A, &da));
306: /* TODO (write hip kernel) */
307: PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
308: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDnsytri not implemented");
309: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Not implemented");
310: #if defined(PETSC_USE_DEBUG)
311: PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
312: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: leading minor of order %d is zero", info);
313: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
314: #endif
315: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
316: A->ops->solve = NULL;
317: A->ops->solvetranspose = NULL;
318: A->ops->matsolve = NULL;
319: A->factortype = MAT_FACTOR_NONE;
321: PetscCall(PetscFree(A->solvertype));
322: PetscFunctionReturn(PETSC_SUCCESS);
323: }
325: static PetscErrorCode MatSolve_SeqDenseHIP_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscBool))
326: {
327: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
328: PetscScalar *y;
329: PetscHipBLASInt m = 0, k = 0;
330: PetscBool xiship, yiship, aiship;
332: PetscFunctionBegin;
333: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
334: PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
335: PetscCall(PetscHipBLASIntCast(A->cmap->n, &k));
336: PetscCall(PetscObjectTypeCompare((PetscObject)xx, VECSEQHIP, &xiship));
337: PetscCall(PetscObjectTypeCompare((PetscObject)yy, VECSEQHIP, &yiship));
338: {
339: const PetscScalar *x;
340: PetscBool xishost = PETSC_TRUE;
342: /* The logic here is to try to minimize the amount of memory copying:
343: if we call VecHIPGetArrayRead(X,&x) every time xiship and the
344: data is not offloaded to the GPU yet, then the data is copied to the
345: GPU. But we are only trying to get the data in order to copy it into the y
346: array. So the array x will be wherever the data already is so that
347: only one memcpy is performed */
348: if (xiship && xx->offloadmask & PETSC_OFFLOAD_GPU) {
349: PetscCall(VecHIPGetArrayRead(xx, &x));
350: xishost = PETSC_FALSE;
351: } else PetscCall(VecGetArrayRead(xx, &x));
352: if (k < m || !yiship) {
353: if (!dA->workvec) PetscCall(VecCreateSeqHIP(PetscObjectComm((PetscObject)A), m, &(dA->workvec)));
354: PetscCall(VecHIPGetArrayWrite(dA->workvec, &y));
355: } else PetscCall(VecHIPGetArrayWrite(yy, &y));
356: PetscCallHIP(hipMemcpy(y, x, m * sizeof(PetscScalar), xishost ? hipMemcpyHostToDevice : hipMemcpyDeviceToDevice));
357: }
358: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &aiship));
359: if (!aiship) PetscCall(MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A));
360: PetscCall((*matsolve)(A, y, m, m, 1, k, transpose));
361: if (!aiship) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
362: if (k < m || !yiship) {
363: PetscScalar *yv;
365: /* The logic here is that the data is not yet in either yy's GPU array or its
366: CPU array. There is nothing in the interface to say where the user would like
367: it to end up. So we choose the GPU, because it is the faster option */
368: if (yiship) PetscCall(VecHIPGetArrayWrite(yy, &yv));
369: else PetscCall(VecGetArray(yy, &yv));
370: PetscCallHIP(hipMemcpy(yv, y, k * sizeof(PetscScalar), yiship ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
371: if (yiship) PetscCall(VecHIPRestoreArrayWrite(yy, &yv));
372: else PetscCall(VecRestoreArray(yy, &yv));
373: PetscCall(VecHIPRestoreArrayWrite(dA->workvec, &y));
374: } else PetscCall(VecHIPRestoreArrayWrite(yy, &y));
375: PetscFunctionReturn(PETSC_SUCCESS);
376: }
378: static PetscErrorCode MatMatSolve_SeqDenseHIP_Internal(Mat A, Mat B, Mat X, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscBool))
379: {
380: PetscScalar *y;
381: PetscInt n, _ldb, _ldx;
382: PetscBool biship, xiship, aiship;
383: PetscHipBLASInt nrhs = 0, m = 0, k = 0, ldb = 0, ldx = 0, ldy = 0;
385: PetscFunctionBegin;
386: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
387: PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
388: PetscCall(PetscHipBLASIntCast(A->cmap->n, &k));
389: PetscCall(MatGetSize(B, NULL, &n));
390: PetscCall(PetscHipBLASIntCast(n, &nrhs));
391: PetscCall(MatDenseGetLDA(B, &_ldb));
392: PetscCall(PetscHipBLASIntCast(_ldb, &ldb));
393: PetscCall(MatDenseGetLDA(X, &_ldx));
394: PetscCall(PetscHipBLASIntCast(_ldx, &ldx));
395: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
396: PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSEHIP, &xiship));
397: /* The logic here is to try to minimize the amount of memory copying:
398: if we call MatDenseHIPGetArrayRead(B,&b) every time biship and the
399: data is not offloaded to the GPU yet, then the data is copied to the
400: GPU. But we are only trying to get the data in order to copy it into the y
401: array. So the array b will be wherever the data already is so that
402: only one memcpy is performed */
403: const PetscScalar *b;
404: /* some copying from B will be involved */
405: PetscBool bishost = PETSC_TRUE;
406: if (biship && B->offloadmask & PETSC_OFFLOAD_GPU) {
407: PetscCall(MatDenseHIPGetArrayRead(B, &b));
408: bishost = PETSC_FALSE;
409: } else PetscCall(MatDenseGetArrayRead(B, &b));
410: if (ldx < m || !xiship) {
411: /* X's array cannot serve as the array (too small or not on device), B's
412: * array cannot serve as the array (const), so allocate a new array */
413: ldy = m;
414: PetscCallHIP(hipMalloc((void **)&y, nrhs * m * sizeof(PetscScalar)));
415: } else {
416: /* X's array should serve as the array */
417: ldy = ldx;
418: PetscCall(MatDenseHIPGetArrayWrite(X, &y));
419: }
420: PetscCallHIP(hipMemcpy2D(y, ldy * sizeof(PetscScalar), b, ldb * sizeof(PetscScalar), m * sizeof(PetscScalar), nrhs, bishost ? hipMemcpyHostToDevice : hipMemcpyDeviceToDevice));
421: if (bishost) PetscCall(MatDenseRestoreArrayRead(B, &b));
422: else PetscCall(MatDenseHIPRestoreArrayRead(B, &b));
424: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &aiship));
425: if (!aiship) PetscCall(MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A));
426: PetscCall((*matsolve)(A, y, ldy, m, nrhs, k, transpose));
427: if (!aiship) PetscCall(MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A));
428: if (ldx < m || !xiship) {
429: PetscScalar *x;
431: /* The logic here is that the data is not yet in either X's GPU array or its
432: CPU array. There is nothing in the interface to say where the user would like
433: it to end up. So we choose the GPU, because it is the faster option */
434: if (xiship) PetscCall(MatDenseHIPGetArrayWrite(X, &x));
435: else PetscCall(MatDenseGetArray(X, &x));
436: PetscCallHIP(hipMemcpy2D(x, ldx * sizeof(PetscScalar), y, ldy * sizeof(PetscScalar), k * sizeof(PetscScalar), nrhs, xiship ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
437: if (xiship) PetscCall(MatDenseHIPRestoreArrayWrite(X, &x));
438: else PetscCall(MatDenseRestoreArray(X, &x));
439: PetscCallHIP(hipFree(y));
440: } else PetscCall(MatDenseHIPRestoreArrayWrite(X, &y));
441: PetscFunctionReturn(PETSC_SUCCESS);
442: }
444: static PetscErrorCode MatSolve_SeqDenseHIP_Internal_LU(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
445: {
446: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
447: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
448: const PetscScalar *da;
449: PetscHipBLASInt lda;
450: hipsolverHandle_t handle;
451: int info;
453: PetscFunctionBegin;
454: PetscCall(MatDenseHIPGetArrayRead(A, &da));
455: PetscCall(PetscHipBLASIntCast(mat->lda, &lda));
456: PetscCall(PetscHIPSOLVERGetHandle(&handle));
457: PetscCall(PetscLogGpuTimeBegin());
458: PetscCall(PetscInfo(A, "LU solve %d x %d on backend\n", m, k));
459: PetscCallHIPSOLVER(hipsolverDnXgetrs(handle, T ? HIPSOLVER_OP_T : HIPSOLVER_OP_N, m, nrhs, da, lda, dA->d_fact_ipiv, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
460: PetscCall(PetscLogGpuTimeEnd());
461: PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
462: if (PetscDefined(USE_DEBUG)) {
463: PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
464: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
465: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
466: }
467: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
468: PetscFunctionReturn(PETSC_SUCCESS);
469: }
471: static PetscErrorCode MatSolve_SeqDenseHIP_Internal_Cholesky(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
472: {
473: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
474: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
475: const PetscScalar *da;
476: PetscHipBLASInt lda;
477: hipsolverHandle_t handle;
478: int info;
480: PetscFunctionBegin;
481: PetscCall(MatDenseHIPGetArrayRead(A, &da));
482: PetscCall(PetscHipBLASIntCast(mat->lda, &lda));
483: PetscCall(PetscHIPSOLVERGetHandle(&handle));
484: PetscCall(PetscLogGpuTimeBegin());
485: PetscCall(PetscInfo(A, "Cholesky solve %d x %d on backend\n", m, k));
486: if (!dA->d_fact_ipiv) { /* spd */
487: /* ========= Program hit hipErrorNotReady (error 34) due to "device not ready" on HIP API call to hipEventQuery. */
488: PetscCallHIPSOLVER(hipsolverDnXpotrs(handle, HIPSOLVER_FILL_MODE_LOWER, m, nrhs, da, lda, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
489: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDnsytrs not implemented");
490: PetscCall(PetscLogGpuTimeEnd());
491: PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
492: if (PetscDefined(USE_DEBUG)) {
493: PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
494: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
495: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
496: }
497: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
498: PetscFunctionReturn(PETSC_SUCCESS);
499: }
501: static PetscErrorCode MatSolve_SeqDenseHIP_Internal_QR(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
502: {
503: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
504: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
505: const PetscScalar *da;
506: PetscHipBLASInt lda, rank;
507: hipsolverHandle_t handle;
508: hipblasHandle_t bhandle;
509: int info;
510: hipsolverOperation_t trans;
511: PetscScalar one = 1.;
513: PetscFunctionBegin;
514: PetscCall(PetscHipBLASIntCast(mat->rank, &rank));
515: PetscCall(MatDenseHIPGetArrayRead(A, &da));
516: PetscCall(PetscHipBLASIntCast(mat->lda, &lda));
517: PetscCall(PetscHIPSOLVERGetHandle(&handle));
518: PetscCall(PetscHIPBLASGetHandle(&bhandle));
519: PetscCall(PetscLogGpuTimeBegin());
520: PetscCall(PetscInfo(A, "QR solve %d x %d on backend\n", m, k));
521: if (!T) {
522: if (PetscDefined(USE_COMPLEX)) trans = HIPSOLVER_OP_C;
523: else trans = HIPSOLVER_OP_T;
524: PetscCallHIPSOLVER(hipsolverDnXormqr(handle, HIPSOLVER_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
525: if (PetscDefined(USE_DEBUG)) {
526: PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
527: PetscCheck(info == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
528: }
529: PetscCallHIPBLAS(hipblasXtrsm(bhandle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N, HIPBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx));
530: } else {
531: PetscCallHIPBLAS(hipblasXtrsm(bhandle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx));
532: PetscCallHIPSOLVER(hipsolverDnXormqr(handle, HIPSOLVER_SIDE_LEFT, HIPSOLVER_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
533: if (PetscDefined(USE_DEBUG)) {
534: PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
535: PetscCheck(info == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
536: }
537: }
538: PetscCall(PetscLogGpuTimeEnd());
539: PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
540: PetscCall(PetscLogFlops(nrhs * (4.0 * m * mat->rank - PetscSqr(mat->rank))));
541: PetscFunctionReturn(PETSC_SUCCESS);
542: }
544: static PetscErrorCode MatSolve_SeqDenseHIP_LU(Mat A, Vec xx, Vec yy)
545: {
546: PetscFunctionBegin;
547: PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_LU));
548: PetscFunctionReturn(PETSC_SUCCESS);
549: }
551: static PetscErrorCode MatSolveTranspose_SeqDenseHIP_LU(Mat A, Vec xx, Vec yy)
552: {
553: PetscFunctionBegin;
554: PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_LU));
555: PetscFunctionReturn(PETSC_SUCCESS);
556: }
558: static PetscErrorCode MatSolve_SeqDenseHIP_Cholesky(Mat A, Vec xx, Vec yy)
559: {
560: PetscFunctionBegin;
561: PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_Cholesky));
562: PetscFunctionReturn(PETSC_SUCCESS);
563: }
565: static PetscErrorCode MatSolveTranspose_SeqDenseHIP_Cholesky(Mat A, Vec xx, Vec yy)
566: {
567: PetscFunctionBegin;
568: PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_Cholesky));
569: PetscFunctionReturn(PETSC_SUCCESS);
570: }
572: static PetscErrorCode MatSolve_SeqDenseHIP_QR(Mat A, Vec xx, Vec yy)
573: {
574: PetscFunctionBegin;
575: PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_QR));
576: PetscFunctionReturn(PETSC_SUCCESS);
577: }
579: static PetscErrorCode MatSolveTranspose_SeqDenseHIP_QR(Mat A, Vec xx, Vec yy)
580: {
581: PetscFunctionBegin;
582: PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_QR));
583: PetscFunctionReturn(PETSC_SUCCESS);
584: }
586: static PetscErrorCode MatMatSolve_SeqDenseHIP_LU(Mat A, Mat B, Mat X)
587: {
588: PetscFunctionBegin;
589: PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_LU));
590: PetscFunctionReturn(PETSC_SUCCESS);
591: }
593: static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_LU(Mat A, Mat B, Mat X)
594: {
595: PetscFunctionBegin;
596: PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_LU));
597: PetscFunctionReturn(PETSC_SUCCESS);
598: }
600: static PetscErrorCode MatMatSolve_SeqDenseHIP_Cholesky(Mat A, Mat B, Mat X)
601: {
602: PetscFunctionBegin;
603: PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_Cholesky));
604: PetscFunctionReturn(PETSC_SUCCESS);
605: }
607: static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_Cholesky(Mat A, Mat B, Mat X)
608: {
609: PetscFunctionBegin;
610: PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_Cholesky));
611: PetscFunctionReturn(PETSC_SUCCESS);
612: }
614: static PetscErrorCode MatMatSolve_SeqDenseHIP_QR(Mat A, Mat B, Mat X)
615: {
616: PetscFunctionBegin;
617: PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_QR));
618: PetscFunctionReturn(PETSC_SUCCESS);
619: }
621: static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_QR(Mat A, Mat B, Mat X)
622: {
623: PetscFunctionBegin;
624: PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_QR));
625: PetscFunctionReturn(PETSC_SUCCESS);
626: }
628: static PetscErrorCode MatLUFactor_SeqDenseHIP(Mat A, IS rperm, IS cperm, const MatFactorInfo *factinfo)
629: {
630: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
631: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
632: PetscScalar *da;
633: PetscHipBLASInt m, n, lda;
634: hipsolverHandle_t handle;
635: #if defined(PETSC_USE_DEBUG)
636: int info;
637: #endif
639: PetscFunctionBegin;
640: if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
641: PetscCall(PetscHIPSOLVERGetHandle(&handle));
642: PetscCall(MatDenseHIPGetArray(A, &da));
643: PetscCall(PetscHipBLASIntCast(A->cmap->n, &n));
644: PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
645: PetscCall(PetscHipBLASIntCast(a->lda, &lda));
646: PetscCall(PetscInfo(A, "LU factor %d x %d on backend\n", m, n));
647: if (!dA->d_fact_ipiv) PetscCallHIP(hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
648: if (!dA->fact_lwork) {
649: PetscCallHIPSOLVER(hipsolverDnXgetrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork));
650: PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
651: }
652: if (!dA->d_fact_info) PetscCallHIP(hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
653: PetscCall(PetscLogGpuTimeBegin());
654: PetscCallHIPSOLVER(hipsolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_ipiv, dA->d_fact_info));
655: PetscCall(PetscLogGpuTimeEnd());
656: PetscCall(MatDenseHIPRestoreArray(A, &da));
657: #if defined(PETSC_USE_DEBUG)
658: PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
659: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
660: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
661: #endif
662: A->factortype = MAT_FACTOR_LU;
663: PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
665: A->ops->solve = MatSolve_SeqDenseHIP_LU;
666: A->ops->solvetranspose = MatSolveTranspose_SeqDenseHIP_LU;
667: A->ops->matsolve = MatMatSolve_SeqDenseHIP_LU;
668: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_LU;
670: PetscCall(PetscFree(A->solvertype));
671: PetscCall(PetscStrallocpy(MATSOLVERHIP, &A->solvertype));
672: PetscFunctionReturn(PETSC_SUCCESS);
673: }
675: static PetscErrorCode MatCholeskyFactor_SeqDenseHIP(Mat A, IS perm, const MatFactorInfo *factinfo)
676: {
677: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
678: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
679: PetscScalar *da;
680: PetscHipBLASInt n, lda;
681: hipsolverHandle_t handle;
682: #if defined(PETSC_USE_DEBUG)
683: int info;
684: #endif
686: PetscFunctionBegin;
687: if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
688: PetscCall(PetscHIPSOLVERGetHandle(&handle));
689: PetscCall(PetscHipBLASIntCast(A->rmap->n, &n));
690: PetscCall(PetscInfo(A, "Cholesky factor %d x %d on backend\n", n, n));
691: if (A->spd == PETSC_BOOL3_TRUE) {
692: PetscCall(MatDenseHIPGetArray(A, &da));
693: PetscCall(PetscHipBLASIntCast(a->lda, &lda));
694: if (!dA->fact_lwork) {
695: PetscCallHIPSOLVER(hipsolverDnXpotrf_bufferSize(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, &dA->fact_lwork));
696: PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
697: }
698: if (!dA->d_fact_info) PetscCallHIP(hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
699: PetscCall(PetscLogGpuTimeBegin());
700: PetscCallHIPSOLVER(hipsolverDnXpotrf(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
701: PetscCall(PetscLogGpuTimeEnd());
703: PetscCall(MatDenseHIPRestoreArray(A, &da));
704: #if defined(PETSC_USE_DEBUG)
705: PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
706: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
707: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
708: #endif
709: A->factortype = MAT_FACTOR_CHOLESKY;
710: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
711: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipsolverDnsytrs unavailable. Use MAT_FACTOR_LU");
713: /* at the time of writing hipsolverDn has *sytrs and *hetr* routines implemented and the
714: code below should work */
715: if (!dA->d_fact_ipiv) PetscCallHIP(hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
716: if (!dA->fact_lwork) {
717: PetscCallHIPSOLVER(hipsolverDnXsytrf_bufferSize(handle, n, da, lda, &dA->fact_lwork));
718: PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
719: }
720: if (!dA->d_fact_info) PetscCallHIP(hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
721: PetscCall(PetscLogGpuTimeBegin());
722: PetscCallHIPSOLVER(hipsolverDnXsytrf(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_ipiv, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
723: PetscCall(PetscLogGpuTimeEnd());
725: A->ops->solve = MatSolve_SeqDenseHIP_Cholesky;
726: A->ops->solvetranspose = MatSolveTranspose_SeqDenseHIP_Cholesky;
727: A->ops->matsolve = MatMatSolve_SeqDenseHIP_Cholesky;
728: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_Cholesky;
729: PetscCall(PetscFree(A->solvertype));
730: PetscCall(PetscStrallocpy(MATSOLVERHIP, &A->solvertype));
731: PetscFunctionReturn(PETSC_SUCCESS);
732: }
734: static PetscErrorCode MatQRFactor_SeqDenseHIP(Mat A, IS col, const MatFactorInfo *factinfo)
735: {
736: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
737: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
738: PetscScalar *da;
739: PetscHipBLASInt m, min, max, n, lda;
740: hipsolverHandle_t handle;
741: #if defined(PETSC_USE_DEBUG)
742: int info;
743: #endif
745: PetscFunctionBegin;
746: if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
747: PetscCall(PetscHIPSOLVERGetHandle(&handle));
748: PetscCall(MatDenseHIPGetArray(A, &da));
749: PetscCall(PetscHipBLASIntCast(A->cmap->n, &n));
750: PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
751: PetscCall(PetscHipBLASIntCast(a->lda, &lda));
752: PetscCall(PetscInfo(A, "QR factor %d x %d on backend\n", m, n));
753: max = PetscMax(m, n);
754: min = PetscMin(m, n);
755: if (!dA->d_fact_tau) PetscCallHIP(hipMalloc((void **)&dA->d_fact_tau, min * sizeof(*dA->d_fact_tau)));
756: if (!dA->d_fact_ipiv) PetscCallHIP(hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
757: if (!dA->fact_lwork) {
758: PetscCallHIPSOLVER(hipsolverDnXgeqrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork));
759: PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
760: }
761: if (!dA->d_fact_info) PetscCallHIP(hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
762: if (!dA->workvec) PetscCall(VecCreateSeqHIP(PetscObjectComm((PetscObject)A), m, &(dA->workvec)));
763: PetscCall(PetscLogGpuTimeBegin());
764: PetscCallHIPSOLVER(hipsolverDnXgeqrf(handle, m, n, da, lda, dA->d_fact_tau, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
765: PetscCall(PetscLogGpuTimeEnd());
766: PetscCall(MatDenseHIPRestoreArray(A, &da));
767: #if defined(PETSC_USE_DEBUG)
768: PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
769: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
770: #endif
771: A->factortype = MAT_FACTOR_QR;
772: a->rank = min;
773: PetscCall(PetscLogGpuFlops(2.0 * min * min * (max - min / 3.0)));
775: A->ops->solve = MatSolve_SeqDenseHIP_QR;
776: A->ops->solvetranspose = MatSolveTranspose_SeqDenseHIP_QR;
777: A->ops->matsolve = MatMatSolve_SeqDenseHIP_QR;
778: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_QR;
780: PetscCall(PetscFree(A->solvertype));
781: PetscCall(PetscStrallocpy(MATSOLVERHIP, &A->solvertype));
782: PetscFunctionReturn(PETSC_SUCCESS);
783: }
785: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
786: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(Mat A, Mat B, Mat C, PetscBool tA, PetscBool tB)
787: {
788: const PetscScalar *da, *db;
789: PetscScalar *dc;
790: PetscScalar one = 1.0, zero = 0.0;
791: PetscHipBLASInt m, n, k;
792: PetscInt alda, blda, clda;
793: hipblasHandle_t hipblasv2handle;
794: PetscBool Aiship, Biship;
796: PetscFunctionBegin;
797: /* we may end up with SEQDENSE as one of the arguments */
798: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &Aiship));
799: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &Biship));
800: if (!Aiship) PetscCall(MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A));
801: if (!Biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
802: PetscCall(PetscHipBLASIntCast(C->rmap->n, &m));
803: PetscCall(PetscHipBLASIntCast(C->cmap->n, &n));
804: if (tA) PetscCall(PetscHipBLASIntCast(A->rmap->n, &k));
805: else PetscCall(PetscHipBLASIntCast(A->cmap->n, &k));
806: if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
807: PetscCall(PetscInfo(C, "Matrix-Matrix product %d x %d x %d on backend\n", m, k, n));
808: PetscCall(MatDenseHIPGetArrayRead(A, &da));
809: PetscCall(MatDenseHIPGetArrayRead(B, &db));
810: PetscCall(MatDenseHIPGetArrayWrite(C, &dc));
811: PetscCall(MatDenseGetLDA(A, &alda));
812: PetscCall(MatDenseGetLDA(B, &blda));
813: PetscCall(MatDenseGetLDA(C, &clda));
814: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
815: PetscCall(PetscLogGpuTimeBegin());
816: PetscCallHIPBLAS(hipblasXgemm(hipblasv2handle, tA ? HIPBLAS_OP_T : HIPBLAS_OP_N, tB ? HIPBLAS_OP_T : HIPBLAS_OP_N, m, n, k, &one, da, alda, db, blda, &zero, dc, clda));
817: PetscCall(PetscLogGpuTimeEnd());
818: PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
819: PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
820: PetscCall(MatDenseHIPRestoreArrayRead(B, &db));
821: PetscCall(MatDenseHIPRestoreArrayWrite(C, &dc));
822: if (!Aiship) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
823: if (!Biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
824: PetscFunctionReturn(PETSC_SUCCESS);
825: }
827: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
828: {
829: PetscFunctionBegin;
830: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_TRUE, PETSC_FALSE));
831: PetscFunctionReturn(PETSC_SUCCESS);
832: }
834: PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
835: {
836: PetscFunctionBegin;
837: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_FALSE, PETSC_FALSE));
838: PetscFunctionReturn(PETSC_SUCCESS);
839: }
841: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
842: {
843: PetscFunctionBegin;
844: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_FALSE, PETSC_TRUE));
845: PetscFunctionReturn(PETSC_SUCCESS);
846: }
848: PetscErrorCode MatProductSetFromOptions_SeqDenseHIP(Mat C)
849: {
850: PetscFunctionBegin;
851: PetscCall(MatProductSetFromOptions_SeqDense(C));
852: PetscFunctionReturn(PETSC_SUCCESS);
853: }
855: /* zz = op(A)*xx + yy
856: if yy == NULL, only MatMult */
857: static PetscErrorCode MatMultAdd_SeqDenseHIP_Private(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans)
858: {
859: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
860: const PetscScalar *xarray, *da;
861: PetscScalar *zarray;
862: PetscScalar one = 1.0, zero = 0.0;
863: PetscHipBLASInt m, n, lda;
864: hipblasHandle_t hipblasv2handle;
866: PetscFunctionBegin;
867: if (yy && yy != zz) PetscCall(VecSeq_HIP::copy(yy, zz)); /* mult add */
868: if (!A->rmap->n || !A->cmap->n) {
869: if (!yy) PetscCall(VecSeq_HIP::set(zz, 0.0)); /* mult only */
870: PetscFunctionReturn(PETSC_SUCCESS);
871: }
872: PetscCall(PetscInfo(A, "Matrix-vector product %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", A->rmap->n, A->cmap->n));
873: PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
874: PetscCall(PetscHipBLASIntCast(A->cmap->n, &n));
875: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
876: PetscCall(MatDenseHIPGetArrayRead(A, &da));
877: PetscCall(PetscHipBLASIntCast(mat->lda, &lda));
878: PetscCall(VecHIPGetArrayRead(xx, &xarray));
879: PetscCall(VecHIPGetArray(zz, &zarray));
880: PetscCall(PetscLogGpuTimeBegin());
881: PetscCallHIPBLAS(hipblasXgemv(hipblasv2handle, trans ? HIPBLAS_OP_T : HIPBLAS_OP_N, m, n, &one, da, lda, xarray, 1, (yy ? &one : &zero), zarray, 1));
882: PetscCall(PetscLogGpuTimeEnd());
883: PetscCall(PetscLogGpuFlops(2.0 * A->rmap->n * A->cmap->n - (yy ? 0 : A->rmap->n)));
884: PetscCall(VecHIPRestoreArrayRead(xx, &xarray));
885: PetscCall(VecHIPRestoreArray(zz, &zarray));
886: PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
887: PetscFunctionReturn(PETSC_SUCCESS);
888: }
890: PetscErrorCode MatMultAdd_SeqDenseHIP(Mat A, Vec xx, Vec yy, Vec zz)
891: {
892: PetscFunctionBegin;
893: PetscCall(MatMultAdd_SeqDenseHIP_Private(A, xx, yy, zz, PETSC_FALSE));
894: PetscFunctionReturn(PETSC_SUCCESS);
895: }
897: PetscErrorCode MatMultTransposeAdd_SeqDenseHIP(Mat A, Vec xx, Vec yy, Vec zz)
898: {
899: PetscFunctionBegin;
900: PetscCall(MatMultAdd_SeqDenseHIP_Private(A, xx, yy, zz, PETSC_TRUE));
901: PetscFunctionReturn(PETSC_SUCCESS);
902: }
904: PetscErrorCode MatMult_SeqDenseHIP(Mat A, Vec xx, Vec yy)
905: {
906: PetscFunctionBegin;
907: PetscCall(MatMultAdd_SeqDenseHIP_Private(A, xx, NULL, yy, PETSC_FALSE));
908: PetscFunctionReturn(PETSC_SUCCESS);
909: }
911: PetscErrorCode MatMultTranspose_SeqDenseHIP(Mat A, Vec xx, Vec yy)
912: {
913: PetscFunctionBegin;
914: PetscCall(MatMultAdd_SeqDenseHIP_Private(A, xx, NULL, yy, PETSC_TRUE));
915: PetscFunctionReturn(PETSC_SUCCESS);
916: }
918: static PetscErrorCode MatDenseGetArrayRead_SeqDenseHIP(Mat A, const PetscScalar **array)
919: {
920: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
922: PetscFunctionBegin;
923: PetscCall(MatSeqDenseHIPCopyFromGPU(A));
924: *array = mat->v;
925: PetscFunctionReturn(PETSC_SUCCESS);
926: }
928: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseHIP(Mat A, PetscScalar **array)
929: {
930: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
932: PetscFunctionBegin;
933: if (!mat->v) PetscCall(MatSeqDenseSetPreallocation(A, NULL)); /* MatCreateSeqDenseHIP may not allocate CPU memory. Allocate if needed */
934: *array = mat->v;
935: A->offloadmask = PETSC_OFFLOAD_CPU;
936: PetscFunctionReturn(PETSC_SUCCESS);
937: }
939: static PetscErrorCode MatDenseGetArray_SeqDenseHIP(Mat A, PetscScalar **array)
940: {
941: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
943: PetscFunctionBegin;
944: PetscCall(MatSeqDenseHIPCopyFromGPU(A));
945: *array = mat->v;
946: A->offloadmask = PETSC_OFFLOAD_CPU;
947: PetscFunctionReturn(PETSC_SUCCESS);
948: }
950: static PetscErrorCode MatDenseGetArrayAndMemType_SeqDenseHIP(Mat A, PetscScalar **array, PetscMemType *mtype)
951: {
952: const auto dA = static_cast<Mat_SeqDenseHIP *>(A->spptr);
954: PetscFunctionBegin;
955: PetscCall(MatSeqDenseHIPCopyToGPU(A)); // Since we will read the array on device, we sync the GPU data if necessary
956: *array = dA->d_v;
957: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
958: PetscFunctionReturn(PETSC_SUCCESS);
959: }
961: static PetscErrorCode MatDenseRestoreArrayAndMemType_SeqDenseHIP(Mat A, PetscScalar **array)
962: {
963: PetscFunctionBegin;
964: *array = nullptr;
965: A->offloadmask = PETSC_OFFLOAD_GPU; // Since we've written to the array on device
966: PetscFunctionReturn(PETSC_SUCCESS);
967: }
969: static PetscErrorCode (*MatDenseGetArrayReadAndMemType_SeqDenseHIP)(Mat, PetscScalar **, PetscMemType *) = MatDenseGetArrayAndMemType_SeqDenseHIP;
970: static PetscErrorCode (*MatDenseRestoreArrayReadAndMemType_SeqDenseHIP)(Mat, PetscScalar **) = nullptr; // Keep the offload mask as is
972: static PetscErrorCode MatDenseGetArrayWriteAndMemType_SeqDenseHIP(Mat A, PetscScalar **array, PetscMemType *mtype)
973: {
974: const auto dA = static_cast<Mat_SeqDenseHIP *>(A->spptr);
976: PetscFunctionBegin;
977: if (!dA->d_v) PetscCall(MatSeqDenseHIPSetPreallocation(A, NULL)); // Allocate GPU memory if not present
978: *array = dA->d_v;
979: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
980: PetscFunctionReturn(PETSC_SUCCESS);
981: }
983: static PetscErrorCode (*MatDenseRestoreArrayWriteAndMemType_SeqDenseHIP)(Mat, PetscScalar **) = MatDenseRestoreArrayAndMemType_SeqDenseHIP; // Since we've written to the array on device
985: PetscErrorCode MatScale_SeqDenseHIP(Mat Y, PetscScalar alpha)
986: {
987: Mat_SeqDense *y = (Mat_SeqDense *)Y->data;
988: PetscScalar *dy;
989: PetscHipBLASInt j, N, m, lday, one = 1;
990: hipblasHandle_t hipblasv2handle;
992: PetscFunctionBegin;
993: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
994: PetscCall(MatDenseHIPGetArray(Y, &dy));
995: PetscCall(PetscHipBLASIntCast(Y->rmap->n * Y->cmap->n, &N));
996: PetscCall(PetscHipBLASIntCast(Y->rmap->n, &m));
997: PetscCall(PetscHipBLASIntCast(y->lda, &lday));
998: PetscCall(PetscInfo(Y, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n));
999: PetscCall(PetscLogGpuTimeBegin());
1000: if (lday > m) {
1001: for (j = 0; j < Y->cmap->n; j++) PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, m, &alpha, dy + lday * j, one));
1002: } else PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, N, &alpha, dy, one));
1003: PetscCall(PetscLogGpuTimeEnd());
1004: PetscCall(PetscLogGpuFlops(N));
1005: PetscCall(MatDenseHIPRestoreArray(Y, &dy));
1006: PetscFunctionReturn(PETSC_SUCCESS);
1007: }
1009: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1010: const PetscScalar shift_;
1011: petscshift(PetscScalar shift) : shift_(shift) { }
1012: __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1013: };
1015: template <typename Iterator>
1016: class strided_range {
1017: public:
1018: typedef typename thrust::iterator_difference<Iterator>::type difference_type;
1019: struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
1020: difference_type stride;
1021: stride_functor(difference_type stride) : stride(stride) { }
1022: __device__ difference_type operator()(const difference_type &i) const { return stride * i; }
1023: };
1024: typedef typename thrust::counting_iterator<difference_type> CountingIterator;
1025: typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
1026: typedef typename thrust::permutation_iterator<Iterator, TransformIterator> PermutationIterator;
1027: typedef PermutationIterator iterator; // type of the strided_range iterator
1028: // construct strided_range for the range [first,last)
1029: strided_range(Iterator first, Iterator last, difference_type stride) : first(first), last(last), stride(stride) { }
1030: iterator begin(void) const { return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); }
1031: iterator end(void) const { return begin() + ((last - first) + (stride - 1)) / stride; }
1033: protected:
1034: Iterator first;
1035: Iterator last;
1036: difference_type stride;
1037: };
1039: PetscErrorCode MatShift_DenseHIP_Private(PetscScalar *da, PetscScalar alpha, PetscInt lda, PetscInt rstart, PetscInt rend, PetscInt cols)
1040: {
1041: PetscFunctionBegin;
1042: PetscInt rend2 = PetscMin(rend, cols);
1043: if (rend2 > rstart) {
1044: PetscCall(PetscLogGpuTimeBegin());
1045: try {
1046: const auto dptr = thrust::device_pointer_cast(da);
1047: size_t begin = rstart * lda;
1048: size_t end = rend2 - rstart + rend2 * lda;
1049: strided_range<thrust::device_vector<PetscScalar>::iterator> diagonal(dptr + begin, dptr + end, lda + 1);
1050: thrust::transform(diagonal.begin(), diagonal.end(), diagonal.begin(), petscshift(alpha));
1051: } catch (char *ex) {
1052: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1053: }
1054: PetscCall(PetscLogGpuTimeEnd());
1055: PetscCall(PetscLogGpuFlops(rend2 - rstart));
1056: }
1057: PetscFunctionReturn(PETSC_SUCCESS);
1058: }
1060: PetscErrorCode MatShift_SeqDenseHIP(Mat A, PetscScalar alpha)
1061: {
1062: PetscScalar *da;
1063: PetscInt m = A->rmap->n, n = A->cmap->n, lda;
1065: PetscFunctionBegin;
1066: PetscCall(MatDenseHIPGetArray(A, &da));
1067: PetscCall(MatDenseGetLDA(A, &lda));
1068: PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1069: PetscCall(MatShift_DenseHIP_Private(da, alpha, lda, 0, m, n));
1070: PetscCall(MatDenseHIPRestoreArray(A, &da));
1071: PetscFunctionReturn(PETSC_SUCCESS);
1072: }
1074: PetscErrorCode MatAXPY_SeqDenseHIP(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
1075: {
1076: Mat_SeqDense *x = (Mat_SeqDense *)X->data;
1077: Mat_SeqDense *y = (Mat_SeqDense *)Y->data;
1078: const PetscScalar *dx;
1079: PetscScalar *dy;
1080: PetscHipBLASInt j, N, m, ldax, lday, one = 1;
1081: hipblasHandle_t hipblasv2handle;
1083: PetscFunctionBegin;
1084: if (!X->rmap->n || !X->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1085: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
1086: PetscCall(MatDenseHIPGetArrayRead(X, &dx));
1087: if (alpha == 0.0) PetscCall(MatDenseHIPGetArrayWrite(Y, &dy));
1088: else PetscCall(MatDenseHIPGetArray(Y, &dy));
1089: PetscCall(PetscHipBLASIntCast(X->rmap->n * X->cmap->n, &N));
1090: PetscCall(PetscHipBLASIntCast(X->rmap->n, &m));
1091: PetscCall(PetscHipBLASIntCast(x->lda, &ldax));
1092: PetscCall(PetscHipBLASIntCast(y->lda, &lday));
1093: PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n));
1094: PetscCall(PetscLogGpuTimeBegin());
1095: if (ldax > m || lday > m) {
1096: for (j = 0; j < X->cmap->n; j++) PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, m, &alpha, dx + j * ldax, one, dy + j * lday, one));
1097: } else PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, N, &alpha, dx, one, dy, one));
1098: PetscCall(PetscLogGpuTimeEnd());
1099: PetscCall(PetscLogGpuFlops(PetscMax(2. * N - 1, 0)));
1100: PetscCall(MatDenseHIPRestoreArrayRead(X, &dx));
1101: if (alpha == 0.0) PetscCall(MatDenseHIPRestoreArrayWrite(Y, &dy));
1102: else PetscCall(MatDenseHIPRestoreArray(Y, &dy));
1103: PetscFunctionReturn(PETSC_SUCCESS);
1104: }
1106: static PetscErrorCode MatReset_SeqDenseHIP(Mat A)
1107: {
1108: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
1110: PetscFunctionBegin;
1111: if (dA) {
1112: PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseHIPResetArray() must be called first");
1113: if (!dA->user_alloc) PetscCallHIP(hipFree(dA->d_v));
1114: PetscCallHIP(hipFree(dA->d_fact_tau));
1115: PetscCallHIP(hipFree(dA->d_fact_ipiv));
1116: PetscCallHIP(hipFree(dA->d_fact_info));
1117: PetscCallHIP(hipFree(dA->d_fact_work));
1118: PetscCall(VecDestroy(&dA->workvec));
1119: }
1120: PetscCall(PetscFree(A->spptr));
1121: PetscFunctionReturn(PETSC_SUCCESS);
1122: }
1124: PetscErrorCode MatDestroy_SeqDenseHIP(Mat A)
1125: {
1126: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1128: PetscFunctionBegin;
1129: /* prevent to copy back data if we own the data pointer */
1130: if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1131: PetscCall(MatConvert_SeqDenseHIP_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1132: PetscCall(MatDestroy_SeqDense(A));
1133: PetscFunctionReturn(PETSC_SUCCESS);
1134: }
1136: PetscErrorCode MatDuplicate_SeqDenseHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
1137: {
1138: MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;
1140: PetscFunctionBegin;
1141: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1142: PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
1143: PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
1144: PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hcpvalues));
1145: if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) PetscCall(MatCopy_SeqDenseHIP(A, *B, SAME_NONZERO_PATTERN));
1146: if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
1147: Mat_SeqDenseHIP *dB = (Mat_SeqDenseHIP *)(*B)->spptr;
1148: if (!dB->d_v) PetscCall(MatSeqDenseHIPSetPreallocation(*B, NULL));
1149: }
1150: PetscFunctionReturn(PETSC_SUCCESS);
1151: }
1153: static PetscErrorCode MatGetColumnVector_SeqDenseHIP(Mat A, Vec v, PetscInt col)
1154: {
1155: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1156: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
1157: PetscScalar *x;
1158: PetscBool viship;
1160: PetscFunctionBegin;
1161: PetscCall(PetscObjectTypeCompareAny((PetscObject)v, &viship, VECSEQHIP, VECMPIHIP, VECHIP, ""));
1162: if (viship && !v->boundtocpu) { /* update device data */
1163: PetscCall(VecHIPGetArrayWrite(v, &x));
1164: if (A->offloadmask & PETSC_OFFLOAD_GPU) PetscCallHIP(hipMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyHostToHost));
1165: else PetscCallHIP(hipMemcpy(x, a->v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
1166: PetscCall(VecHIPRestoreArrayWrite(v, &x));
1167: } else { /* update host data */
1168: PetscCall(VecGetArrayWrite(v, &x));
1169: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) PetscCall(PetscArraycpy(x, a->v + col * a->lda, A->rmap->n));
1170: else if (A->offloadmask & PETSC_OFFLOAD_GPU) PetscCallHIP(hipMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1171: PetscCall(VecRestoreArrayWrite(v, &x));
1172: }
1173: PetscFunctionReturn(PETSC_SUCCESS);
1174: }
1176: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_hip(Mat A, MatFactorType ftype, Mat *fact)
1177: {
1178: PetscFunctionBegin;
1179: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), fact));
1180: PetscCall(MatSetSizes(*fact, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
1181: PetscCall(MatSetType(*fact, MATSEQDENSEHIP));
1182: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1183: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1184: (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1185: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1186: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1187: } else if (ftype == MAT_FACTOR_QR) {
1188: PetscCall(PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactor_C", MatQRFactor_SeqDense));
1189: PetscCall(PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1190: }
1191: (*fact)->factortype = ftype;
1192: PetscCall(PetscFree((*fact)->solvertype));
1193: PetscCall(PetscStrallocpy(MATSOLVERHIP, &(*fact)->solvertype));
1194: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_LU]));
1195: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ILU]));
1196: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]));
1197: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ICC]));
1198: PetscFunctionReturn(PETSC_SUCCESS);
1199: }
1201: static PetscErrorCode MatDenseGetColumnVec_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1202: {
1203: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1205: PetscFunctionBegin;
1206: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1207: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1208: PetscCall(MatDenseHIPGetArray(A, (PetscScalar **)&a->ptrinuse));
1209: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
1210: PetscCall(VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1211: }
1212: a->vecinuse = col + 1;
1213: PetscCall(VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1214: *v = a->cvec;
1215: PetscFunctionReturn(PETSC_SUCCESS);
1216: }
1218: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1219: {
1220: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1222: PetscFunctionBegin;
1223: PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1224: PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1225: a->vecinuse = 0;
1226: PetscCall(VecHIPResetArray(a->cvec));
1227: PetscCall(MatDenseHIPRestoreArray(A, (PetscScalar **)&a->ptrinuse));
1228: if (v) *v = NULL;
1229: PetscFunctionReturn(PETSC_SUCCESS);
1230: }
1232: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1233: {
1234: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1236: PetscFunctionBegin;
1237: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1238: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1239: PetscCall(MatDenseHIPGetArrayRead(A, &a->ptrinuse));
1240: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
1241: PetscCall(VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1242: }
1243: a->vecinuse = col + 1;
1244: PetscCall(VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1245: PetscCall(VecLockReadPush(a->cvec));
1246: *v = a->cvec;
1247: PetscFunctionReturn(PETSC_SUCCESS);
1248: }
1250: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1251: {
1252: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1254: PetscFunctionBegin;
1255: PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1256: PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1257: a->vecinuse = 0;
1258: PetscCall(VecLockReadPop(a->cvec));
1259: PetscCall(VecHIPResetArray(a->cvec));
1260: PetscCall(MatDenseHIPRestoreArrayRead(A, &a->ptrinuse));
1261: if (v) *v = NULL;
1262: PetscFunctionReturn(PETSC_SUCCESS);
1263: }
1265: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1266: {
1267: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1269: PetscFunctionBegin;
1270: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1271: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1272: PetscCall(MatDenseHIPGetArrayWrite(A, (PetscScalar **)&a->ptrinuse));
1273: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecHIPPlaceArray is called */
1274: PetscCall(VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1275: }
1276: a->vecinuse = col + 1;
1277: PetscCall(VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1278: *v = a->cvec;
1279: PetscFunctionReturn(PETSC_SUCCESS);
1280: }
1282: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1283: {
1284: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1286: PetscFunctionBegin;
1287: PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1288: PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1289: a->vecinuse = 0;
1290: PetscCall(VecHIPResetArray(a->cvec));
1291: PetscCall(MatDenseHIPRestoreArrayWrite(A, (PetscScalar **)&a->ptrinuse));
1292: if (v) *v = NULL;
1293: PetscFunctionReturn(PETSC_SUCCESS);
1294: }
1296: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseHIP(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
1297: {
1298: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1299: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
1301: PetscFunctionBegin;
1302: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1303: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1304: if (a->cmat && (cend - cbegin != a->cmat->cmap->N || rend - rbegin != a->cmat->rmap->N)) PetscCall(MatDestroy(&a->cmat));
1305: PetscCall(MatSeqDenseHIPCopyToGPU(A));
1306: if (!a->cmat) {
1307: PetscCall(MatCreateDenseHIP(PetscObjectComm((PetscObject)A), rend - rbegin, PETSC_DECIDE, rend - rbegin, cend - cbegin, dA->d_v + rbegin + (size_t)cbegin * a->lda, &a->cmat));
1308: } else PetscCall(MatDenseHIPPlaceArray(a->cmat, dA->d_v + rbegin + (size_t)cbegin * a->lda));
1309: PetscCall(MatDenseSetLDA(a->cmat, a->lda));
1310: /* Place CPU array if present but not copy any data */
1311: a->cmat->offloadmask = PETSC_OFFLOAD_GPU;
1312: if (a->v) { PetscCall(MatDensePlaceArray(a->cmat, a->v + rbegin + (size_t)cbegin * a->lda)); }
1313: a->cmat->offloadmask = A->offloadmask;
1314: a->matinuse = cbegin + 1;
1315: *v = a->cmat;
1316: PetscFunctionReturn(PETSC_SUCCESS);
1317: }
1319: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseHIP(Mat A, Mat *v)
1320: {
1321: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1322: PetscBool copy = PETSC_FALSE, reset;
1323: PetscOffloadMask suboff;
1325: PetscFunctionBegin;
1326: PetscCheck(a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1327: PetscCheck(a->cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1328: PetscCheck(*v == a->cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1329: a->matinuse = 0;
1330: reset = a->v ? PETSC_TRUE : PETSC_FALSE;
1331: suboff = a->cmat->offloadmask; /* calls to ResetArray may change it, so save it here */
1332: if (suboff == PETSC_OFFLOAD_CPU && !a->v) {
1333: copy = PETSC_TRUE;
1334: PetscCall(MatSeqDenseSetPreallocation(A, NULL));
1335: }
1336: PetscCall(MatDenseHIPResetArray(a->cmat));
1337: if (reset) PetscCall(MatDenseResetArray(a->cmat));
1338: if (copy) {
1339: PetscCall(MatSeqDenseHIPCopyFromGPU(A));
1340: } else A->offloadmask = (suboff == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1341: a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1342: if (v) *v = NULL;
1343: PetscFunctionReturn(PETSC_SUCCESS);
1344: }
1346: static PetscErrorCode MatDenseSetLDA_SeqDenseHIP(Mat A, PetscInt lda)
1347: {
1348: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
1349: Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
1350: PetscBool data;
1352: PetscFunctionBegin;
1353: data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1354: PetscCheck(dA->user_alloc || data || cA->lda == lda, PETSC_COMM_SELF, PETSC_ERR_ORDER, "LDA cannot be changed after allocation of internal storage");
1355: PetscCheck(lda >= A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "LDA %" PetscInt_FMT " must be at least matrix dimension %" PetscInt_FMT, lda, A->rmap->n);
1356: cA->lda = lda;
1357: PetscFunctionReturn(PETSC_SUCCESS);
1358: }
1360: static PetscErrorCode MatSetUp_SeqDenseHIP(Mat A)
1361: {
1362: PetscFunctionBegin;
1363: PetscCall(PetscLayoutSetUp(A->rmap));
1364: PetscCall(PetscLayoutSetUp(A->cmap));
1365: if (!A->preallocated) PetscCall(MatSeqDenseHIPSetPreallocation(A, NULL));
1366: PetscFunctionReturn(PETSC_SUCCESS);
1367: }
1369: static PetscErrorCode MatBindToCPU_SeqDenseHIP(Mat A, PetscBool flg)
1370: {
1371: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1373: PetscFunctionBegin;
1374: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1375: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1376: A->boundtocpu = flg;
1377: if (!flg) {
1378: PetscBool iship;
1380: PetscCall(PetscObjectTypeCompare((PetscObject)a->cvec, VECSEQHIP, &iship));
1381: if (!iship) PetscCall(VecDestroy(&a->cvec));
1382: PetscCall(PetscObjectTypeCompare((PetscObject)a->cmat, MATSEQDENSEHIP, &iship));
1383: if (!iship) PetscCall(MatDestroy(&a->cmat));
1384: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDenseHIP));
1385: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseHIP));
1386: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_SeqDenseHIP));
1387: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDenseHIP));
1388: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDenseHIP));
1389: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDenseHIP));
1390: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDenseHIP));
1391: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDenseHIP));
1392: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDenseHIP));
1393: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDenseHIP));
1394: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDenseHIP));
1395: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDenseHIP));
1396: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDenseHIP));
1398: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayAndMemType_C", MatDenseGetArrayAndMemType_SeqDenseHIP));
1399: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayAndMemType_C", MatDenseRestoreArrayAndMemType_SeqDenseHIP));
1400: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayReadAndMemType_C", MatDenseGetArrayReadAndMemType_SeqDenseHIP));
1401: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayReadAndMemType_C", MatDenseRestoreArrayReadAndMemType_SeqDenseHIP));
1402: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWriteAndMemType_C", MatDenseGetArrayWriteAndMemType_SeqDenseHIP));
1403: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayWriteAndMemType_C", MatDenseRestoreArrayWriteAndMemType_SeqDenseHIP));
1405: A->ops->duplicate = MatDuplicate_SeqDenseHIP;
1406: A->ops->mult = MatMult_SeqDenseHIP;
1407: A->ops->multadd = MatMultAdd_SeqDenseHIP;
1408: A->ops->multtranspose = MatMultTranspose_SeqDenseHIP;
1409: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseHIP;
1410: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP;
1411: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseHIP_SeqDenseHIP;
1412: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseHIP_SeqDenseHIP;
1413: A->ops->axpy = MatAXPY_SeqDenseHIP;
1414: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseHIP;
1415: A->ops->lufactor = MatLUFactor_SeqDenseHIP;
1416: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseHIP;
1417: A->ops->getcolumnvector = MatGetColumnVector_SeqDenseHIP;
1418: A->ops->scale = MatScale_SeqDenseHIP;
1419: A->ops->shift = MatShift_SeqDenseHIP;
1420: A->ops->copy = MatCopy_SeqDenseHIP;
1421: A->ops->zeroentries = MatZeroEntries_SeqDenseHIP;
1422: A->ops->setup = MatSetUp_SeqDenseHIP;
1423: } else {
1424: /* make sure we have an up-to-date copy on the CPU */
1425: PetscCall(MatSeqDenseHIPCopyFromGPU(A));
1426: PetscCall(PetscFree(A->defaultrandtype));
1427: PetscCall(PetscStrallocpy(PETSCRANDER48, &A->defaultrandtype));
1428: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDense));
1429: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense));
1430: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense));
1431: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense));
1432: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense));
1433: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense));
1434: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense));
1435: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense));
1436: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense));
1437: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense));
1438: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense));
1439: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1440: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDense));
1442: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayAndMemType_C", NULL));
1443: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayAndMemType_C", NULL));
1444: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayReadAndMemType_C", NULL));
1445: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayReadAndMemType_C", NULL));
1446: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWriteAndMemType_C", NULL));
1447: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayWriteAndMemType_C", NULL));
1449: A->ops->duplicate = MatDuplicate_SeqDense;
1450: A->ops->mult = MatMult_SeqDense;
1451: A->ops->multadd = MatMultAdd_SeqDense;
1452: A->ops->multtranspose = MatMultTranspose_SeqDense;
1453: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
1454: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1455: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
1456: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1457: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1458: A->ops->axpy = MatAXPY_SeqDense;
1459: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
1460: A->ops->lufactor = MatLUFactor_SeqDense;
1461: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1462: A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
1463: A->ops->scale = MatScale_SeqDense;
1464: A->ops->shift = MatShift_SeqDense;
1465: A->ops->copy = MatCopy_SeqDense;
1466: A->ops->zeroentries = MatZeroEntries_SeqDense;
1467: A->ops->setup = MatSetUp_SeqDense;
1468: A->ops->setrandom = MatSetRandom_SeqDense;
1469: }
1470: if (a->cmat) PetscCall(MatBindToCPU(a->cmat, flg));
1471: PetscFunctionReturn(PETSC_SUCCESS);
1472: }
1474: PetscErrorCode MatConvert_SeqDenseHIP_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1475: {
1476: Mat B;
1477: Mat_SeqDense *a;
1479: PetscFunctionBegin;
1480: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1481: /* TODO these cases should be optimized */
1482: PetscCall(MatConvert_Basic(M, type, reuse, newmat));
1483: PetscFunctionReturn(PETSC_SUCCESS);
1484: }
1486: B = *newmat;
1487: PetscCall(MatBindToCPU_SeqDenseHIP(B, PETSC_TRUE));
1488: PetscCall(MatReset_SeqDenseHIP(B));
1489: PetscCall(PetscFree(B->defaultvectype));
1490: PetscCall(PetscStrallocpy(VECSTANDARD, &B->defaultvectype));
1491: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSE));
1492: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensehip_seqdense_C", NULL));
1493: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArray_C", NULL));
1494: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayRead_C", NULL));
1495: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayWrite_C", NULL));
1496: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArray_C", NULL));
1497: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayRead_C", NULL));
1498: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayWrite_C", NULL));
1499: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPPlaceArray_C", NULL));
1500: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPResetArray_C", NULL));
1501: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPReplaceArray_C", NULL));
1502: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensehip_C", NULL));
1503: a = (Mat_SeqDense *)B->data;
1504: PetscCall(VecDestroy(&a->cvec)); /* cvec might be VECSEQHIP. Destroy it and rebuild a VECSEQ when needed */
1505: B->ops->bindtocpu = NULL;
1506: B->ops->destroy = MatDestroy_SeqDense;
1507: B->offloadmask = PETSC_OFFLOAD_CPU;
1508: PetscFunctionReturn(PETSC_SUCCESS);
1509: }
1511: PetscErrorCode MatConvert_SeqDense_SeqDenseHIP(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1512: {
1513: Mat_SeqDenseHIP *dB;
1514: Mat_SeqDense *a;
1515: Mat B;
1517: PetscFunctionBegin;
1518: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
1519: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1520: /* TODO these cases should be optimized */
1521: PetscCall(MatConvert_Basic(M, type, reuse, newmat));
1522: PetscFunctionReturn(PETSC_SUCCESS);
1523: }
1525: B = *newmat;
1526: PetscCall(PetscFree(B->defaultvectype));
1527: PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
1528: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSEHIP));
1529: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensehip_seqdense_C", MatConvert_SeqDenseHIP_SeqDense));
1530: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArray_C", MatDenseHIPGetArray_SeqDenseHIP));
1531: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayRead_C", MatDenseHIPGetArrayRead_SeqDenseHIP));
1532: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayWrite_C", MatDenseHIPGetArrayWrite_SeqDenseHIP));
1533: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArray_C", MatDenseHIPRestoreArray_SeqDenseHIP));
1534: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayRead_C", MatDenseHIPRestoreArrayRead_SeqDenseHIP));
1535: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayWrite_C", MatDenseHIPRestoreArrayWrite_SeqDenseHIP));
1536: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPPlaceArray_C", MatDenseHIPPlaceArray_SeqDenseHIP));
1537: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPResetArray_C", MatDenseHIPResetArray_SeqDenseHIP));
1538: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPReplaceArray_C", MatDenseHIPReplaceArray_SeqDenseHIP));
1539: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensehip_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
1540: a = (Mat_SeqDense *)B->data;
1541: PetscCall(VecDestroy(&a->cvec)); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQHIP when needed */
1542: PetscCall(PetscNew(&dB));
1543: B->spptr = dB;
1544: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1545: PetscCall(MatBindToCPU_SeqDenseHIP(B, PETSC_FALSE));
1546: B->ops->bindtocpu = MatBindToCPU_SeqDenseHIP;
1547: B->ops->destroy = MatDestroy_SeqDenseHIP;
1548: PetscFunctionReturn(PETSC_SUCCESS);
1549: }
1551: /*@C
1552: MatCreateSeqDenseHIP - Creates a sequential matrix in dense format using HIP.
1554: Collective
1556: Input Parameters:
1557: + comm - MPI communicator
1558: . m - number of rows
1559: . n - number of columns
1560: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1561: to control matrix memory allocation.
1563: Output Parameter:
1564: . A - the matrix
1566: Notes:
1568: Level: intermediate
1570: .seealso: `MATSEQDENSE`, `MatCreate()`, `MatCreateSeqDense()`
1571: @*/
1572: PetscErrorCode MatCreateSeqDenseHIP(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A)
1573: {
1574: PetscMPIInt size;
1576: PetscFunctionBegin;
1577: PetscCallMPI(MPI_Comm_size(comm, &size));
1578: PetscCheck(size <= 1, comm, PETSC_ERR_ARG_WRONG, "Invalid communicator size %d", size);
1579: PetscCall(MatCreate(comm, A));
1580: PetscCall(MatSetSizes(*A, m, n, m, n));
1581: PetscCall(MatSetType(*A, MATSEQDENSEHIP));
1582: PetscCall(MatSeqDenseHIPSetPreallocation(*A, data));
1583: PetscFunctionReturn(PETSC_SUCCESS);
1584: }
1586: /*MC
1587: MATSEQDENSEHIP - MATSEQDENSEHIP = "seqdensehip" - A matrix type to be used for sequential dense matrices on GPUs.
1589: Options Database Keys:
1590: . -mat_type seqdensehip - sets the matrix type to `MATSEQDENSEHIP` during a call to `MatSetFromOptions()`
1592: Level: beginner
1594: .seealso: `MATSEQDENSE`
1595: M*/
1596: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseHIP(Mat B)
1597: {
1598: PetscFunctionBegin;
1599: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
1600: PetscCall(MatCreate_SeqDense(B));
1601: PetscCall(MatConvert_SeqDense_SeqDenseHIP(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
1602: PetscFunctionReturn(PETSC_SUCCESS);
1603: }