Actual source code: densehip.hip.cpp

  1: /*
  2:      Defines the matrix operations for sequential dense with HIP
  3:      Portions of this code are under:
  4:      Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  5: */
  6: #include <petscpkg_version.h>
  7: #include <../src/mat/impls/dense/seq/dense.h>
  8: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
  9: #include <petsc/private/petsclegacycupmblas.h>
 10: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
 11:   #include <hipsolver/hipsolver.h>
 12: #else
 13:   #include <hipsolver.h>
 14: #endif
 15: #include <thrust/device_ptr.h>
 16: #include <thrust/functional.h>
 17: #include <thrust/iterator/counting_iterator.h>
 18: #include <thrust/iterator/transform_iterator.h>
 19: #include <thrust/iterator/permutation_iterator.h>
 20: #include <thrust/transform.h>
 21: #include <thrust/device_vector.h>

 23: using VecSeq_HIP = Petsc::vec::cupm::impl::VecSeq_CUPM<Petsc::device::cupm::DeviceType::HIP>;

 25: typedef struct {
 26:   PetscScalar *d_v; /* pointer to the matrix on the GPU */
 27:   PetscBool    user_alloc;
  28:   PetscScalar *unplacedarray; /* if one called MatDenseHIPPlaceArray(), this is where it stashed the original */
 29:   PetscBool    unplaced_user_alloc;
 30:   /* factorization support */
 31:   PetscHipBLASInt *d_fact_ipiv; /* device pivots */
 32:   PetscScalar     *d_fact_tau;  /* device QR tau vector */
 33:   PetscScalar     *d_fact_work; /* device workspace */
 34:   PetscHipBLASInt  fact_lwork;
 35:   PetscHipBLASInt *d_fact_info; /* device info */
 36:   /* workspace */
 37:   Vec workvec;
 38: } Mat_SeqDenseHIP;
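/*
   Editorial note (not part of the PETSc source): d_v mirrors the host storage of Mat_SeqDense,
   i.e. column-major with leading dimension lda >= number of rows, so entry (i,j) of an m x n
   matrix lives at

     d_v[i + (size_t)j * lda]       // 0 <= i < m, 0 <= j < n

   When lda == m the buffer is contiguous; when lda > m each column carries lda - m padding
   entries, which is why the copy routines below fall back to hipMemcpy2D.
*/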

 40: PetscErrorCode MatSeqDenseHIPSetPreallocation(Mat A, PetscScalar *d_data)
 41: {
 42:   Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
 43:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
 44:   PetscBool        iship;

 46:   PetscFunctionBegin;
 47:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &iship));
 48:   if (!iship) PetscFunctionReturn(PETSC_SUCCESS);
  49:   /* it may happen that CPU preallocation has not been performed */
 50:   PetscCall(PetscLayoutSetUp(A->rmap));
 51:   PetscCall(PetscLayoutSetUp(A->cmap));
 52:   if (cA->lda <= 0) cA->lda = A->rmap->n;
 53:   if (!dA->user_alloc) PetscCallHIP(hipFree(dA->d_v));
 54:   if (!d_data) { /* petsc-allocated storage */
 55:     size_t sz;
 56:     PetscCall(PetscIntMultError(cA->lda, A->cmap->n, NULL));
 57:     sz = cA->lda * A->cmap->n * sizeof(PetscScalar);
 58:     PetscCallHIP(hipMalloc((void **)&dA->d_v, sz));
 59:     PetscCallHIP(hipMemset(dA->d_v, 0, sz));
 60:     dA->user_alloc = PETSC_FALSE;
 61:   } else { /* user-allocated storage */
 62:     dA->d_v        = d_data;
 63:     dA->user_alloc = PETSC_TRUE;
 64:   }
 65:   A->offloadmask  = PETSC_OFFLOAD_GPU;
 66:   A->preallocated = PETSC_TRUE;
 67:   A->assembled    = PETSC_TRUE;
 68:   PetscFunctionReturn(PETSC_SUCCESS);
 69: }
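/*
   A minimal usage sketch for the routine above (editorial, not from the PETSc source), assuming
   MatCreateSeqDenseHIP() is the HIP analogue of MatCreateSeqDense() and that m, n are previously
   set PetscInt sizes:

     Mat          A;
     PetscScalar *d_data;
     PetscCall(MatCreateSeqDenseHIP(PETSC_COMM_SELF, m, n, NULL, &A));
     PetscCallHIP(hipMalloc((void **)&d_data, (size_t)m * n * sizeof(PetscScalar)));
     PetscCall(MatSeqDenseHIPSetPreallocation(A, d_data)); // adopt the caller's device buffer (user_alloc = PETSC_TRUE)

   Passing d_data == NULL instead makes the routine hipMalloc and zero an lda * n buffer itself.
*/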

 71: PetscErrorCode MatSeqDenseHIPCopyFromGPU(Mat A)
 72: {
 73:   Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
 74:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

 76:   PetscFunctionBegin;
 77:   PetscCheckTypeName(A, MATSEQDENSEHIP);
 78:   PetscCall(PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing", A->rmap->n, A->cmap->n));
 79:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
 80:     if (!cA->v) { /* MatCreateSeqDenseHIP may not allocate CPU memory. Allocate if needed */
 81:       PetscCall(MatSeqDenseSetPreallocation(A, NULL));
 82:     }
 83:     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, A, 0, 0, 0));
 84:     if (cA->lda > A->rmap->n) {
 85:       PetscCallHIP(hipMemcpy2D(cA->v, cA->lda * sizeof(PetscScalar), dA->d_v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, hipMemcpyDeviceToHost));
 86:     } else {
 87:       PetscCallHIP(hipMemcpy(cA->v, dA->d_v, cA->lda * sizeof(PetscScalar) * A->cmap->n, hipMemcpyDeviceToHost));
 88:     }
 89:     PetscCall(PetscLogGpuToCpu(cA->lda * sizeof(PetscScalar) * A->cmap->n));
 90:     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, A, 0, 0, 0));

 92:     A->offloadmask = PETSC_OFFLOAD_BOTH;
 93:   }
 94:   PetscFunctionReturn(PETSC_SUCCESS);
 95: }

 97: PetscErrorCode MatSeqDenseHIPCopyToGPU(Mat A)
 98: {
 99:   Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
100:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
101:   PetscBool        copy;

103:   PetscFunctionBegin;
104:   PetscCheckTypeName(A, MATSEQDENSEHIP);
105:   if (A->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
106:   copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
107:   PetscCall(PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", A->rmap->n, A->cmap->n));
108:   if (copy) {
109:     if (!dA->d_v) { /* Allocate GPU memory if not present */
110:       PetscCall(MatSeqDenseHIPSetPreallocation(A, NULL));
111:     }
112:     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, A, 0, 0, 0));
113:     if (cA->lda > A->rmap->n) {
114:       PetscCallHIP(hipMemcpy2D(dA->d_v, cA->lda * sizeof(PetscScalar), cA->v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, hipMemcpyHostToDevice));
115:     } else {
116:       PetscCallHIP(hipMemcpy(dA->d_v, cA->v, cA->lda * sizeof(PetscScalar) * A->cmap->n, hipMemcpyHostToDevice));
117:     }
118:     PetscCall(PetscLogCpuToGpu(cA->lda * sizeof(PetscScalar) * A->cmap->n));
119:     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, A, 0, 0, 0));

121:     A->offloadmask = PETSC_OFFLOAD_BOTH;
122:   }
123:   PetscFunctionReturn(PETSC_SUCCESS);
124: }
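/*
   Editorial note on the lda > m branches above (not part of the PETSc source): when the leading
   dimension exceeds the row count the matrix is not contiguous in memory, so hipMemcpy2D is used
   to transfer n slices of m * sizeof(PetscScalar) bytes, each separated by a pitch of
   lda * sizeof(PetscScalar) bytes. For example, with m = 3, n = 2, lda = 5 only buffer entries
   {0,1,2} and {5,6,7} are copied, while the padding entries {3,4} and {8,9} are skipped.
*/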

126: static PetscErrorCode MatCopy_SeqDenseHIP(Mat A, Mat B, MatStructure str)
127: {
128:   const PetscScalar *va;
129:   PetscScalar       *vb;
130:   PetscInt           lda1, lda2, m = A->rmap->n, n = A->cmap->n;

132:   PetscFunctionBegin;
133:   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
134:   if (A->ops->copy != B->ops->copy) {
135:     PetscCall(MatCopy_Basic(A, B, str));
136:     PetscFunctionReturn(PETSC_SUCCESS);
137:   }
138:   PetscCheck(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
139:   PetscCall(MatDenseHIPGetArrayRead(A, &va));
140:   PetscCall(MatDenseHIPGetArrayWrite(B, &vb));
141:   PetscCall(MatDenseGetLDA(A, &lda1));
142:   PetscCall(MatDenseGetLDA(B, &lda2));
143:   PetscCall(PetscLogGpuTimeBegin());
144:   if (lda1 > m || lda2 > m) {
145:     PetscCallHIP(hipMemcpy2D(vb, lda2 * sizeof(PetscScalar), va, lda1 * sizeof(PetscScalar), m * sizeof(PetscScalar), n, hipMemcpyDeviceToDevice));
146:   } else {
147:     PetscCallHIP(hipMemcpy(vb, va, m * (n * sizeof(PetscScalar)), hipMemcpyDeviceToDevice));
148:   }
149:   PetscCall(PetscLogGpuTimeEnd());
150:   PetscCall(MatDenseHIPRestoreArrayWrite(B, &vb));
151:   PetscCall(MatDenseHIPRestoreArrayRead(A, &va));
152:   PetscFunctionReturn(PETSC_SUCCESS);
153: }

155: static PetscErrorCode MatZeroEntries_SeqDenseHIP(Mat A)
156: {
157:   PetscScalar *va;
158:   PetscInt     lda, m = A->rmap->n, n = A->cmap->n;

160:   PetscFunctionBegin;
161:   PetscCall(MatDenseHIPGetArrayWrite(A, &va));
162:   PetscCall(MatDenseGetLDA(A, &lda));
163:   PetscCall(PetscLogGpuTimeBegin());
164:   if (lda > m) {
165:     PetscCallHIP(hipMemset2D(va, lda * sizeof(PetscScalar), 0, m * sizeof(PetscScalar), n));
166:   } else {
167:     PetscCallHIP(hipMemset(va, 0, m * (n * sizeof(PetscScalar))));
168:   }
169:   PetscCallHIP(WaitForHIP());
170:   PetscCall(PetscLogGpuTimeEnd());
171:   PetscCall(MatDenseHIPRestoreArrayWrite(A, &va));
172:   PetscFunctionReturn(PETSC_SUCCESS);
173: }

175: static PetscErrorCode MatDenseHIPPlaceArray_SeqDenseHIP(Mat A, const PetscScalar *a)
176: {
177:   Mat_SeqDense    *aa = (Mat_SeqDense *)A->data;
178:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

180:   PetscFunctionBegin;
181:   PetscCheck(!aa->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
182:   PetscCheck(!aa->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
183:   PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseHIPResetArray() must be called first");
184:   if (aa->v) PetscCall(MatSeqDenseHIPCopyToGPU(A));
185:   dA->unplacedarray       = dA->d_v;
186:   dA->unplaced_user_alloc = dA->user_alloc;
187:   dA->d_v                 = (PetscScalar *)a;
188:   dA->user_alloc          = PETSC_TRUE;
189:   PetscFunctionReturn(PETSC_SUCCESS);
190: }

192: static PetscErrorCode MatDenseHIPResetArray_SeqDenseHIP(Mat A)
193: {
194:   Mat_SeqDense    *a  = (Mat_SeqDense *)A->data;
195:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

197:   PetscFunctionBegin;
198:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
199:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
200:   if (a->v) PetscCall(MatSeqDenseHIPCopyToGPU(A));
201:   dA->d_v           = dA->unplacedarray;
202:   dA->user_alloc    = dA->unplaced_user_alloc;
203:   dA->unplacedarray = NULL;
204:   PetscFunctionReturn(PETSC_SUCCESS);
205: }

207: static PetscErrorCode MatDenseHIPReplaceArray_SeqDenseHIP(Mat A, const PetscScalar *a)
208: {
209:   Mat_SeqDense    *aa = (Mat_SeqDense *)A->data;
210:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

212:   PetscFunctionBegin;
213:   PetscCheck(!aa->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
214:   PetscCheck(!aa->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
215:   PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseHIPResetArray() must be called first");
216:   if (!dA->user_alloc) PetscCallHIP(hipFree(dA->d_v));
217:   dA->d_v        = (PetscScalar *)a;
218:   dA->user_alloc = PETSC_FALSE;
219:   PetscFunctionReturn(PETSC_SUCCESS);
220: }
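/*
   A minimal usage sketch for the three array hooks above (editorial, not from the PETSc source),
   assuming the public wrappers MatDenseHIPPlaceArray()/MatDenseHIPResetArray()/
   MatDenseHIPReplaceArray() dispatch here and that d_tmp is a device buffer of at least
   lda * n scalars:

     PetscCall(MatDenseHIPPlaceArray(A, d_tmp));   // stash A's device array and use d_tmp instead
     // ... operate on A ...
     PetscCall(MatDenseHIPResetArray(A));          // restore the stashed array

   MatDenseHIPReplaceArray(A, d_new) instead frees any PETSc-owned buffer and adopts d_new
   permanently; since it sets user_alloc = PETSC_FALSE, A takes ownership and will hipFree d_new
   when it is destroyed.
*/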

222: static PetscErrorCode MatDenseHIPGetArrayWrite_SeqDenseHIP(Mat A, PetscScalar **a)
223: {
224:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

226:   PetscFunctionBegin;
227:   if (!dA->d_v) PetscCall(MatSeqDenseHIPSetPreallocation(A, NULL));
228:   *a = dA->d_v;
229:   PetscFunctionReturn(PETSC_SUCCESS);
230: }

232: static PetscErrorCode MatDenseHIPRestoreArrayWrite_SeqDenseHIP(Mat A, PetscScalar **a)
233: {
234:   PetscFunctionBegin;
235:   if (a) *a = NULL;
236:   PetscFunctionReturn(PETSC_SUCCESS);
237: }

239: static PetscErrorCode MatDenseHIPGetArrayRead_SeqDenseHIP(Mat A, const PetscScalar **a)
240: {
241:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

243:   PetscFunctionBegin;
244:   PetscCall(MatSeqDenseHIPCopyToGPU(A));
245:   *a = dA->d_v;
246:   PetscFunctionReturn(PETSC_SUCCESS);
247: }

249: static PetscErrorCode MatDenseHIPRestoreArrayRead_SeqDenseHIP(Mat A, const PetscScalar **a)
250: {
251:   PetscFunctionBegin;
252:   if (a) *a = NULL;
253:   PetscFunctionReturn(PETSC_SUCCESS);
254: }

256: static PetscErrorCode MatDenseHIPGetArray_SeqDenseHIP(Mat A, PetscScalar **a)
257: {
258:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

260:   PetscFunctionBegin;
261:   PetscCall(MatSeqDenseHIPCopyToGPU(A));
262:   *a = dA->d_v;
263:   PetscFunctionReturn(PETSC_SUCCESS);
264: }

266: static PetscErrorCode MatDenseHIPRestoreArray_SeqDenseHIP(Mat A, PetscScalar **a)
267: {
268:   PetscFunctionBegin;
269:   if (a) *a = NULL;
270:   PetscFunctionReturn(PETSC_SUCCESS);
271: }

273: PETSC_EXTERN PetscErrorCode MatSeqDenseHIPInvertFactors_Private(Mat A)
274: {
275:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
276:   Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
277:   PetscScalar      *da;
278:   hipsolverHandle_t handle;
279:   PetscHipBLASInt   n, lda;
280: #if defined(PETSC_USE_DEBUG)
281:   PetscHipBLASInt info;
282: #endif

284:   PetscFunctionBegin;
285:   if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
286:   PetscCall(PetscHIPSOLVERGetHandle(&handle));
287:   PetscCall(PetscHipBLASIntCast(A->cmap->n, &n));
288:   PetscCall(PetscHipBLASIntCast(a->lda, &lda));
289:   PetscCheck(A->factortype != MAT_FACTOR_LU, PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDngetri not implemented");
290:   if (A->factortype == MAT_FACTOR_CHOLESKY) {
291:     if (!dA->d_fact_ipiv) { /* spd */
292:       PetscHipBLASInt il;

294:       PetscCall(MatDenseHIPGetArray(A, &da));
295:       PetscCallHIPSOLVER(hipsolverDnXpotri_bufferSize(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, &il));
296:       if (il > dA->fact_lwork) {
297:         dA->fact_lwork = il;

299:         PetscCallHIP(hipFree(dA->d_fact_work));
300:         PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
301:       }
302:       PetscCall(PetscLogGpuTimeBegin());
303:       PetscCallHIPSOLVER(hipsolverDnXpotri(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
304:       PetscCall(PetscLogGpuTimeEnd());
305:       PetscCall(MatDenseHIPRestoreArray(A, &da));
306:       /* TODO (write hip kernel) */
307:       PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
308:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDnsytri not implemented");
309:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Not implemented");
310: #if defined(PETSC_USE_DEBUG)
311:   PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
312:   PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: leading minor of order %d is zero", info);
313:   PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
314: #endif
315:   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
316:   A->ops->solve          = NULL;
317:   A->ops->solvetranspose = NULL;
318:   A->ops->matsolve       = NULL;
319:   A->factortype          = MAT_FACTOR_NONE;

321:   PetscCall(PetscFree(A->solvertype));
322:   PetscFunctionReturn(PETSC_SUCCESS);
323: }

325: static PetscErrorCode MatSolve_SeqDenseHIP_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscBool))
326: {
327:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
328:   PetscScalar     *y;
329:   PetscHipBLASInt  m = 0, k = 0;
330:   PetscBool        xiship, yiship, aiship;

332:   PetscFunctionBegin;
333:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
334:   PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
335:   PetscCall(PetscHipBLASIntCast(A->cmap->n, &k));
336:   PetscCall(PetscObjectTypeCompare((PetscObject)xx, VECSEQHIP, &xiship));
337:   PetscCall(PetscObjectTypeCompare((PetscObject)yy, VECSEQHIP, &yiship));
338:   {
339:     const PetscScalar *x;
340:     PetscBool          xishost = PETSC_TRUE;

 342:     /* The logic here is to minimize the amount of memory copying:
 343:        if we called VecHIPGetArrayRead(xx, &x) whenever xiship is true but the
 344:        data has not been offloaded to the GPU yet, the data would first be copied to the
 345:        GPU.  However, we only need the data in order to copy it into the y
 346:        array, so we take x from wherever the data currently resides and
 347:        only one memcpy is performed */
348:     if (xiship && xx->offloadmask & PETSC_OFFLOAD_GPU) {
349:       PetscCall(VecHIPGetArrayRead(xx, &x));
350:       xishost = PETSC_FALSE;
351:     } else PetscCall(VecGetArrayRead(xx, &x));
352:     if (k < m || !yiship) {
353:       if (!dA->workvec) PetscCall(VecCreateSeqHIP(PetscObjectComm((PetscObject)A), m, &(dA->workvec)));
354:       PetscCall(VecHIPGetArrayWrite(dA->workvec, &y));
355:     } else PetscCall(VecHIPGetArrayWrite(yy, &y));
356:     PetscCallHIP(hipMemcpy(y, x, m * sizeof(PetscScalar), xishost ? hipMemcpyHostToDevice : hipMemcpyDeviceToDevice));
357:   }
358:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &aiship));
359:   if (!aiship) PetscCall(MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A));
360:   PetscCall((*matsolve)(A, y, m, m, 1, k, transpose));
361:   if (!aiship) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
362:   if (k < m || !yiship) {
363:     PetscScalar *yv;

365:     /* The logic here is that the data is not yet in either yy's GPU array or its
366:        CPU array.  There is nothing in the interface to say where the user would like
367:        it to end up.  So we choose the GPU, because it is the faster option */
368:     if (yiship) PetscCall(VecHIPGetArrayWrite(yy, &yv));
369:     else PetscCall(VecGetArray(yy, &yv));
370:     PetscCallHIP(hipMemcpy(yv, y, k * sizeof(PetscScalar), yiship ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
371:     if (yiship) PetscCall(VecHIPRestoreArrayWrite(yy, &yv));
372:     else PetscCall(VecRestoreArray(yy, &yv));
373:     PetscCall(VecHIPRestoreArrayWrite(dA->workvec, &y));
374:   } else PetscCall(VecHIPRestoreArrayWrite(yy, &y));
375:   PetscFunctionReturn(PETSC_SUCCESS);
376: }

378: static PetscErrorCode MatMatSolve_SeqDenseHIP_Internal(Mat A, Mat B, Mat X, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscHipBLASInt, PetscBool))
379: {
380:   PetscScalar    *y;
381:   PetscInt        n, _ldb, _ldx;
382:   PetscBool       biship, xiship, aiship;
383:   PetscHipBLASInt nrhs = 0, m = 0, k = 0, ldb = 0, ldx = 0, ldy = 0;

385:   PetscFunctionBegin;
386:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
387:   PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
388:   PetscCall(PetscHipBLASIntCast(A->cmap->n, &k));
389:   PetscCall(MatGetSize(B, NULL, &n));
390:   PetscCall(PetscHipBLASIntCast(n, &nrhs));
391:   PetscCall(MatDenseGetLDA(B, &_ldb));
392:   PetscCall(PetscHipBLASIntCast(_ldb, &ldb));
393:   PetscCall(MatDenseGetLDA(X, &_ldx));
394:   PetscCall(PetscHipBLASIntCast(_ldx, &ldx));
395:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
396:   PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSEHIP, &xiship));
 397:   /* The logic here is to minimize the amount of memory copying:
 398:      if we called MatDenseHIPGetArrayRead(B, &b) whenever biship is true but the
 399:      data has not been offloaded to the GPU yet, the data would first be copied to the
 400:      GPU.  However, we only need the data in order to copy it into the y
 401:      array, so we take b from wherever the data currently resides and
 402:      only one memcpy is performed */
403:   const PetscScalar *b;
404:   /* some copying from B will be involved */
405:   PetscBool bishost = PETSC_TRUE;
406:   if (biship && B->offloadmask & PETSC_OFFLOAD_GPU) {
407:     PetscCall(MatDenseHIPGetArrayRead(B, &b));
408:     bishost = PETSC_FALSE;
409:   } else PetscCall(MatDenseGetArrayRead(B, &b));
410:   if (ldx < m || !xiship) {
411:     /* X's array cannot serve as the array (too small or not on device), B's
412:      * array cannot serve as the array (const), so allocate a new array  */
413:     ldy = m;
414:     PetscCallHIP(hipMalloc((void **)&y, nrhs * m * sizeof(PetscScalar)));
415:   } else {
416:     /* X's array should serve as the array */
417:     ldy = ldx;
418:     PetscCall(MatDenseHIPGetArrayWrite(X, &y));
419:   }
420:   PetscCallHIP(hipMemcpy2D(y, ldy * sizeof(PetscScalar), b, ldb * sizeof(PetscScalar), m * sizeof(PetscScalar), nrhs, bishost ? hipMemcpyHostToDevice : hipMemcpyDeviceToDevice));
421:   if (bishost) PetscCall(MatDenseRestoreArrayRead(B, &b));
422:   else PetscCall(MatDenseHIPRestoreArrayRead(B, &b));

424:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &aiship));
425:   if (!aiship) PetscCall(MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A));
426:   PetscCall((*matsolve)(A, y, ldy, m, nrhs, k, transpose));
 427:   if (!aiship) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
428:   if (ldx < m || !xiship) {
429:     PetscScalar *x;

431:     /* The logic here is that the data is not yet in either X's GPU array or its
432:        CPU array.  There is nothing in the interface to say where the user would like
433:        it to end up.  So we choose the GPU, because it is the faster option */
434:     if (xiship) PetscCall(MatDenseHIPGetArrayWrite(X, &x));
435:     else PetscCall(MatDenseGetArray(X, &x));
436:     PetscCallHIP(hipMemcpy2D(x, ldx * sizeof(PetscScalar), y, ldy * sizeof(PetscScalar), k * sizeof(PetscScalar), nrhs, xiship ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
437:     if (xiship) PetscCall(MatDenseHIPRestoreArrayWrite(X, &x));
438:     else PetscCall(MatDenseRestoreArray(X, &x));
439:     PetscCallHIP(hipFree(y));
440:   } else PetscCall(MatDenseHIPRestoreArrayWrite(X, &y));
441:   PetscFunctionReturn(PETSC_SUCCESS);
442: }

444: static PetscErrorCode MatSolve_SeqDenseHIP_Internal_LU(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
445: {
446:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
447:   Mat_SeqDenseHIP   *dA  = (Mat_SeqDenseHIP *)A->spptr;
448:   const PetscScalar *da;
449:   PetscHipBLASInt    lda;
450:   hipsolverHandle_t  handle;
451:   int                info;

453:   PetscFunctionBegin;
454:   PetscCall(MatDenseHIPGetArrayRead(A, &da));
455:   PetscCall(PetscHipBLASIntCast(mat->lda, &lda));
456:   PetscCall(PetscHIPSOLVERGetHandle(&handle));
457:   PetscCall(PetscLogGpuTimeBegin());
458:   PetscCall(PetscInfo(A, "LU solve %d x %d on backend\n", m, k));
459:   PetscCallHIPSOLVER(hipsolverDnXgetrs(handle, T ? HIPSOLVER_OP_T : HIPSOLVER_OP_N, m, nrhs, da, lda, dA->d_fact_ipiv, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
460:   PetscCall(PetscLogGpuTimeEnd());
461:   PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
462:   if (PetscDefined(USE_DEBUG)) {
463:     PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
464:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
465:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
466:   }
467:   PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
468:   PetscFunctionReturn(PETSC_SUCCESS);
469: }

471: static PetscErrorCode MatSolve_SeqDenseHIP_Internal_Cholesky(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
472: {
473:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
474:   Mat_SeqDenseHIP   *dA  = (Mat_SeqDenseHIP *)A->spptr;
475:   const PetscScalar *da;
476:   PetscHipBLASInt    lda;
477:   hipsolverHandle_t  handle;
478:   int                info;

480:   PetscFunctionBegin;
481:   PetscCall(MatDenseHIPGetArrayRead(A, &da));
482:   PetscCall(PetscHipBLASIntCast(mat->lda, &lda));
483:   PetscCall(PetscHIPSOLVERGetHandle(&handle));
484:   PetscCall(PetscLogGpuTimeBegin());
485:   PetscCall(PetscInfo(A, "Cholesky solve %d x %d on backend\n", m, k));
486:   if (!dA->d_fact_ipiv) { /* spd */
487:     /* ========= Program hit hipErrorNotReady (error 34) due to "device not ready" on HIP API call to hipEventQuery. */
488:     PetscCallHIPSOLVER(hipsolverDnXpotrs(handle, HIPSOLVER_FILL_MODE_LOWER, m, nrhs, da, lda, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
489:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "hipsolverDnsytrs not implemented");
490:   PetscCall(PetscLogGpuTimeEnd());
491:   PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
492:   if (PetscDefined(USE_DEBUG)) {
493:     PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
494:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
495:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
496:   }
497:   PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
498:   PetscFunctionReturn(PETSC_SUCCESS);
499: }

501: static PetscErrorCode MatSolve_SeqDenseHIP_Internal_QR(Mat A, PetscScalar *x, PetscHipBLASInt ldx, PetscHipBLASInt m, PetscHipBLASInt nrhs, PetscHipBLASInt k, PetscBool T)
502: {
503:   Mat_SeqDense        *mat = (Mat_SeqDense *)A->data;
504:   Mat_SeqDenseHIP     *dA  = (Mat_SeqDenseHIP *)A->spptr;
505:   const PetscScalar   *da;
506:   PetscHipBLASInt      lda, rank;
507:   hipsolverHandle_t    handle;
508:   hipblasHandle_t      bhandle;
509:   int                  info;
510:   hipsolverOperation_t trans;
511:   PetscScalar          one = 1.;

513:   PetscFunctionBegin;
514:   PetscCall(PetscHipBLASIntCast(mat->rank, &rank));
515:   PetscCall(MatDenseHIPGetArrayRead(A, &da));
516:   PetscCall(PetscHipBLASIntCast(mat->lda, &lda));
517:   PetscCall(PetscHIPSOLVERGetHandle(&handle));
518:   PetscCall(PetscHIPBLASGetHandle(&bhandle));
519:   PetscCall(PetscLogGpuTimeBegin());
520:   PetscCall(PetscInfo(A, "QR solve %d x %d on backend\n", m, k));
521:   if (!T) {
522:     if (PetscDefined(USE_COMPLEX)) trans = HIPSOLVER_OP_C;
523:     else trans = HIPSOLVER_OP_T;
524:     PetscCallHIPSOLVER(hipsolverDnXormqr(handle, HIPSOLVER_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
525:     if (PetscDefined(USE_DEBUG)) {
526:       PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
527:       PetscCheck(info == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
528:     }
529:     PetscCallHIPBLAS(hipblasXtrsm(bhandle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_N, HIPBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx));
530:   } else {
531:     PetscCallHIPBLAS(hipblasXtrsm(bhandle, HIPBLAS_SIDE_LEFT, HIPBLAS_FILL_MODE_UPPER, HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx));
532:     PetscCallHIPSOLVER(hipsolverDnXormqr(handle, HIPSOLVER_SIDE_LEFT, HIPSOLVER_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
533:     if (PetscDefined(USE_DEBUG)) {
534:       PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
535:       PetscCheck(info == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
536:     }
537:   }
538:   PetscCall(PetscLogGpuTimeEnd());
539:   PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
540:   PetscCall(PetscLogFlops(nrhs * (4.0 * m * mat->rank - PetscSqr(mat->rank))));
541:   PetscFunctionReturn(PETSC_SUCCESS);
542: }
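/*
   Editorial note (not part of the PETSc source): with the factorization A = Q * R from geqrf,
   the non-transposed branch above computes the least-squares solution x = R^{-1} (Q^H b) by
   applying ormqr/unmqr followed by a triangular solve with the leading rank x rank block of R;
   the transposed branch performs the two steps in reverse, solving R^T y = b and then applying
   Q, i.e. x = Q R^{-T} b, which (in the real case) is a solve with A^T.
*/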

544: static PetscErrorCode MatSolve_SeqDenseHIP_LU(Mat A, Vec xx, Vec yy)
545: {
546:   PetscFunctionBegin;
547:   PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_LU));
548:   PetscFunctionReturn(PETSC_SUCCESS);
549: }

551: static PetscErrorCode MatSolveTranspose_SeqDenseHIP_LU(Mat A, Vec xx, Vec yy)
552: {
553:   PetscFunctionBegin;
554:   PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_LU));
555:   PetscFunctionReturn(PETSC_SUCCESS);
556: }

558: static PetscErrorCode MatSolve_SeqDenseHIP_Cholesky(Mat A, Vec xx, Vec yy)
559: {
560:   PetscFunctionBegin;
561:   PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_Cholesky));
562:   PetscFunctionReturn(PETSC_SUCCESS);
563: }

565: static PetscErrorCode MatSolveTranspose_SeqDenseHIP_Cholesky(Mat A, Vec xx, Vec yy)
566: {
567:   PetscFunctionBegin;
568:   PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_Cholesky));
569:   PetscFunctionReturn(PETSC_SUCCESS);
570: }

572: static PetscErrorCode MatSolve_SeqDenseHIP_QR(Mat A, Vec xx, Vec yy)
573: {
574:   PetscFunctionBegin;
575:   PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_QR));
576:   PetscFunctionReturn(PETSC_SUCCESS);
577: }

579: static PetscErrorCode MatSolveTranspose_SeqDenseHIP_QR(Mat A, Vec xx, Vec yy)
580: {
581:   PetscFunctionBegin;
582:   PetscCall(MatSolve_SeqDenseHIP_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_QR));
583:   PetscFunctionReturn(PETSC_SUCCESS);
584: }

586: static PetscErrorCode MatMatSolve_SeqDenseHIP_LU(Mat A, Mat B, Mat X)
587: {
588:   PetscFunctionBegin;
589:   PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_LU));
590:   PetscFunctionReturn(PETSC_SUCCESS);
591: }

593: static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_LU(Mat A, Mat B, Mat X)
594: {
595:   PetscFunctionBegin;
596:   PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_LU));
597:   PetscFunctionReturn(PETSC_SUCCESS);
598: }

600: static PetscErrorCode MatMatSolve_SeqDenseHIP_Cholesky(Mat A, Mat B, Mat X)
601: {
602:   PetscFunctionBegin;
603:   PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_Cholesky));
604:   PetscFunctionReturn(PETSC_SUCCESS);
605: }

607: static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_Cholesky(Mat A, Mat B, Mat X)
608: {
609:   PetscFunctionBegin;
610:   PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_Cholesky));
611:   PetscFunctionReturn(PETSC_SUCCESS);
612: }

614: static PetscErrorCode MatMatSolve_SeqDenseHIP_QR(Mat A, Mat B, Mat X)
615: {
616:   PetscFunctionBegin;
617:   PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseHIP_Internal_QR));
618:   PetscFunctionReturn(PETSC_SUCCESS);
619: }

621: static PetscErrorCode MatMatSolveTranspose_SeqDenseHIP_QR(Mat A, Mat B, Mat X)
622: {
623:   PetscFunctionBegin;
624:   PetscCall(MatMatSolve_SeqDenseHIP_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseHIP_Internal_QR));
625:   PetscFunctionReturn(PETSC_SUCCESS);
626: }

628: static PetscErrorCode MatLUFactor_SeqDenseHIP(Mat A, IS rperm, IS cperm, const MatFactorInfo *factinfo)
629: {
630:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
631:   Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
632:   PetscScalar      *da;
633:   PetscHipBLASInt   m, n, lda;
634:   hipsolverHandle_t handle;
635: #if defined(PETSC_USE_DEBUG)
636:   int info;
637: #endif

639:   PetscFunctionBegin;
640:   if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
641:   PetscCall(PetscHIPSOLVERGetHandle(&handle));
642:   PetscCall(MatDenseHIPGetArray(A, &da));
643:   PetscCall(PetscHipBLASIntCast(A->cmap->n, &n));
644:   PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
645:   PetscCall(PetscHipBLASIntCast(a->lda, &lda));
646:   PetscCall(PetscInfo(A, "LU factor %d x %d on backend\n", m, n));
647:   if (!dA->d_fact_ipiv) PetscCallHIP(hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
648:   if (!dA->fact_lwork) {
649:     PetscCallHIPSOLVER(hipsolverDnXgetrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork));
650:     PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
651:   }
652:   if (!dA->d_fact_info) PetscCallHIP(hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
653:   PetscCall(PetscLogGpuTimeBegin());
654:   PetscCallHIPSOLVER(hipsolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_ipiv, dA->d_fact_info));
655:   PetscCall(PetscLogGpuTimeEnd());
656:   PetscCall(MatDenseHIPRestoreArray(A, &da));
657: #if defined(PETSC_USE_DEBUG)
658:   PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
659:   PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
660:   PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
661: #endif
662:   A->factortype = MAT_FACTOR_LU;
663:   PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));

665:   A->ops->solve             = MatSolve_SeqDenseHIP_LU;
666:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseHIP_LU;
667:   A->ops->matsolve          = MatMatSolve_SeqDenseHIP_LU;
668:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_LU;

670:   PetscCall(PetscFree(A->solvertype));
671:   PetscCall(PetscStrallocpy(MATSOLVERHIP, &A->solvertype));
672:   PetscFunctionReturn(PETSC_SUCCESS);
673: }

675: static PetscErrorCode MatCholeskyFactor_SeqDenseHIP(Mat A, IS perm, const MatFactorInfo *factinfo)
676: {
677:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
678:   Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
679:   PetscScalar      *da;
680:   PetscHipBLASInt   n, lda;
681:   hipsolverHandle_t handle;
682: #if defined(PETSC_USE_DEBUG)
683:   int info;
684: #endif

686:   PetscFunctionBegin;
687:   if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
688:   PetscCall(PetscHIPSOLVERGetHandle(&handle));
689:   PetscCall(PetscHipBLASIntCast(A->rmap->n, &n));
690:   PetscCall(PetscInfo(A, "Cholesky factor %d x %d on backend\n", n, n));
691:   if (A->spd == PETSC_BOOL3_TRUE) {
692:     PetscCall(MatDenseHIPGetArray(A, &da));
693:     PetscCall(PetscHipBLASIntCast(a->lda, &lda));
694:     if (!dA->fact_lwork) {
695:       PetscCallHIPSOLVER(hipsolverDnXpotrf_bufferSize(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, &dA->fact_lwork));
696:       PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
697:     }
698:     if (!dA->d_fact_info) PetscCallHIP(hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
699:     PetscCall(PetscLogGpuTimeBegin());
700:     PetscCallHIPSOLVER(hipsolverDnXpotrf(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
701:     PetscCall(PetscLogGpuTimeEnd());

703:     PetscCall(MatDenseHIPRestoreArray(A, &da));
704: #if defined(PETSC_USE_DEBUG)
705:     PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
706:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
707:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
708: #endif
709:     A->factortype = MAT_FACTOR_CHOLESKY;
710:     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
711:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipsolverDnsytrs unavailable. Use MAT_FACTOR_LU");

#if 0
 713:   /* at the time of writing hipsolverDn has *sytrs and *hetr* routines implemented and the
714:        code below should work */
715:   if (!dA->d_fact_ipiv) PetscCallHIP(hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
716:   if (!dA->fact_lwork) {
717:     PetscCallHIPSOLVER(hipsolverDnXsytrf_bufferSize(handle, n, da, lda, &dA->fact_lwork));
718:     PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
719:   }
720:   if (!dA->d_fact_info) PetscCallHIP(hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
721:   PetscCall(PetscLogGpuTimeBegin());
722:   PetscCallHIPSOLVER(hipsolverDnXsytrf(handle, HIPSOLVER_FILL_MODE_LOWER, n, da, lda, dA->d_fact_ipiv, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
 723:   PetscCall(PetscLogGpuTimeEnd());
#endif

725:   A->ops->solve             = MatSolve_SeqDenseHIP_Cholesky;
726:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseHIP_Cholesky;
727:   A->ops->matsolve          = MatMatSolve_SeqDenseHIP_Cholesky;
728:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_Cholesky;
729:   PetscCall(PetscFree(A->solvertype));
730:   PetscCall(PetscStrallocpy(MATSOLVERHIP, &A->solvertype));
731:   PetscFunctionReturn(PETSC_SUCCESS);
732: }

734: static PetscErrorCode MatQRFactor_SeqDenseHIP(Mat A, IS col, const MatFactorInfo *factinfo)
735: {
736:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
737:   Mat_SeqDenseHIP  *dA = (Mat_SeqDenseHIP *)A->spptr;
738:   PetscScalar      *da;
739:   PetscHipBLASInt   m, min, max, n, lda;
740:   hipsolverHandle_t handle;
741: #if defined(PETSC_USE_DEBUG)
742:   int info;
743: #endif

745:   PetscFunctionBegin;
746:   if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
747:   PetscCall(PetscHIPSOLVERGetHandle(&handle));
748:   PetscCall(MatDenseHIPGetArray(A, &da));
749:   PetscCall(PetscHipBLASIntCast(A->cmap->n, &n));
750:   PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
751:   PetscCall(PetscHipBLASIntCast(a->lda, &lda));
752:   PetscCall(PetscInfo(A, "QR factor %d x %d on backend\n", m, n));
753:   max = PetscMax(m, n);
754:   min = PetscMin(m, n);
755:   if (!dA->d_fact_tau) PetscCallHIP(hipMalloc((void **)&dA->d_fact_tau, min * sizeof(*dA->d_fact_tau)));
756:   if (!dA->d_fact_ipiv) PetscCallHIP(hipMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
757:   if (!dA->fact_lwork) {
758:     PetscCallHIPSOLVER(hipsolverDnXgeqrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork));
759:     PetscCallHIP(hipMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
760:   }
761:   if (!dA->d_fact_info) PetscCallHIP(hipMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
762:   if (!dA->workvec) PetscCall(VecCreateSeqHIP(PetscObjectComm((PetscObject)A), m, &(dA->workvec)));
763:   PetscCall(PetscLogGpuTimeBegin());
764:   PetscCallHIPSOLVER(hipsolverDnXgeqrf(handle, m, n, da, lda, dA->d_fact_tau, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
765:   PetscCall(PetscLogGpuTimeEnd());
766:   PetscCall(MatDenseHIPRestoreArray(A, &da));
767: #if defined(PETSC_USE_DEBUG)
768:   PetscCallHIP(hipMemcpy(&info, dA->d_fact_info, sizeof(PetscHipBLASInt), hipMemcpyDeviceToHost));
769:   PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to hipSolver %d", -info);
770: #endif
771:   A->factortype = MAT_FACTOR_QR;
772:   a->rank       = min;
773:   PetscCall(PetscLogGpuFlops(2.0 * min * min * (max - min / 3.0)));

775:   A->ops->solve             = MatSolve_SeqDenseHIP_QR;
776:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseHIP_QR;
777:   A->ops->matsolve          = MatMatSolve_SeqDenseHIP_QR;
778:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseHIP_QR;

780:   PetscCall(PetscFree(A->solvertype));
781:   PetscCall(PetscStrallocpy(MATSOLVERHIP, &A->solvertype));
782:   PetscFunctionReturn(PETSC_SUCCESS);
783: }

785: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
786: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(Mat A, Mat B, Mat C, PetscBool tA, PetscBool tB)
787: {
788:   const PetscScalar *da, *db;
789:   PetscScalar       *dc;
790:   PetscScalar        one = 1.0, zero = 0.0;
791:   PetscHipBLASInt    m, n, k;
792:   PetscInt           alda, blda, clda;
793:   hipblasHandle_t    hipblasv2handle;
794:   PetscBool          Aiship, Biship;

796:   PetscFunctionBegin;
797:   /* we may end up with SEQDENSE as one of the arguments */
798:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSEHIP, &Aiship));
799:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &Biship));
800:   if (!Aiship) PetscCall(MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A));
801:   if (!Biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
802:   PetscCall(PetscHipBLASIntCast(C->rmap->n, &m));
803:   PetscCall(PetscHipBLASIntCast(C->cmap->n, &n));
804:   if (tA) PetscCall(PetscHipBLASIntCast(A->rmap->n, &k));
805:   else PetscCall(PetscHipBLASIntCast(A->cmap->n, &k));
806:   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
807:   PetscCall(PetscInfo(C, "Matrix-Matrix product %d x %d x %d on backend\n", m, k, n));
808:   PetscCall(MatDenseHIPGetArrayRead(A, &da));
809:   PetscCall(MatDenseHIPGetArrayRead(B, &db));
810:   PetscCall(MatDenseHIPGetArrayWrite(C, &dc));
811:   PetscCall(MatDenseGetLDA(A, &alda));
812:   PetscCall(MatDenseGetLDA(B, &blda));
813:   PetscCall(MatDenseGetLDA(C, &clda));
814:   PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
815:   PetscCall(PetscLogGpuTimeBegin());
816:   PetscCallHIPBLAS(hipblasXgemm(hipblasv2handle, tA ? HIPBLAS_OP_T : HIPBLAS_OP_N, tB ? HIPBLAS_OP_T : HIPBLAS_OP_N, m, n, k, &one, da, alda, db, blda, &zero, dc, clda));
817:   PetscCall(PetscLogGpuTimeEnd());
818:   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
819:   PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
820:   PetscCall(MatDenseHIPRestoreArrayRead(B, &db));
821:   PetscCall(MatDenseHIPRestoreArrayWrite(C, &dc));
822:   if (!Aiship) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
823:   if (!Biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
824:   PetscFunctionReturn(PETSC_SUCCESS);
825: }

827: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
828: {
829:   PetscFunctionBegin;
830:   PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_TRUE, PETSC_FALSE));
831:   PetscFunctionReturn(PETSC_SUCCESS);
832: }

834: PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
835: {
836:   PetscFunctionBegin;
837:   PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_FALSE, PETSC_FALSE));
838:   PetscFunctionReturn(PETSC_SUCCESS);
839: }

841: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseHIP_SeqDenseHIP(Mat A, Mat B, Mat C)
842: {
843:   PetscFunctionBegin;
844:   PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(A, B, C, PETSC_FALSE, PETSC_TRUE));
845:   PetscFunctionReturn(PETSC_SUCCESS);
846: }
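/*
   A minimal usage sketch for the product kernels above (editorial, not from the PETSc source):
   users normally go through the generic product interface, which ends up in the single
   hipblasXgemm call of MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private when the operands are
   MATSEQDENSEHIP, e.g.

     Mat C, D;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C));          // C = A * B
     PetscCall(MatTransposeMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &D)); // D = A^T * B

   The tA/tB flags of the _Private routine encode which operand hipblasXgemm should transpose.
*/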

848: PetscErrorCode MatProductSetFromOptions_SeqDenseHIP(Mat C)
849: {
850:   PetscFunctionBegin;
851:   PetscCall(MatProductSetFromOptions_SeqDense(C));
852:   PetscFunctionReturn(PETSC_SUCCESS);
853: }

855: /* zz = op(A)*xx + yy
856:    if yy == NULL, only MatMult */
857: static PetscErrorCode MatMultAdd_SeqDenseHIP_Private(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans)
858: {
859:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
860:   const PetscScalar *xarray, *da;
861:   PetscScalar       *zarray;
862:   PetscScalar        one = 1.0, zero = 0.0;
863:   PetscHipBLASInt    m, n, lda;
864:   hipblasHandle_t    hipblasv2handle;

866:   PetscFunctionBegin;
867:   if (yy && yy != zz) PetscCall(VecSeq_HIP::copy(yy, zz)); /* mult add */
868:   if (!A->rmap->n || !A->cmap->n) {
869:     if (!yy) PetscCall(VecSeq_HIP::set(zz, 0.0)); /* mult only */
870:     PetscFunctionReturn(PETSC_SUCCESS);
871:   }
872:   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", A->rmap->n, A->cmap->n));
873:   PetscCall(PetscHipBLASIntCast(A->rmap->n, &m));
874:   PetscCall(PetscHipBLASIntCast(A->cmap->n, &n));
875:   PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
876:   PetscCall(MatDenseHIPGetArrayRead(A, &da));
877:   PetscCall(PetscHipBLASIntCast(mat->lda, &lda));
878:   PetscCall(VecHIPGetArrayRead(xx, &xarray));
879:   PetscCall(VecHIPGetArray(zz, &zarray));
880:   PetscCall(PetscLogGpuTimeBegin());
881:   PetscCallHIPBLAS(hipblasXgemv(hipblasv2handle, trans ? HIPBLAS_OP_T : HIPBLAS_OP_N, m, n, &one, da, lda, xarray, 1, (yy ? &one : &zero), zarray, 1));
882:   PetscCall(PetscLogGpuTimeEnd());
883:   PetscCall(PetscLogGpuFlops(2.0 * A->rmap->n * A->cmap->n - (yy ? 0 : A->rmap->n)));
884:   PetscCall(VecHIPRestoreArrayRead(xx, &xarray));
885:   PetscCall(VecHIPRestoreArray(zz, &zarray));
886:   PetscCall(MatDenseHIPRestoreArrayRead(A, &da));
887:   PetscFunctionReturn(PETSC_SUCCESS);
888: }

890: PetscErrorCode MatMultAdd_SeqDenseHIP(Mat A, Vec xx, Vec yy, Vec zz)
891: {
892:   PetscFunctionBegin;
893:   PetscCall(MatMultAdd_SeqDenseHIP_Private(A, xx, yy, zz, PETSC_FALSE));
894:   PetscFunctionReturn(PETSC_SUCCESS);
895: }

897: PetscErrorCode MatMultTransposeAdd_SeqDenseHIP(Mat A, Vec xx, Vec yy, Vec zz)
898: {
899:   PetscFunctionBegin;
900:   PetscCall(MatMultAdd_SeqDenseHIP_Private(A, xx, yy, zz, PETSC_TRUE));
901:   PetscFunctionReturn(PETSC_SUCCESS);
902: }

904: PetscErrorCode MatMult_SeqDenseHIP(Mat A, Vec xx, Vec yy)
905: {
906:   PetscFunctionBegin;
907:   PetscCall(MatMultAdd_SeqDenseHIP_Private(A, xx, NULL, yy, PETSC_FALSE));
908:   PetscFunctionReturn(PETSC_SUCCESS);
909: }

911: PetscErrorCode MatMultTranspose_SeqDenseHIP(Mat A, Vec xx, Vec yy)
912: {
913:   PetscFunctionBegin;
914:   PetscCall(MatMultAdd_SeqDenseHIP_Private(A, xx, NULL, yy, PETSC_TRUE));
915:   PetscFunctionReturn(PETSC_SUCCESS);
916: }
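/*
   A minimal usage sketch for the four wrappers above (editorial, not from the PETSc source),
   assuming A is square and x, y, z are HIP vectors of conforming size:

     PetscCall(MatMult(A, x, y));          // y = A * x
     PetscCall(MatMultTranspose(A, x, y)); // y = A^T * x
     PetscCall(MatMultAdd(A, x, y, z));    // z = y + A * x

   All of them funnel into the single hipblasXgemv call of MatMultAdd_SeqDenseHIP_Private,
   with beta = 0 when there is no vector to add (yy == NULL).
*/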

918: static PetscErrorCode MatDenseGetArrayRead_SeqDenseHIP(Mat A, const PetscScalar **array)
919: {
920:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

922:   PetscFunctionBegin;
923:   PetscCall(MatSeqDenseHIPCopyFromGPU(A));
924:   *array = mat->v;
925:   PetscFunctionReturn(PETSC_SUCCESS);
926: }

928: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseHIP(Mat A, PetscScalar **array)
929: {
930:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

932:   PetscFunctionBegin;
933:   if (!mat->v) PetscCall(MatSeqDenseSetPreallocation(A, NULL)); /* MatCreateSeqDenseHIP may not allocate CPU memory. Allocate if needed */
934:   *array         = mat->v;
935:   A->offloadmask = PETSC_OFFLOAD_CPU;
936:   PetscFunctionReturn(PETSC_SUCCESS);
937: }

939: static PetscErrorCode MatDenseGetArray_SeqDenseHIP(Mat A, PetscScalar **array)
940: {
941:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

943:   PetscFunctionBegin;
944:   PetscCall(MatSeqDenseHIPCopyFromGPU(A));
945:   *array         = mat->v;
946:   A->offloadmask = PETSC_OFFLOAD_CPU;
947:   PetscFunctionReturn(PETSC_SUCCESS);
948: }

950: static PetscErrorCode MatDenseGetArrayAndMemType_SeqDenseHIP(Mat A, PetscScalar **array, PetscMemType *mtype)
951: {
952:   const auto dA = static_cast<Mat_SeqDenseHIP *>(A->spptr);

954:   PetscFunctionBegin;
955:   PetscCall(MatSeqDenseHIPCopyToGPU(A)); // Since we will read the array on device, we sync the GPU data if necessary
956:   *array = dA->d_v;
957:   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
958:   PetscFunctionReturn(PETSC_SUCCESS);
959: }

961: static PetscErrorCode MatDenseRestoreArrayAndMemType_SeqDenseHIP(Mat A, PetscScalar **array)
962: {
963:   PetscFunctionBegin;
964:   *array         = nullptr;
965:   A->offloadmask = PETSC_OFFLOAD_GPU; // Since we've written to the array on device
966:   PetscFunctionReturn(PETSC_SUCCESS);
967: }

969: static PetscErrorCode (*MatDenseGetArrayReadAndMemType_SeqDenseHIP)(Mat, PetscScalar **, PetscMemType *) = MatDenseGetArrayAndMemType_SeqDenseHIP;
970: static PetscErrorCode (*MatDenseRestoreArrayReadAndMemType_SeqDenseHIP)(Mat, PetscScalar **)             = nullptr; // Keep the offload mask as is

972: static PetscErrorCode MatDenseGetArrayWriteAndMemType_SeqDenseHIP(Mat A, PetscScalar **array, PetscMemType *mtype)
973: {
974:   const auto dA = static_cast<Mat_SeqDenseHIP *>(A->spptr);

976:   PetscFunctionBegin;
977:   if (!dA->d_v) PetscCall(MatSeqDenseHIPSetPreallocation(A, NULL)); // Allocate GPU memory if not present
978:   *array = dA->d_v;
979:   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
980:   PetscFunctionReturn(PETSC_SUCCESS);
981: }

983: static PetscErrorCode (*MatDenseRestoreArrayWriteAndMemType_SeqDenseHIP)(Mat, PetscScalar **) = MatDenseRestoreArrayAndMemType_SeqDenseHIP; // Since we've written to the array on device
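/*
   A minimal usage sketch for the ...AndMemType accessors above (editorial, not from the PETSc
   source):

     PetscScalar *a;
     PetscMemType mtype;
     PetscCall(MatDenseGetArrayAndMemType(A, &a, &mtype)); // for MATSEQDENSEHIP: device pointer, mtype == PETSC_MEMTYPE_HIP
     // ... read/write a on the device ...
     PetscCall(MatDenseRestoreArrayAndMemType(A, &a));     // marks the device copy as the up-to-date one

   The read variant leaves the offload mask unchanged and the write variant skips the
   host-to-device sync, as encoded by the function-pointer aliases defined above.
*/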

985: PetscErrorCode MatScale_SeqDenseHIP(Mat Y, PetscScalar alpha)
986: {
987:   Mat_SeqDense   *y = (Mat_SeqDense *)Y->data;
988:   PetscScalar    *dy;
989:   PetscHipBLASInt j, N, m, lday, one = 1;
990:   hipblasHandle_t hipblasv2handle;

992:   PetscFunctionBegin;
993:   PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
994:   PetscCall(MatDenseHIPGetArray(Y, &dy));
995:   PetscCall(PetscHipBLASIntCast(Y->rmap->n * Y->cmap->n, &N));
996:   PetscCall(PetscHipBLASIntCast(Y->rmap->n, &m));
997:   PetscCall(PetscHipBLASIntCast(y->lda, &lday));
998:   PetscCall(PetscInfo(Y, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n));
999:   PetscCall(PetscLogGpuTimeBegin());
1000:   if (lday > m) {
1001:     for (j = 0; j < Y->cmap->n; j++) PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, m, &alpha, dy + lday * j, one));
1002:   } else PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, N, &alpha, dy, one));
1003:   PetscCall(PetscLogGpuTimeEnd());
1004:   PetscCall(PetscLogGpuFlops(N));
1005:   PetscCall(MatDenseHIPRestoreArray(Y, &dy));
1006:   PetscFunctionReturn(PETSC_SUCCESS);
1007: }

1009: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1010:   const PetscScalar shift_;
1011:   petscshift(PetscScalar shift) : shift_(shift) { }
1012:   __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1013: };

1015: template <typename Iterator>
1016: class strided_range {
1017: public:
1018:   typedef typename thrust::iterator_difference<Iterator>::type difference_type;
1019:   struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
1020:     difference_type stride;
1021:     stride_functor(difference_type stride) : stride(stride) { }
1022:     __device__ difference_type operator()(const difference_type &i) const { return stride * i; }
1023:   };
1024:   typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
1025:   typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
1026:   typedef typename thrust::permutation_iterator<Iterator, TransformIterator>    PermutationIterator;
1027:   typedef PermutationIterator                                                   iterator; // type of the strided_range iterator
1028:   // construct strided_range for the range [first,last)
1029:   strided_range(Iterator first, Iterator last, difference_type stride) : first(first), last(last), stride(stride) { }
1030:   iterator begin(void) const { return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); }
1031:   iterator end(void) const { return begin() + ((last - first) + (stride - 1)) / stride; }

1033: protected:
1034:   Iterator        first;
1035:   Iterator        last;
1036:   difference_type stride;
1037: };
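/*
   Editorial note (not part of the PETSc source): strided_range adapts Thrust's counting,
   transform and permutation iterators so that advancing by one element moves `stride` entries
   through the underlying range. MatShift_DenseHIP_Private below uses stride = lda + 1 to walk
   the diagonal of a column-major matrix: entry (i,i) sits at offset i * (lda + 1), so for
   lda = 4 the visited offsets are 0, 5, 10, ...
*/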

1039: PetscErrorCode MatShift_DenseHIP_Private(PetscScalar *da, PetscScalar alpha, PetscInt lda, PetscInt rstart, PetscInt rend, PetscInt cols)
1040: {
1041:   PetscFunctionBegin;
1042:   PetscInt rend2 = PetscMin(rend, cols);
1043:   if (rend2 > rstart) {
1044:     PetscCall(PetscLogGpuTimeBegin());
1045:     try {
1046:       const auto                                                  dptr  = thrust::device_pointer_cast(da);
1047:       size_t                                                      begin = rstart * lda;
1048:       size_t                                                      end   = rend2 - rstart + rend2 * lda;
1049:       strided_range<thrust::device_vector<PetscScalar>::iterator> diagonal(dptr + begin, dptr + end, lda + 1);
1050:       thrust::transform(diagonal.begin(), diagonal.end(), diagonal.begin(), petscshift(alpha));
1051:     } catch (char *ex) {
1052:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1053:     }
1054:     PetscCall(PetscLogGpuTimeEnd());
1055:     PetscCall(PetscLogGpuFlops(rend2 - rstart));
1056:   }
1057:   PetscFunctionReturn(PETSC_SUCCESS);
1058: }

1060: PetscErrorCode MatShift_SeqDenseHIP(Mat A, PetscScalar alpha)
1061: {
1062:   PetscScalar *da;
1063:   PetscInt     m = A->rmap->n, n = A->cmap->n, lda;

1065:   PetscFunctionBegin;
1066:   PetscCall(MatDenseHIPGetArray(A, &da));
1067:   PetscCall(MatDenseGetLDA(A, &lda));
1068:   PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1069:   PetscCall(MatShift_DenseHIP_Private(da, alpha, lda, 0, m, n));
1070:   PetscCall(MatDenseHIPRestoreArray(A, &da));
1071:   PetscFunctionReturn(PETSC_SUCCESS);
1072: }

1074: PetscErrorCode MatAXPY_SeqDenseHIP(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
1075: {
1076:   Mat_SeqDense      *x = (Mat_SeqDense *)X->data;
1077:   Mat_SeqDense      *y = (Mat_SeqDense *)Y->data;
1078:   const PetscScalar *dx;
1079:   PetscScalar       *dy;
1080:   PetscHipBLASInt    j, N, m, ldax, lday, one = 1;
1081:   hipblasHandle_t    hipblasv2handle;

1083:   PetscFunctionBegin;
1084:   if (!X->rmap->n || !X->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1085:   PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
1086:   PetscCall(MatDenseHIPGetArrayRead(X, &dx));
1087:   if (alpha == 0.0) PetscCall(MatDenseHIPGetArrayWrite(Y, &dy));
1088:   else PetscCall(MatDenseHIPGetArray(Y, &dy));
1089:   PetscCall(PetscHipBLASIntCast(X->rmap->n * X->cmap->n, &N));
1090:   PetscCall(PetscHipBLASIntCast(X->rmap->n, &m));
1091:   PetscCall(PetscHipBLASIntCast(x->lda, &ldax));
1092:   PetscCall(PetscHipBLASIntCast(y->lda, &lday));
1093:   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n));
1094:   PetscCall(PetscLogGpuTimeBegin());
1095:   if (ldax > m || lday > m) {
1096:     for (j = 0; j < X->cmap->n; j++) PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, m, &alpha, dx + j * ldax, one, dy + j * lday, one));
1097:   } else PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, N, &alpha, dx, one, dy, one));
1098:   PetscCall(PetscLogGpuTimeEnd());
1099:   PetscCall(PetscLogGpuFlops(PetscMax(2. * N - 1, 0)));
1100:   PetscCall(MatDenseHIPRestoreArrayRead(X, &dx));
1101:   if (alpha == 0.0) PetscCall(MatDenseHIPRestoreArrayWrite(Y, &dy));
1102:   else PetscCall(MatDenseHIPRestoreArray(Y, &dy));
1103:   PetscFunctionReturn(PETSC_SUCCESS);
1104: }

1106: static PetscErrorCode MatReset_SeqDenseHIP(Mat A)
1107: {
1108:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

1110:   PetscFunctionBegin;
1111:   if (dA) {
1112:     PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseHIPResetArray() must be called first");
1113:     if (!dA->user_alloc) PetscCallHIP(hipFree(dA->d_v));
1114:     PetscCallHIP(hipFree(dA->d_fact_tau));
1115:     PetscCallHIP(hipFree(dA->d_fact_ipiv));
1116:     PetscCallHIP(hipFree(dA->d_fact_info));
1117:     PetscCallHIP(hipFree(dA->d_fact_work));
1118:     PetscCall(VecDestroy(&dA->workvec));
1119:   }
1120:   PetscCall(PetscFree(A->spptr));
1121:   PetscFunctionReturn(PETSC_SUCCESS);
1122: }

1124: PetscErrorCode MatDestroy_SeqDenseHIP(Mat A)
1125: {
1126:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1128:   PetscFunctionBegin;
1129:   /* prevent to copy back data if we own the data pointer */
1130:   if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1131:   PetscCall(MatConvert_SeqDenseHIP_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1132:   PetscCall(MatDestroy_SeqDense(A));
1133:   PetscFunctionReturn(PETSC_SUCCESS);
1134: }

1136: PetscErrorCode MatDuplicate_SeqDenseHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
1137: {
1138:   MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;

1140:   PetscFunctionBegin;
1141:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1142:   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
1143:   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
1144:   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hcpvalues));
1145:   if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) PetscCall(MatCopy_SeqDenseHIP(A, *B, SAME_NONZERO_PATTERN));
1146:   if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
1147:     Mat_SeqDenseHIP *dB = (Mat_SeqDenseHIP *)(*B)->spptr;
1148:     if (!dB->d_v) PetscCall(MatSeqDenseHIPSetPreallocation(*B, NULL));
1149:   }
1150:   PetscFunctionReturn(PETSC_SUCCESS);
1151: }

1153: static PetscErrorCode MatGetColumnVector_SeqDenseHIP(Mat A, Vec v, PetscInt col)
1154: {
1155:   Mat_SeqDense    *a  = (Mat_SeqDense *)A->data;
1156:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
1157:   PetscScalar     *x;
1158:   PetscBool        viship;

1160:   PetscFunctionBegin;
1161:   PetscCall(PetscObjectTypeCompareAny((PetscObject)v, &viship, VECSEQHIP, VECMPIHIP, VECHIP, ""));
1162:   if (viship && !v->boundtocpu) { /* update device data */
1163:     PetscCall(VecHIPGetArrayWrite(v, &x));
1164:     if (A->offloadmask & PETSC_OFFLOAD_GPU) PetscCallHIP(hipMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyDeviceToDevice));
1165:     else PetscCallHIP(hipMemcpy(x, a->v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
1166:     PetscCall(VecHIPRestoreArrayWrite(v, &x));
1167:   } else { /* update host data */
1168:     PetscCall(VecGetArrayWrite(v, &x));
1169:     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) PetscCall(PetscArraycpy(x, a->v + col * a->lda, A->rmap->n));
1170:     else if (A->offloadmask & PETSC_OFFLOAD_GPU) PetscCallHIP(hipMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1171:     PetscCall(VecRestoreArrayWrite(v, &x));
1172:   }
1173:   PetscFunctionReturn(PETSC_SUCCESS);
1174: }
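
The following is an illustrative sketch, not part of densehip.hip.cpp: a hypothetical helper showing how MatGetColumnVector() (which dispatches to the routine above for this matrix type) is typically called. It assumes A is an assembled MATSEQDENSEHIP matrix and col is a valid column index.

static PetscErrorCode ExampleGetColumnVector(Mat A, PetscInt col)
{
  Vec v;

  PetscFunctionBegin;
  PetscCall(MatCreateVecs(A, NULL, &v));    /* left vector: same length as a column of A */
  PetscCall(MatGetColumnVector(A, v, col)); /* copies column col of A into v */
  /* ... use v ... */
  PetscCall(VecDestroy(&v));
  PetscFunctionReturn(PETSC_SUCCESS);
}

Because the HIP dense type sets its default vector type to VECHIP (see the conversion routine later in this file), MatCreateVecs() returns a HIP vector here, so the device branch of the routine above is normally taken.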

1176: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_hip(Mat A, MatFactorType ftype, Mat *fact)
1177: {
1178:   PetscFunctionBegin;
1179:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), fact));
1180:   PetscCall(MatSetSizes(*fact, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
1181:   PetscCall(MatSetType(*fact, MATSEQDENSEHIP));
1182:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1183:     (*fact)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1184:     (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1185:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1186:     (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1187:   } else if (ftype == MAT_FACTOR_QR) {
1188:     PetscCall(PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactor_C", MatQRFactor_SeqDense));
1189:     PetscCall(PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1190:   }
1191:   (*fact)->factortype = ftype;
1192:   PetscCall(PetscFree((*fact)->solvertype));
1193:   PetscCall(PetscStrallocpy(MATSOLVERHIP, &(*fact)->solvertype));
1194:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_LU]));
1195:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ILU]));
1196:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]));
1197:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ICC]));
1198:   PetscFunctionReturn(PETSC_SUCCESS);
1199: }
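
Illustrative only (not part of this source file): a hypothetical helper sketching how the MATSOLVERHIP dense factorization registered above is usually reached from application code. A is assumed to be an assembled MATSEQDENSEHIP matrix, and b, x compatible vectors created elsewhere.

static PetscErrorCode ExampleFactorAndSolve(Mat A, Vec b, Vec x)
{
  Mat           F;
  IS            rperm, cperm;
  MatFactorInfo info;

  PetscFunctionBegin;
  PetscCall(MatFactorInfoInitialize(&info));
  PetscCall(MatGetFactor(A, MATSOLVERHIP, MAT_FACTOR_LU, &F)); /* reaches MatGetFactor_seqdense_hip() */
  PetscCall(MatGetOrdering(A, MATORDERINGNATURAL, &rperm, &cperm));
  PetscCall(MatLUFactorSymbolic(F, A, rperm, cperm, &info));   /* MatLUFactorSymbolic_SeqDense, installed above */
  PetscCall(MatLUFactorNumeric(F, A, &info));
  PetscCall(MatSolve(F, b, x));
  PetscCall(ISDestroy(&rperm));
  PetscCall(ISDestroy(&cperm));
  PetscCall(MatDestroy(&F));
  PetscFunctionReturn(PETSC_SUCCESS);
}

The explicit ordering is passed only to satisfy the interface; the preferredordering fields set above advertise MATORDERINGEXTERNAL, i.e. the dense factorization handles ordering itself.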

1201: static PetscErrorCode MatDenseGetColumnVec_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1202: {
1203:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1205:   PetscFunctionBegin;
1206:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1207:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1208:   PetscCall(MatDenseHIPGetArray(A, (PetscScalar **)&a->ptrinuse));
1209:   if (!a->cvec) { /* pass the data of A to avoid needlessly allocating GPU memory the first time VecHIPPlaceArray() is called */
1210:     PetscCall(VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1211:   }
1212:   a->vecinuse = col + 1;
1213:   PetscCall(VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1214:   *v = a->cvec;
1215:   PetscFunctionReturn(PETSC_SUCCESS);
1216: }

1218: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1219: {
1220:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1222:   PetscFunctionBegin;
1223:   PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1224:   PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1225:   a->vecinuse = 0;
1226:   PetscCall(VecHIPResetArray(a->cvec));
1227:   PetscCall(MatDenseHIPRestoreArray(A, (PetscScalar **)&a->ptrinuse));
1228:   if (v) *v = NULL;
1229:   PetscFunctionReturn(PETSC_SUCCESS);
1230: }
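
A minimal usage sketch (illustrative, not part of the file) of the get/restore column-vector pair above. It assumes A is a MATSEQDENSEHIP matrix with at least one column and not bound to the CPU; the returned vector aliases the matrix storage, so no data is copied.

static PetscErrorCode ExampleColumnVec(Mat A)
{
  Vec c;

  PetscFunctionBegin;
  PetscCall(MatDenseGetColumnVec(A, 0, &c)); /* c is a view of column 0; its device array is placed on the cached cvec */
  PetscCall(VecShift(c, 1.0));               /* modifies A in place */
  PetscCall(MatDenseRestoreColumnVec(A, 0, &c));
  PetscFunctionReturn(PETSC_SUCCESS);
}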

1232: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1233: {
1234:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1236:   PetscFunctionBegin;
1237:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1238:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1239:   PetscCall(MatDenseHIPGetArrayRead(A, &a->ptrinuse));
1240:   if (!a->cvec) { /* pass the data of A to avoid needlessly allocating GPU memory the first time VecHIPPlaceArray() is called */
1241:     PetscCall(VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1242:   }
1243:   a->vecinuse = col + 1;
1244:   PetscCall(VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1245:   PetscCall(VecLockReadPush(a->cvec));
1246:   *v = a->cvec;
1247:   PetscFunctionReturn(PETSC_SUCCESS);
1248: }

1250: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1251: {
1252:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1254:   PetscFunctionBegin;
1255:   PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVecRead() first");
1256:   PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1257:   a->vecinuse = 0;
1258:   PetscCall(VecLockReadPop(a->cvec));
1259:   PetscCall(VecHIPResetArray(a->cvec));
1260:   PetscCall(MatDenseHIPRestoreArrayRead(A, &a->ptrinuse));
1261:   if (v) *v = NULL;
1262:   PetscFunctionReturn(PETSC_SUCCESS);
1263: }

1265: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1266: {
1267:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1269:   PetscFunctionBegin;
1270:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1271:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1272:   PetscCall(MatDenseHIPGetArrayWrite(A, (PetscScalar **)&a->ptrinuse));
1273:   if (!a->cvec) { /* pass the data of A to avoid needlessly allocating GPU memory the first time VecHIPPlaceArray() is called */
1274:     PetscCall(VecCreateSeqHIPWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1275:   }
1276:   a->vecinuse = col + 1;
1277:   PetscCall(VecHIPPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1278:   *v = a->cvec;
1279:   PetscFunctionReturn(PETSC_SUCCESS);
1280: }

1282: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseHIP(Mat A, PetscInt col, Vec *v)
1283: {
1284:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1286:   PetscFunctionBegin;
1287:   PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVecWrite() first");
1288:   PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1289:   a->vecinuse = 0;
1290:   PetscCall(VecHIPResetArray(a->cvec));
1291:   PetscCall(MatDenseHIPRestoreArrayWrite(A, (PetscScalar **)&a->ptrinuse));
1292:   if (v) *v = NULL;
1293:   PetscFunctionReturn(PETSC_SUCCESS);
1294: }

1296: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseHIP(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
1297: {
1298:   Mat_SeqDense    *a  = (Mat_SeqDense *)A->data;
1299:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;

1301:   PetscFunctionBegin;
1302:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1303:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1304:   if (a->cmat && (cend - cbegin != a->cmat->cmap->N || rend - rbegin != a->cmat->rmap->N)) PetscCall(MatDestroy(&a->cmat));
1305:   PetscCall(MatSeqDenseHIPCopyToGPU(A));
1306:   if (!a->cmat) {
1307:     PetscCall(MatCreateDenseHIP(PetscObjectComm((PetscObject)A), rend - rbegin, PETSC_DECIDE, rend - rbegin, cend - cbegin, dA->d_v + rbegin + (size_t)cbegin * a->lda, &a->cmat));
1308:   } else PetscCall(MatDenseHIPPlaceArray(a->cmat, dA->d_v + rbegin + (size_t)cbegin * a->lda));
1309:   PetscCall(MatDenseSetLDA(a->cmat, a->lda));
1310:   /* Place the CPU array if present, but do not copy any data */
1311:   a->cmat->offloadmask = PETSC_OFFLOAD_GPU;
1312:   if (a->v) { PetscCall(MatDensePlaceArray(a->cmat, a->v + rbegin + (size_t)cbegin * a->lda)); }
1313:   a->cmat->offloadmask = A->offloadmask;
1314:   a->matinuse          = cbegin + 1;
1315:   *v                   = a->cmat;
1316:   PetscFunctionReturn(PETSC_SUCCESS);
1317: }

1319: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseHIP(Mat A, Mat *v)
1320: {
1321:   Mat_SeqDense    *a    = (Mat_SeqDense *)A->data;
1322:   PetscBool        copy = PETSC_FALSE, reset;
1323:   PetscOffloadMask suboff;

1325:   PetscFunctionBegin;
1326:   PetscCheck(a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1327:   PetscCheck(a->cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1328:   PetscCheck(*v == a->cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1329:   a->matinuse = 0;
1330:   reset       = a->v ? PETSC_TRUE : PETSC_FALSE;
1331:   suboff      = a->cmat->offloadmask; /* calls to ResetArray may change it, so save it here */
1332:   if (suboff == PETSC_OFFLOAD_CPU && !a->v) {
1333:     copy = PETSC_TRUE;
1334:     PetscCall(MatSeqDenseSetPreallocation(A, NULL));
1335:   }
1336:   PetscCall(MatDenseHIPResetArray(a->cmat));
1337:   if (reset) PetscCall(MatDenseResetArray(a->cmat));
1338:   if (copy) {
1339:     PetscCall(MatSeqDenseHIPCopyFromGPU(A));
1340:   } else A->offloadmask = (suboff == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1341:   a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1342:   if (v) *v = NULL;
1343:   PetscFunctionReturn(PETSC_SUCCESS);
1344: }
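
Illustrative sketch (not part of the file) of the submatrix view pair above. The bounds are arbitrary and assume A is at least 2x2; the submatrix shares A's device storage, so the MatScale() below modifies the corresponding block of A.

static PetscErrorCode ExampleSubMatrix(Mat A)
{
  Mat S;

  PetscFunctionBegin;
  PetscCall(MatDenseGetSubMatrix(A, 0, 2, 0, 2, &S)); /* view of rows [0,2) and columns [0,2); no values copied */
  PetscCall(MatScale(S, 2.0));                        /* scales that block of A */
  PetscCall(MatDenseRestoreSubMatrix(A, &S));
  PetscFunctionReturn(PETSC_SUCCESS);
}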

1346: static PetscErrorCode MatDenseSetLDA_SeqDenseHIP(Mat A, PetscInt lda)
1347: {
1348:   Mat_SeqDense    *cA = (Mat_SeqDense *)A->data;
1349:   Mat_SeqDenseHIP *dA = (Mat_SeqDenseHIP *)A->spptr;
1350:   PetscBool        data;

1352:   PetscFunctionBegin;
1353:   data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1354:   PetscCheck(dA->user_alloc || data || cA->lda == lda, PETSC_COMM_SELF, PETSC_ERR_ORDER, "LDA cannot be changed after allocation of internal storage");
1355:   PetscCheck(lda >= A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "LDA %" PetscInt_FMT " must be at least matrix dimension %" PetscInt_FMT, lda, A->rmap->n);
1356:   cA->lda = lda;
1357:   PetscFunctionReturn(PETSC_SUCCESS);
1358: }

1360: static PetscErrorCode MatSetUp_SeqDenseHIP(Mat A)
1361: {
1362:   PetscFunctionBegin;
1363:   PetscCall(PetscLayoutSetUp(A->rmap));
1364:   PetscCall(PetscLayoutSetUp(A->cmap));
1365:   if (!A->preallocated) PetscCall(MatSeqDenseHIPSetPreallocation(A, NULL));
1366:   PetscFunctionReturn(PETSC_SUCCESS);
1367: }

1369: static PetscErrorCode MatBindToCPU_SeqDenseHIP(Mat A, PetscBool flg)
1370: {
1371:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1373:   PetscFunctionBegin;
1374:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1375:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1376:   A->boundtocpu = flg;
1377:   if (!flg) {
1378:     PetscBool iship;

1380:     PetscCall(PetscObjectTypeCompare((PetscObject)a->cvec, VECSEQHIP, &iship));
1381:     if (!iship) PetscCall(VecDestroy(&a->cvec));
1382:     PetscCall(PetscObjectTypeCompare((PetscObject)a->cmat, MATSEQDENSEHIP, &iship));
1383:     if (!iship) PetscCall(MatDestroy(&a->cmat));
1384:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDenseHIP));
1385:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseHIP));
1386:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_SeqDenseHIP));
1387:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDenseHIP));
1388:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDenseHIP));
1389:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDenseHIP));
1390:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDenseHIP));
1391:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDenseHIP));
1392:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDenseHIP));
1393:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDenseHIP));
1394:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDenseHIP));
1395:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDenseHIP));
1396:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDenseHIP));

1398:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayAndMemType_C", MatDenseGetArrayAndMemType_SeqDenseHIP));
1399:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayAndMemType_C", MatDenseRestoreArrayAndMemType_SeqDenseHIP));
1400:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayReadAndMemType_C", MatDenseGetArrayReadAndMemType_SeqDenseHIP));
1401:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayReadAndMemType_C", MatDenseRestoreArrayReadAndMemType_SeqDenseHIP));
1402:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWriteAndMemType_C", MatDenseGetArrayWriteAndMemType_SeqDenseHIP));
1403:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayWriteAndMemType_C", MatDenseRestoreArrayWriteAndMemType_SeqDenseHIP));

1405:     A->ops->duplicate               = MatDuplicate_SeqDenseHIP;
1406:     A->ops->mult                    = MatMult_SeqDenseHIP;
1407:     A->ops->multadd                 = MatMultAdd_SeqDenseHIP;
1408:     A->ops->multtranspose           = MatMultTranspose_SeqDenseHIP;
1409:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseHIP;
1410:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP;
1411:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseHIP_SeqDenseHIP;
1412:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseHIP_SeqDenseHIP;
1413:     A->ops->axpy                    = MatAXPY_SeqDenseHIP;
1414:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseHIP;
1415:     A->ops->lufactor                = MatLUFactor_SeqDenseHIP;
1416:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseHIP;
1417:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseHIP;
1418:     A->ops->scale                   = MatScale_SeqDenseHIP;
1419:     A->ops->shift                   = MatShift_SeqDenseHIP;
1420:     A->ops->copy                    = MatCopy_SeqDenseHIP;
1421:     A->ops->zeroentries             = MatZeroEntries_SeqDenseHIP;
1422:     A->ops->setup                   = MatSetUp_SeqDenseHIP;
1423:   } else {
1424:     /* make sure we have an up-to-date copy on the CPU */
1425:     PetscCall(MatSeqDenseHIPCopyFromGPU(A));
1426:     PetscCall(PetscFree(A->defaultrandtype));
1427:     PetscCall(PetscStrallocpy(PETSCRANDER48, &A->defaultrandtype));
1428:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDense));
1429:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense));
1430:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense));
1431:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense));
1432:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense));
1433:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense));
1434:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense));
1435:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense));
1436:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense));
1437:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense));
1438:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense));
1439:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1440:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDense));

1442:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayAndMemType_C", NULL));
1443:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayAndMemType_C", NULL));
1444:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayReadAndMemType_C", NULL));
1445:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayReadAndMemType_C", NULL));
1446:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWriteAndMemType_C", NULL));
1447:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayWriteAndMemType_C", NULL));

1449:     A->ops->duplicate               = MatDuplicate_SeqDense;
1450:     A->ops->mult                    = MatMult_SeqDense;
1451:     A->ops->multadd                 = MatMultAdd_SeqDense;
1452:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1453:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1454:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1455:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1456:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1457:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1458:     A->ops->axpy                    = MatAXPY_SeqDense;
1459:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1460:     A->ops->lufactor                = MatLUFactor_SeqDense;
1461:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1462:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1463:     A->ops->scale                   = MatScale_SeqDense;
1464:     A->ops->shift                   = MatShift_SeqDense;
1465:     A->ops->copy                    = MatCopy_SeqDense;
1466:     A->ops->zeroentries             = MatZeroEntries_SeqDense;
1467:     A->ops->setup                   = MatSetUp_SeqDense;
1468:     A->ops->setrandom               = MatSetRandom_SeqDense;
1469:   }
1470:   if (a->cmat) PetscCall(MatBindToCPU(a->cmat, flg));
1471:   PetscFunctionReturn(PETSC_SUCCESS);
1472: }
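
Illustrative sketch (not part of the file) of the user-facing side of the operation-table swap above. Binding to the CPU first synchronizes a host copy of the values (via MatSeqDenseHIPCopyFromGPU()) and then installs the _SeqDense host kernels; unbinding restores the _SeqDenseHIP implementations.

static PetscErrorCode ExampleBindToCPU(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatBindToCPU(A, PETSC_TRUE));  /* subsequent MatMult, MatAXPY, ... use the host kernels */
  /* ... host-only phase of the computation ... */
  PetscCall(MatBindToCPU(A, PETSC_FALSE)); /* switch back to the HIP kernels */
  PetscFunctionReturn(PETSC_SUCCESS);
}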

1474: PetscErrorCode MatConvert_SeqDenseHIP_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1475: {
1476:   Mat           B;
1477:   Mat_SeqDense *a;

1479:   PetscFunctionBegin;
1480:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1481:     /* TODO these cases should be optimized */
1482:     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
1483:     PetscFunctionReturn(PETSC_SUCCESS);
1484:   }

1486:   B = *newmat;
1487:   PetscCall(MatBindToCPU_SeqDenseHIP(B, PETSC_TRUE));
1488:   PetscCall(MatReset_SeqDenseHIP(B));
1489:   PetscCall(PetscFree(B->defaultvectype));
1490:   PetscCall(PetscStrallocpy(VECSTANDARD, &B->defaultvectype));
1491:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSE));
1492:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensehip_seqdense_C", NULL));
1493:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArray_C", NULL));
1494:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayRead_C", NULL));
1495:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayWrite_C", NULL));
1496:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArray_C", NULL));
1497:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayRead_C", NULL));
1498:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayWrite_C", NULL));
1499:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPPlaceArray_C", NULL));
1500:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPResetArray_C", NULL));
1501:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPReplaceArray_C", NULL));
1502:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensehip_C", NULL));
1503:   a = (Mat_SeqDense *)B->data;
1504:   PetscCall(VecDestroy(&a->cvec)); /* cvec might be VECSEQHIP. Destroy it and rebuild a VECSEQ when needed */
1505:   B->ops->bindtocpu = NULL;
1506:   B->ops->destroy   = MatDestroy_SeqDense;
1507:   B->offloadmask    = PETSC_OFFLOAD_CPU;
1508:   PetscFunctionReturn(PETSC_SUCCESS);
1509: }

1511: PetscErrorCode MatConvert_SeqDense_SeqDenseHIP(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1512: {
1513:   Mat_SeqDenseHIP *dB;
1514:   Mat_SeqDense    *a;
1515:   Mat              B;

1517:   PetscFunctionBegin;
1518:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
1519:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1520:     /* TODO these cases should be optimized */
1521:     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
1522:     PetscFunctionReturn(PETSC_SUCCESS);
1523:   }

1525:   B = *newmat;
1526:   PetscCall(PetscFree(B->defaultvectype));
1527:   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
1528:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSEHIP));
1529:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensehip_seqdense_C", MatConvert_SeqDenseHIP_SeqDense));
1530:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArray_C", MatDenseHIPGetArray_SeqDenseHIP));
1531:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayRead_C", MatDenseHIPGetArrayRead_SeqDenseHIP));
1532:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPGetArrayWrite_C", MatDenseHIPGetArrayWrite_SeqDenseHIP));
1533:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArray_C", MatDenseHIPRestoreArray_SeqDenseHIP));
1534:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayRead_C", MatDenseHIPRestoreArrayRead_SeqDenseHIP));
1535:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPRestoreArrayWrite_C", MatDenseHIPRestoreArrayWrite_SeqDenseHIP));
1536:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPPlaceArray_C", MatDenseHIPPlaceArray_SeqDenseHIP));
1537:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPResetArray_C", MatDenseHIPResetArray_SeqDenseHIP));
1538:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseHIPReplaceArray_C", MatDenseHIPReplaceArray_SeqDenseHIP));
1539:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensehip_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
1540:   a = (Mat_SeqDense *)B->data;
1541:   PetscCall(VecDestroy(&a->cvec)); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQHIP when needed */
1542:   PetscCall(PetscNew(&dB));
1543:   B->spptr       = dB;
1544:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1545:   PetscCall(MatBindToCPU_SeqDenseHIP(B, PETSC_FALSE));
1546:   B->ops->bindtocpu = MatBindToCPU_SeqDenseHIP;
1547:   B->ops->destroy   = MatDestroy_SeqDenseHIP;
1548:   PetscFunctionReturn(PETSC_SUCCESS);
1549: }
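
Illustrative sketch (not part of the file): the two converters above are exercised by in-place MatConvert() calls such as the hypothetical helper below; as the code shows, MAT_INITIAL_MATRIX and MAT_REUSE_MATRIX fall back to MatConvert_Basic() instead.

static PetscErrorCode ExampleConvertHostDevice(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatConvert(A, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &A)); /* host dense -> HIP dense */
  PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));    /* back to host dense */
  PetscFunctionReturn(PETSC_SUCCESS);
}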

1551: /*@C
1552:    MatCreateSeqDenseHIP - Creates a sequential matrix in dense format using HIP.

1554:    Collective

1556:    Input Parameters:
1557: +  comm - MPI communicator
1558: .  m - number of rows
1559: .  n - number of columns
1560: -  data - optional location of GPU matrix data.  Set data=NULL for PETSc
1561:    to control matrix memory allocation.

1563:    Output Parameter:
1564: .  A - the matrix

1566:    Notes:
1567:    If data is provided, it must point to GPU (device) memory; PETSc does not free it, and it must remain valid for the lifetime of the matrix.

1568:    Level: intermediate

1570: .seealso: `MATSEQDENSE`, `MatCreate()`, `MatCreateSeqDense()`
1571: @*/
1572: PetscErrorCode MatCreateSeqDenseHIP(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A)
1573: {
1574:   PetscMPIInt size;

1576:   PetscFunctionBegin;
1577:   PetscCallMPI(MPI_Comm_size(comm, &size));
1578:   PetscCheck(size <= 1, comm, PETSC_ERR_ARG_WRONG, "Invalid communicator size %d", size);
1579:   PetscCall(MatCreate(comm, A));
1580:   PetscCall(MatSetSizes(*A, m, n, m, n));
1581:   PetscCall(MatSetType(*A, MATSEQDENSEHIP));
1582:   PetscCall(MatSeqDenseHIPSetPreallocation(*A, data));
1583:   PetscFunctionReturn(PETSC_SUCCESS);
1584: }
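
A minimal usage sketch (illustrative, not part of the file) of MatCreateSeqDenseHIP() with PETSc-managed device storage; passing data = NULL makes PETSc allocate and zero the GPU array, so the MatShift() below produces ones on the diagonal.

static PetscErrorCode ExampleCreateSeqDenseHIP(void)
{
  Mat A;

  PetscFunctionBegin;
  PetscCall(MatCreateSeqDenseHIP(PETSC_COMM_SELF, 10, 10, NULL, &A)); /* PETSc allocates the device storage */
  PetscCall(MatShift(A, 1.0));                                        /* A is now the 10x10 identity */
  /* ... use A ... */
  PetscCall(MatDestroy(&A));
  PetscFunctionReturn(PETSC_SUCCESS);
}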

1586: /*MC
1587:    MATSEQDENSEHIP - MATSEQDENSEHIP = "seqdensehip" - A matrix type to be used for sequential dense matrices on GPUs.

1589:    Options Database Keys:
1590: . -mat_type seqdensehip - sets the matrix type to `MATSEQDENSEHIP` during a call to `MatSetFromOptions()`

1592:   Level: beginner

1594: .seealso: `MATSEQDENSE`
1595: M*/
1596: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseHIP(Mat B)
1597: {
1598:   PetscFunctionBegin;
1599:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
1600:   PetscCall(MatCreate_SeqDense(B));
1601:   PetscCall(MatConvert_SeqDense_SeqDenseHIP(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
1602:   PetscFunctionReturn(PETSC_SUCCESS);
1603: }
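
Illustrative sketch (not part of the file): the same type can also be selected through the generic creation path, which ends up in MatCreate_SeqDenseHIP() above; -mat_type seqdensehip together with MatSetFromOptions() has the same effect.

static PetscErrorCode ExampleCreateByType(void)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(MatCreate(PETSC_COMM_SELF, &B));
  PetscCall(MatSetSizes(B, 8, 8, 8, 8));
  PetscCall(MatSetType(B, MATSEQDENSEHIP)); /* calls MatCreate_SeqDenseHIP() */
  PetscCall(MatSetUp(B));                   /* MatSetUp_SeqDenseHIP() preallocates the device storage */
  PetscCall(MatZeroEntries(B));
  PetscCall(MatDestroy(&B));
  PetscFunctionReturn(PETSC_SUCCESS);
}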