Actual source code: densecuda.cu

  1: /*
  2:      Defines the matrix operations for sequential dense matrices with CUDA
  3: */
  4: #include <petscpkg_version.h>
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  6: #include <../src/mat/impls/dense/seq/dense.h>
  7: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
  8: #include <petsc/private/petsclegacycupmblas.h>
  9: #include <thrust/device_ptr.h>
 10: #include <thrust/functional.h>
 11: #include <thrust/iterator/counting_iterator.h>
 12: #include <thrust/iterator/transform_iterator.h>
 13: #include <thrust/iterator/permutation_iterator.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/device_vector.h>

 17: using VecSeq_CUDA = Petsc::vec::cupm::impl::VecSeq_CUPM<Petsc::device::cupm::DeviceType::CUDA>;

 19: typedef struct {
 20:   PetscScalar *d_v; /* pointer to the matrix on the GPU */
 21:   PetscBool    user_alloc;
 22:   PetscScalar *unplacedarray; /* if one called MatCUDADensePlaceArray(), this is where it stashed the original */
 23:   PetscBool    unplaced_user_alloc;
 24:   /* factorization support */
 25:   PetscCuBLASInt *d_fact_ipiv; /* device pivots */
 26:   PetscScalar    *d_fact_tau;  /* device QR tau vector */
 27:   PetscScalar    *d_fact_work; /* device workspace */
 28:   PetscCuBLASInt  fact_lwork;
 29:   PetscCuBLASInt *d_fact_info; /* device info */
 30:   /* workspace */
 31:   Vec workvec;
 32: } Mat_SeqDenseCUDA;

 34: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
 35: {
 36:   Mat_SeqDense     *cA = (Mat_SeqDense *)A->data;
 37:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
 38:   PetscBool         iscuda;

 40:   PetscFunctionBegin;
 41:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &iscuda));
 42:   if (!iscuda) PetscFunctionReturn(PETSC_SUCCESS);
 43:   PetscCall(PetscLayoutSetUp(A->rmap));
 44:   PetscCall(PetscLayoutSetUp(A->cmap));
 45:   /* CPU preallocation may not have been performed yet */
 46:   if (cA->lda <= 0) cA->lda = A->rmap->n;
 47:   if (!dA->user_alloc) PetscCallCUDA(cudaFree(dA->d_v));
 48:   if (!d_data) { /* petsc-allocated storage */
 49:     size_t sz;

 51:     PetscCall(PetscIntMultError(cA->lda, A->cmap->n, NULL));
 52:     sz = cA->lda * A->cmap->n * sizeof(PetscScalar);
 53:     PetscCallCUDA(cudaMalloc((void **)&dA->d_v, sz));
 54:     PetscCallCUDA(cudaMemset(dA->d_v, 0, sz));
 55:     dA->user_alloc = PETSC_FALSE;
 56:   } else { /* user-allocated storage */
 57:     dA->d_v        = d_data;
 58:     dA->user_alloc = PETSC_TRUE;
 59:   }
 60:   A->offloadmask  = PETSC_OFFLOAD_GPU;
 61:   A->preallocated = PETSC_TRUE;
 62:   A->assembled    = PETSC_TRUE;
 63:   PetscFunctionReturn(PETSC_SUCCESS);
 64: }
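
/* Usage sketch (added illustration, not part of the original densecuda.cu): the routine above
   attaches device storage to a MATSEQDENSECUDA matrix, either PETSc-allocated (d_data == NULL)
   or user-provided. A minimal sketch of both paths follows; it assumes a CUDA-enabled PETSc
   build, and ExampleCreateDenseCUDA is a hypothetical helper (cf. MatCreateSeqDenseCUDA()). */
#if 0
static PetscErrorCode ExampleCreateDenseCUDA(PetscInt m, PetscInt n, Mat *A)
{
  PetscScalar *d_user = NULL;

  PetscFunctionBegin;
  /* Path 1: let PETSc allocate (and zero) the device buffer */
  PetscCall(MatCreate(PETSC_COMM_SELF, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQDENSECUDA));
  PetscCall(MatSeqDenseCUDASetPreallocation(*A, NULL));
  /* Path 2: hand over a user-allocated device buffer (column-major, lda = m);
     the previous PETSc-owned buffer is freed by the routine above */
  PetscCallCUDA(cudaMalloc((void **)&d_user, m * n * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemset(d_user, 0, m * n * sizeof(PetscScalar)));
  PetscCall(MatSeqDenseCUDASetPreallocation(*A, d_user)); /* user frees d_user after MatDestroy() */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif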

 66: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
 67: {
 68:   Mat_SeqDense     *cA = (Mat_SeqDense *)A->data;
 69:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

 71:   PetscFunctionBegin;
 72:   PetscCheckTypeName(A, MATSEQDENSECUDA);
 73:   PetscCall(PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing", A->rmap->n, A->cmap->n));
 74:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
 75:     if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
 76:       PetscCall(MatSeqDenseSetPreallocation(A, NULL));
 77:     }
 78:     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, A, 0, 0, 0));
 79:     if (cA->lda > A->rmap->n) {
 80:       PetscCallCUDA(cudaMemcpy2D(cA->v, cA->lda * sizeof(PetscScalar), dA->d_v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, cudaMemcpyDeviceToHost));
 81:     } else {
 82:       PetscCallCUDA(cudaMemcpy(cA->v, dA->d_v, cA->lda * sizeof(PetscScalar) * A->cmap->n, cudaMemcpyDeviceToHost));
 83:     }
 84:     PetscCall(PetscLogGpuToCpu(cA->lda * sizeof(PetscScalar) * A->cmap->n));
 85:     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, A, 0, 0, 0));

 87:     A->offloadmask = PETSC_OFFLOAD_BOTH;
 88:   }
 89:   PetscFunctionReturn(PETSC_SUCCESS);
 90: }

 92: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
 93: {
 94:   Mat_SeqDense     *cA = (Mat_SeqDense *)A->data;
 95:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
 96:   PetscBool         copy;

 98:   PetscFunctionBegin;
 99:   PetscCheckTypeName(A, MATSEQDENSECUDA);
100:   if (A->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
101:   copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
102:   PetscCall(PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", A->rmap->n, A->cmap->n));
103:   if (copy) {
104:     if (!dA->d_v) { /* Allocate GPU memory if not present */
105:       PetscCall(MatSeqDenseCUDASetPreallocation(A, NULL));
106:     }
107:     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, A, 0, 0, 0));
108:     if (cA->lda > A->rmap->n) {
109:       PetscCallCUDA(cudaMemcpy2D(dA->d_v, cA->lda * sizeof(PetscScalar), cA->v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, cudaMemcpyHostToDevice));
110:     } else {
111:       PetscCallCUDA(cudaMemcpy(dA->d_v, cA->v, cA->lda * sizeof(PetscScalar) * A->cmap->n, cudaMemcpyHostToDevice));
112:     }
113:     PetscCall(PetscLogCpuToGpu(cA->lda * sizeof(PetscScalar) * A->cmap->n));
114:     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, A, 0, 0, 0));

116:     A->offloadmask = PETSC_OFFLOAD_BOTH;
117:   }
118:   PetscFunctionReturn(PETSC_SUCCESS);
119: }
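
/* Usage sketch (added illustration, not part of the original densecuda.cu): the two mirroring
   routines above lazily synchronize the host (cA->v) and device (dA->d_v) copies, with
   A->offloadmask recording which copy is current. The hypothetical helper below walks through a
   typical sequence; the mask values in the comments are what the code above implies. */
#if 0
static PetscErrorCode ExampleOffloadLifecycle(Mat A /* MATSEQDENSECUDA */)
{
  PetscScalar *host, *dev;

  PetscFunctionBegin;
  PetscCall(MatDenseGetArray(A, &host));    /* GPU -> CPU copy if needed; host copy marked current (PETSC_OFFLOAD_CPU) */
  host[0] = 1.0;                            /* modify on the host */
  PetscCall(MatDenseRestoreArray(A, &host));
  PetscCall(MatDenseCUDAGetArray(A, &dev)); /* CPU -> GPU copy via MatSeqDenseCUDACopyToGPU(): mask becomes PETSC_OFFLOAD_BOTH */
  /* ... launch kernels / cuBLAS calls on dev ... */
  PetscCall(MatDenseCUDARestoreArray(A, &dev)); /* device copy is subsequently treated as the current one */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif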

121: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A, Mat B, MatStructure str)
122: {
123:   const PetscScalar *va;
124:   PetscScalar       *vb;
125:   PetscInt           lda1, lda2, m = A->rmap->n, n = A->cmap->n;

127:   PetscFunctionBegin;
128:   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
129:   if (A->ops->copy != B->ops->copy) {
130:     PetscCall(MatCopy_Basic(A, B, str));
131:     PetscFunctionReturn(PETSC_SUCCESS);
132:   }
133:   PetscCheck(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
134:   PetscCall(MatDenseCUDAGetArrayRead(A, &va));
135:   PetscCall(MatDenseCUDAGetArrayWrite(B, &vb));
136:   PetscCall(MatDenseGetLDA(A, &lda1));
137:   PetscCall(MatDenseGetLDA(B, &lda2));
138:   PetscCall(PetscLogGpuTimeBegin());
139:   if (lda1 > m || lda2 > m) {
140:     PetscCallCUDA(cudaMemcpy2D(vb, lda2 * sizeof(PetscScalar), va, lda1 * sizeof(PetscScalar), m * sizeof(PetscScalar), n, cudaMemcpyDeviceToDevice));
141:   } else {
142:     PetscCallCUDA(cudaMemcpy(vb, va, m * (n * sizeof(PetscScalar)), cudaMemcpyDeviceToDevice));
143:   }
144:   PetscCall(PetscLogGpuTimeEnd());
145:   PetscCall(MatDenseCUDARestoreArrayWrite(B, &vb));
146:   PetscCall(MatDenseCUDARestoreArrayRead(A, &va));
147:   PetscFunctionReturn(PETSC_SUCCESS);
148: }

150: static PetscErrorCode MatZeroEntries_SeqDenseCUDA(Mat A)
151: {
152:   PetscScalar *va;
153:   PetscInt     lda, m = A->rmap->n, n = A->cmap->n;

155:   PetscFunctionBegin;
156:   PetscCall(MatDenseCUDAGetArrayWrite(A, &va));
157:   PetscCall(MatDenseGetLDA(A, &lda));
158:   PetscCall(PetscLogGpuTimeBegin());
159:   if (lda > m) {
160:     PetscCallCUDA(cudaMemset2D(va, lda * sizeof(PetscScalar), 0, m * sizeof(PetscScalar), n));
161:   } else {
162:     PetscCallCUDA(cudaMemset(va, 0, m * (n * sizeof(PetscScalar))));
163:   }
164:   PetscCall(PetscLogGpuTimeEnd());
165:   PetscCall(MatDenseCUDARestoreArrayWrite(A, &va));
166:   PetscFunctionReturn(PETSC_SUCCESS);
167: }

169: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
170: {
171:   Mat_SeqDense     *aa = (Mat_SeqDense *)A->data;
172:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

174:   PetscFunctionBegin;
175:   PetscCheck(!aa->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
176:   PetscCheck(!aa->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
177:   PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseCUDAResetArray() must be called first");
178:   if (aa->v) PetscCall(MatSeqDenseCUDACopyToGPU(A));
179:   dA->unplacedarray       = dA->d_v;
180:   dA->unplaced_user_alloc = dA->user_alloc;
181:   dA->d_v                 = (PetscScalar *)a;
182:   dA->user_alloc          = PETSC_TRUE;
183:   PetscFunctionReturn(PETSC_SUCCESS);
184: }
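
/* Usage sketch (added illustration, not part of the original densecuda.cu): PlaceArray stashes the
   current device buffer and substitutes a caller-owned one; ResetArray (below) restores the stashed
   buffer. ExamplePlaceResetArray is a hypothetical helper showing the required pairing. */
#if 0
static PetscErrorCode ExamplePlaceResetArray(Mat A, PetscScalar *d_buffer /* device array of at least lda*n entries */)
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUDAPlaceArray(A, d_buffer)); /* A now reads and writes d_buffer */
  /* ... operate on A: MatMult(), MatScale(), ... */
  PetscCall(MatDenseCUDAResetArray(A));           /* original device storage is restored */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif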

186: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
187: {
188:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
189:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

191:   PetscFunctionBegin;
192:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
193:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
194:   if (a->v) PetscCall(MatSeqDenseCUDACopyToGPU(A));
195:   dA->d_v           = dA->unplacedarray;
196:   dA->user_alloc    = dA->unplaced_user_alloc;
197:   dA->unplacedarray = NULL;
198:   PetscFunctionReturn(PETSC_SUCCESS);
199: }

201: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
202: {
203:   Mat_SeqDense     *aa = (Mat_SeqDense *)A->data;
204:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

206:   PetscFunctionBegin;
207:   PetscCheck(!aa->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
208:   PetscCheck(!aa->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
209:   PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseCUDAResetArray() must be called first");
210:   if (!dA->user_alloc) PetscCallCUDA(cudaFree(dA->d_v));
211:   dA->d_v        = (PetscScalar *)a;
212:   dA->user_alloc = PETSC_FALSE;
213:   PetscFunctionReturn(PETSC_SUCCESS);
214: }

216: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
217: {
218:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

220:   PetscFunctionBegin;
221:   if (!dA->d_v) PetscCall(MatSeqDenseCUDASetPreallocation(A, NULL));
222:   *a = dA->d_v;
223:   PetscFunctionReturn(PETSC_SUCCESS);
224: }

226: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat, PetscScalar **a)
227: {
228:   PetscFunctionBegin;
229:   if (a) *a = NULL;
230:   PetscFunctionReturn(PETSC_SUCCESS);
231: }

233: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
234: {
235:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

237:   PetscFunctionBegin;
238:   PetscCall(MatSeqDenseCUDACopyToGPU(A));
239:   *a = dA->d_v;
240:   PetscFunctionReturn(PETSC_SUCCESS);
241: }

243: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat, const PetscScalar **a)
244: {
245:   PetscFunctionBegin;
246:   if (a) *a = NULL;
247:   PetscFunctionReturn(PETSC_SUCCESS);
248: }

250: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
251: {
252:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

254:   PetscFunctionBegin;
255:   PetscCall(MatSeqDenseCUDACopyToGPU(A));
256:   *a = dA->d_v;
257:   PetscFunctionReturn(PETSC_SUCCESS);
258: }

260: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat, PetscScalar **a)
261: {
262:   PetscFunctionBegin;
263:   if (a) *a = NULL;
264:   PetscFunctionReturn(PETSC_SUCCESS);
265: }
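
/* Usage sketch (added illustration, not part of the original densecuda.cu): the three accessor pairs
   above differ only in synchronization: Read syncs host-to-device first, Write skips any copy because
   the contents are about to be overwritten, and the plain variant syncs and then treats the device
   copy as current. ExampleDeviceAccess is a hypothetical helper. */
#if 0
static PetscErrorCode ExampleDeviceAccess(Mat A /* MATSEQDENSECUDA */)
{
  PetscScalar       *dv;
  const PetscScalar *dvr;
  PetscInt           m, n, lda;

  PetscFunctionBegin;
  PetscCall(MatGetSize(A, &m, &n));
  PetscCall(MatDenseGetLDA(A, &lda));
  PetscCall(MatDenseCUDAGetArrayWrite(A, &dv)); /* no host-to-device copy: everything is overwritten below */
  PetscCallCUDA(cudaMemset(dv, 0, lda * n * sizeof(PetscScalar)));
  PetscCall(MatDenseCUDARestoreArrayWrite(A, &dv));
  PetscCall(MatDenseCUDAGetArrayRead(A, &dvr)); /* would sync CPU -> GPU if the host copy were newer */
  /* ... pass dvr to a cuBLAS/cuSOLVER call ... */
  PetscCall(MatDenseCUDARestoreArrayRead(A, &dvr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif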

267: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
268: {
269: #if PETSC_PKG_CUDA_VERSION_GE(10, 1, 0)
270:   Mat_SeqDense      *a  = (Mat_SeqDense *)A->data;
271:   Mat_SeqDenseCUDA  *dA = (Mat_SeqDenseCUDA *)A->spptr;
272:   PetscScalar       *da;
273:   cusolverDnHandle_t handle;
274:   PetscCuBLASInt     n, lda;
275:   #if defined(PETSC_USE_DEBUG)
276:   PetscCuBLASInt info;
277:   #endif

279:   PetscFunctionBegin;
280:   if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
281:   PetscCall(PetscCUSOLVERDnGetHandle(&handle));
282:   PetscCall(PetscCuBLASIntCast(A->cmap->n, &n));
283:   PetscCall(PetscCuBLASIntCast(a->lda, &lda));
284:   PetscCheck(A->factortype != MAT_FACTOR_LU, PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDngetri not implemented");
285:   if (A->factortype == MAT_FACTOR_CHOLESKY) {
286:     if (!dA->d_fact_ipiv) { /* spd */
287:       PetscCuBLASInt il;

289:       PetscCall(MatDenseCUDAGetArray(A, &da));
290:       PetscCallCUSOLVER(cusolverDnXpotri_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, &il));
291:       if (il > dA->fact_lwork) {
292:         dA->fact_lwork = il;

294:         PetscCallCUDA(cudaFree(dA->d_fact_work));
295:         PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
296:       }
297:       PetscCall(PetscLogGpuTimeBegin());
298:       PetscCallCUSOLVER(cusolverDnXpotri(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
299:       PetscCall(PetscLogGpuTimeEnd());
300:       PetscCall(MatDenseCUDARestoreArray(A, &da));
301:       /* TODO (write cuda kernel) */
302:       PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
303:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDnsytri not implemented");
304:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Not implemented");
305:   #if defined(PETSC_USE_DEBUG)
306:   PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
307:   PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: leading minor of order %d is zero", info);
308:   PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
309:   #endif
310:   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
311:   A->ops->solve          = NULL;
312:   A->ops->solvetranspose = NULL;
313:   A->ops->matsolve       = NULL;
314:   A->factortype          = MAT_FACTOR_NONE;

316:   PetscCall(PetscFree(A->solvertype));
317:   PetscFunctionReturn(PETSC_SUCCESS);
318: #else
319:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
320: #endif
321: }

323: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscBool))
324: {
325:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
326:   PetscScalar      *y;
327:   PetscCuBLASInt    m = 0, k = 0;
328:   PetscBool         xiscuda, yiscuda, aiscuda;

330:   PetscFunctionBegin;
331:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
332:   PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
333:   PetscCall(PetscCuBLASIntCast(A->cmap->n, &k));
334:   PetscCall(PetscObjectTypeCompare((PetscObject)xx, VECSEQCUDA, &xiscuda));
335:   PetscCall(PetscObjectTypeCompare((PetscObject)yy, VECSEQCUDA, &yiscuda));
336:   {
337:     const PetscScalar *x;
338:     PetscBool          xishost = PETSC_TRUE;

 340:     /* The logic here is to minimize the amount of memory copying: if we called
 341:        VecCUDAGetArrayRead(X,&x) whenever xiscuda is true but the data has not yet
 342:        been offloaded to the GPU, the data would first be copied to the GPU.  But we
 343:        only need the data in order to copy it into the y array, so we instead take
 344:        the array x from wherever the data already resides, and only a single memcpy
 345:        is performed */
346:     if (xiscuda && xx->offloadmask & PETSC_OFFLOAD_GPU) {
347:       PetscCall(VecCUDAGetArrayRead(xx, &x));
348:       xishost = PETSC_FALSE;
349:     } else {
350:       PetscCall(VecGetArrayRead(xx, &x));
351:     }
352:     if (k < m || !yiscuda) {
353:       if (!dA->workvec) PetscCall(VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec)));
354:       PetscCall(VecCUDAGetArrayWrite(dA->workvec, &y));
355:     } else {
356:       PetscCall(VecCUDAGetArrayWrite(yy, &y));
357:     }
358:     PetscCallCUDA(cudaMemcpy(y, x, m * sizeof(PetscScalar), xishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice));
359:   }
360:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &aiscuda));
361:   if (!aiscuda) PetscCall(MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A));
362:   PetscCall((*matsolve)(A, y, m, m, 1, k, transpose));
363:   if (!aiscuda) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
364:   if (k < m || !yiscuda) {
365:     PetscScalar *yv;

367:     /* The logic here is that the data is not yet in either yy's GPU array or its
368:        CPU array.  There is nothing in the interface to say where the user would like
369:        it to end up.  So we choose the GPU, because it is the faster option */
370:     if (yiscuda) {
371:       PetscCall(VecCUDAGetArrayWrite(yy, &yv));
372:     } else {
373:       PetscCall(VecGetArray(yy, &yv));
374:     }
375:     PetscCallCUDA(cudaMemcpy(yv, y, k * sizeof(PetscScalar), yiscuda ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
376:     if (yiscuda) {
377:       PetscCall(VecCUDARestoreArrayWrite(yy, &yv));
378:     } else {
379:       PetscCall(VecRestoreArray(yy, &yv));
380:     }
381:     PetscCall(VecCUDARestoreArrayWrite(dA->workvec, &y));
382:   } else {
383:     PetscCall(VecCUDARestoreArrayWrite(yy, &y));
384:   }
385:   PetscFunctionReturn(PETSC_SUCCESS);
386: }

388: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Internal(Mat A, Mat B, Mat X, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscBool))
389: {
390:   PetscScalar   *y;
391:   PetscInt       n, _ldb, _ldx;
392:   PetscBool      biscuda, xiscuda, aiscuda;
393:   PetscCuBLASInt nrhs = 0, m = 0, k = 0, ldb = 0, ldx = 0, ldy = 0;

395:   PetscFunctionBegin;
396:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
397:   PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
398:   PetscCall(PetscCuBLASIntCast(A->cmap->n, &k));
399:   PetscCall(MatGetSize(B, NULL, &n));
400:   PetscCall(PetscCuBLASIntCast(n, &nrhs));
401:   PetscCall(MatDenseGetLDA(B, &_ldb));
402:   PetscCall(PetscCuBLASIntCast(_ldb, &ldb));
403:   PetscCall(MatDenseGetLDA(X, &_ldx));
404:   PetscCall(PetscCuBLASIntCast(_ldx, &ldx));

406:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
407:   PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSECUDA, &xiscuda));
408:   {
 409:     /* The logic here is to minimize the amount of memory copying: if we called
 410:        MatDenseCUDAGetArrayRead(B,&b) whenever biscuda is true but the data has not
 411:        yet been offloaded to the GPU, the data would first be copied to the GPU.  But
 412:        we only need the data in order to copy it into the y array, so we instead take
 413:        the array b from wherever the data already resides, and only a single memcpy
 414:        is performed */
415:     const PetscScalar *b;

417:     /* some copying from B will be involved */
418:     PetscBool bishost = PETSC_TRUE;

420:     if (biscuda && B->offloadmask & PETSC_OFFLOAD_GPU) {
421:       PetscCall(MatDenseCUDAGetArrayRead(B, &b));
422:       bishost = PETSC_FALSE;
423:     } else {
424:       PetscCall(MatDenseGetArrayRead(B, &b));
425:     }
426:     if (ldx < m || !xiscuda) {
427:       /* X's array cannot serve as the array (too small or not on device), B's
428:        * array cannot serve as the array (const), so allocate a new array  */
429:       ldy = m;
430:       PetscCallCUDA(cudaMalloc((void **)&y, nrhs * m * sizeof(PetscScalar)));
431:     } else {
432:       /* X's array should serve as the array */
433:       ldy = ldx;
434:       PetscCall(MatDenseCUDAGetArrayWrite(X, &y));
435:     }
436:     PetscCallCUDA(cudaMemcpy2D(y, ldy * sizeof(PetscScalar), b, ldb * sizeof(PetscScalar), m * sizeof(PetscScalar), nrhs, bishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice));
437:     if (bishost) {
438:       PetscCall(MatDenseRestoreArrayRead(B, &b));
439:     } else {
440:       PetscCall(MatDenseCUDARestoreArrayRead(B, &b));
441:     }
442:   }
443:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &aiscuda));
444:   if (!aiscuda) PetscCall(MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A));
445:   PetscCall((*matsolve)(A, y, ldy, m, nrhs, k, transpose));
 446:   if (!aiscuda) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
447:   if (ldx < m || !xiscuda) {
448:     PetscScalar *x;

450:     /* The logic here is that the data is not yet in either X's GPU array or its
451:        CPU array.  There is nothing in the interface to say where the user would like
452:        it to end up.  So we choose the GPU, because it is the faster option */
453:     if (xiscuda) {
454:       PetscCall(MatDenseCUDAGetArrayWrite(X, &x));
455:     } else {
456:       PetscCall(MatDenseGetArray(X, &x));
457:     }
458:     PetscCallCUDA(cudaMemcpy2D(x, ldx * sizeof(PetscScalar), y, ldy * sizeof(PetscScalar), k * sizeof(PetscScalar), nrhs, xiscuda ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
459:     if (xiscuda) {
460:       PetscCall(MatDenseCUDARestoreArrayWrite(X, &x));
461:     } else {
462:       PetscCall(MatDenseRestoreArray(X, &x));
463:     }
464:     PetscCallCUDA(cudaFree(y));
465:   } else {
466:     PetscCall(MatDenseCUDARestoreArrayWrite(X, &y));
467:   }
468:   PetscFunctionReturn(PETSC_SUCCESS);
469: }

471: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_LU(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
472: {
473:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
474:   Mat_SeqDenseCUDA  *dA  = (Mat_SeqDenseCUDA *)A->spptr;
475:   const PetscScalar *da;
476:   PetscCuBLASInt     lda;
477:   cusolverDnHandle_t handle;
478:   int                info;

480:   PetscFunctionBegin;
481:   PetscCall(MatDenseCUDAGetArrayRead(A, &da));
482:   PetscCall(PetscCuBLASIntCast(mat->lda, &lda));
483:   PetscCall(PetscCUSOLVERDnGetHandle(&handle));
484:   PetscCall(PetscLogGpuTimeBegin());
485:   PetscCall(PetscInfo(A, "LU solve %d x %d on backend\n", m, k));
486:   PetscCallCUSOLVER(cusolverDnXgetrs(handle, T ? CUBLAS_OP_T : CUBLAS_OP_N, m, nrhs, da, lda, dA->d_fact_ipiv, x, ldx, dA->d_fact_info));
487:   PetscCall(PetscLogGpuTimeEnd());
488:   PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
489:   if (PetscDefined(USE_DEBUG)) {
490:     PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
491:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
492:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
493:   }
494:   PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
495:   PetscFunctionReturn(PETSC_SUCCESS);
496: }

498: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_Cholesky(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool)
499: {
500:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
501:   Mat_SeqDenseCUDA  *dA  = (Mat_SeqDenseCUDA *)A->spptr;
502:   const PetscScalar *da;
503:   PetscCuBLASInt     lda;
504:   cusolverDnHandle_t handle;
505:   int                info;

507:   PetscFunctionBegin;
508:   PetscCall(MatDenseCUDAGetArrayRead(A, &da));
509:   PetscCall(PetscCuBLASIntCast(mat->lda, &lda));
510:   PetscCall(PetscCUSOLVERDnGetHandle(&handle));
511:   PetscCall(PetscLogGpuTimeBegin());
512:   PetscCall(PetscInfo(A, "Cholesky solve %d x %d on backend\n", m, k));
513:   if (!dA->d_fact_ipiv) { /* spd */
514:     /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
515:     PetscCallCUSOLVER(cusolverDnXpotrs(handle, CUBLAS_FILL_MODE_LOWER, m, nrhs, da, lda, x, ldx, dA->d_fact_info));
516:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDnsytrs not implemented");
517:   PetscCall(PetscLogGpuTimeEnd());
518:   PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
519:   if (PetscDefined(USE_DEBUG)) {
520:     PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
521:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
522:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
523:   }
524:   PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
525:   PetscFunctionReturn(PETSC_SUCCESS);
526: }

528: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_QR(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
529: {
530:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
531:   Mat_SeqDenseCUDA  *dA  = (Mat_SeqDenseCUDA *)A->spptr;
532:   const PetscScalar *da;
533:   PetscCuBLASInt     lda, rank;
534:   cusolverDnHandle_t handle;
535:   cublasHandle_t     bhandle;
536:   int                info;
537:   cublasOperation_t  trans;
538:   PetscScalar        one = 1.;

540:   PetscFunctionBegin;
541:   PetscCall(PetscCuBLASIntCast(mat->rank, &rank));
542:   PetscCall(MatDenseCUDAGetArrayRead(A, &da));
543:   PetscCall(PetscCuBLASIntCast(mat->lda, &lda));
544:   PetscCall(PetscCUSOLVERDnGetHandle(&handle));
545:   PetscCall(PetscCUBLASGetHandle(&bhandle));
546:   PetscCall(PetscLogGpuTimeBegin());
547:   PetscCall(PetscInfo(A, "QR solve %d x %d on backend\n", m, k));
548:   if (!T) {
549:     if (PetscDefined(USE_COMPLEX)) {
550:       trans = CUBLAS_OP_C;
551:     } else {
552:       trans = CUBLAS_OP_T;
553:     }
554:     PetscCallCUSOLVER(cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
555:     if (PetscDefined(USE_DEBUG)) {
556:       PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
557:       PetscCheck(info == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
558:     }
559:     PetscCallCUBLAS(cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx));
560:   } else {
561:     PetscCallCUBLAS(cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx));
562:     PetscCallCUSOLVER(cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
563:     if (PetscDefined(USE_DEBUG)) {
564:       PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
565:       PetscCheck(info == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
566:     }
567:   }
568:   PetscCall(PetscLogGpuTimeEnd());
569:   PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
570:   PetscCall(PetscLogFlops(nrhs * (4.0 * m * mat->rank - PetscSqr(mat->rank))));
571:   PetscFunctionReturn(PETSC_SUCCESS);
572: }

574: static PetscErrorCode MatSolve_SeqDenseCUDA_LU(Mat A, Vec xx, Vec yy)
575: {
576:   PetscFunctionBegin;
577:   PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU));
578:   PetscFunctionReturn(PETSC_SUCCESS);
579: }

581: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_LU(Mat A, Vec xx, Vec yy)
582: {
583:   PetscFunctionBegin;
584:   PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU));
585:   PetscFunctionReturn(PETSC_SUCCESS);
586: }

588: static PetscErrorCode MatSolve_SeqDenseCUDA_Cholesky(Mat A, Vec xx, Vec yy)
589: {
590:   PetscFunctionBegin;
591:   PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky));
592:   PetscFunctionReturn(PETSC_SUCCESS);
593: }

595: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A, Vec xx, Vec yy)
596: {
597:   PetscFunctionBegin;
598:   PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky));
599:   PetscFunctionReturn(PETSC_SUCCESS);
600: }

602: static PetscErrorCode MatSolve_SeqDenseCUDA_QR(Mat A, Vec xx, Vec yy)
603: {
604:   PetscFunctionBegin;
605:   PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR));
606:   PetscFunctionReturn(PETSC_SUCCESS);
607: }

609: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_QR(Mat A, Vec xx, Vec yy)
610: {
611:   PetscFunctionBegin;
612:   PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR));
613:   PetscFunctionReturn(PETSC_SUCCESS);
614: }

616: static PetscErrorCode MatMatSolve_SeqDenseCUDA_LU(Mat A, Mat B, Mat X)
617: {
618:   PetscFunctionBegin;
619:   PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU));
620:   PetscFunctionReturn(PETSC_SUCCESS);
621: }

623: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_LU(Mat A, Mat B, Mat X)
624: {
625:   PetscFunctionBegin;
626:   PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU));
627:   PetscFunctionReturn(PETSC_SUCCESS);
628: }

630: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Cholesky(Mat A, Mat B, Mat X)
631: {
632:   PetscFunctionBegin;
633:   PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky));
634:   PetscFunctionReturn(PETSC_SUCCESS);
635: }

637: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A, Mat B, Mat X)
638: {
639:   PetscFunctionBegin;
640:   PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky));
641:   PetscFunctionReturn(PETSC_SUCCESS);
642: }

644: static PetscErrorCode MatMatSolve_SeqDenseCUDA_QR(Mat A, Mat B, Mat X)
645: {
646:   PetscFunctionBegin;
647:   PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR));
648:   PetscFunctionReturn(PETSC_SUCCESS);
649: }

651: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_QR(Mat A, Mat B, Mat X)
652: {
653:   PetscFunctionBegin;
654:   PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR));
655:   PetscFunctionReturn(PETSC_SUCCESS);
656: }

658: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A, IS, IS, const MatFactorInfo *)
659: {
660:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
661:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
662:   PetscScalar      *da;
663:   PetscCuBLASInt    m, n, lda;
664: #if defined(PETSC_USE_DEBUG)
665:   int info;
666: #endif
667:   cusolverDnHandle_t handle;

669:   PetscFunctionBegin;
670:   if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
671:   PetscCall(PetscCUSOLVERDnGetHandle(&handle));
672:   PetscCall(MatDenseGetArrayAndMemType(A, &da, nullptr));
673:   PetscCall(PetscCuBLASIntCast(A->cmap->n, &n));
674:   PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
675:   PetscCall(PetscCuBLASIntCast(a->lda, &lda));
676:   PetscCall(PetscInfo(A, "LU factor %d x %d on backend\n", m, n));
677:   if (!dA->d_fact_ipiv) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
678:   if (!dA->fact_lwork) {
679:     PetscCallCUSOLVER(cusolverDnXgetrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork));
680:     PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
681:   }
682:   if (!dA->d_fact_info) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
683:   PetscCall(PetscLogGpuTimeBegin());
684:   PetscCallCUSOLVER(cusolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->d_fact_ipiv, dA->d_fact_info));
685:   PetscCall(PetscLogGpuTimeEnd());
686:   PetscCall(MatDenseRestoreArrayAndMemType(A, &da));
687: #if defined(PETSC_USE_DEBUG)
688:   PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
689:   PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
690:   PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
691: #endif
692:   A->factortype = MAT_FACTOR_LU;
693:   PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));

695:   A->ops->solve             = MatSolve_SeqDenseCUDA_LU;
696:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_LU;
697:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_LU;
698:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_LU;

700:   PetscCall(PetscFree(A->solvertype));
701:   PetscCall(PetscStrallocpy(MATSOLVERCUDA, &A->solvertype));
702:   PetscFunctionReturn(PETSC_SUCCESS);
703: }
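
/* Usage sketch (added illustration, not part of the original densecuda.cu): once the in-place LU
   factorization above has run, the solve callbacks installed at its end handle MatSolve() and
   friends on the device. ExampleLUSolve is a hypothetical helper; A is MATSEQDENSECUDA and is
   overwritten by its factors, b and x are ideally VECSEQCUDA so no host staging is needed. */
#if 0
static PetscErrorCode ExampleLUSolve(Mat A, Vec b, Vec x)
{
  IS            perm;
  MatFactorInfo info;
  PetscInt      n;

  PetscFunctionBegin;
  PetscCall(MatGetSize(A, &n, NULL));
  PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &perm)); /* identity ordering; dense LU ignores it */
  PetscCall(MatFactorInfoInitialize(&info));
  PetscCall(MatLUFactor(A, perm, perm, &info)); /* dispatches to MatLUFactor_SeqDenseCUDA() */
  PetscCall(MatSolve(A, b, x));                 /* dispatches to MatSolve_SeqDenseCUDA_LU()  */
  PetscCall(MatSolveTranspose(A, b, x));        /* same factors, op = CUBLAS_OP_T            */
  PetscCall(ISDestroy(&perm));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif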

705: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A, IS, const MatFactorInfo *)
706: {
707:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
708:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
709:   PetscScalar      *da;
710:   PetscCuBLASInt    n, lda;
711: #if defined(PETSC_USE_DEBUG)
712:   int info;
713: #endif
714:   cusolverDnHandle_t handle;

716:   PetscFunctionBegin;
717:   if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
718:   PetscCall(PetscCUSOLVERDnGetHandle(&handle));
719:   PetscCall(PetscCuBLASIntCast(A->rmap->n, &n));
720:   PetscCall(PetscInfo(A, "Cholesky factor %d x %d on backend\n", n, n));
721:   if (A->spd == PETSC_BOOL3_TRUE) {
722:     PetscCall(MatDenseCUDAGetArray(A, &da));
723:     PetscCall(PetscCuBLASIntCast(a->lda, &lda));
724:     if (!dA->fact_lwork) {
725:       PetscCallCUSOLVER(cusolverDnXpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, &dA->fact_lwork));
726:       PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
727:     }
728:     if (!dA->d_fact_info) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
729:     PetscCall(PetscLogGpuTimeBegin());
730:     PetscCallCUSOLVER(cusolverDnXpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
731:     PetscCall(PetscLogGpuTimeEnd());

733:     PetscCall(MatDenseCUDARestoreArray(A, &da));
734: #if defined(PETSC_USE_DEBUG)
735:     PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
736:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
737:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
738: #endif
739:     A->factortype = MAT_FACTOR_CHOLESKY;
740:     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
741:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
742: #if 0
 743:     /* at the time of writing this interface (CUDA 10.0), cusolverDn does not implement the *sytrs and *hetr* routines.
 744:        The code below should work, and it can be activated once the *sytrs routines become available */
745:     if (!dA->d_fact_ipiv) {
746:       PetscCallCUDA(cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv)));
747:     }
748:     if (!dA->fact_lwork) {
749:       PetscCallCUSOLVER(cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork));
750:       PetscCallCUDA(cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work)));
751:     }
752:     if (!dA->d_fact_info) {
753:       PetscCallCUDA(cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info)));
754:     }
755:     PetscCall(PetscLogGpuTimeBegin());
756:     PetscCallCUSOLVER(cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info));
757:     PetscCall(PetscLogGpuTimeEnd());
758: #endif

760:   A->ops->solve             = MatSolve_SeqDenseCUDA_Cholesky;
761:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_Cholesky;
762:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_Cholesky;
763:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_Cholesky;
764:   PetscCall(PetscFree(A->solvertype));
765:   PetscCall(PetscStrallocpy(MATSOLVERCUDA, &A->solvertype));
766:   PetscFunctionReturn(PETSC_SUCCESS);
767: }

769: static PetscErrorCode MatQRFactor_SeqDenseCUDA(Mat A, IS, const MatFactorInfo *)
770: {
771:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
772:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
773:   PetscScalar      *da;
774:   PetscCuBLASInt    m, min, max, n, lda;
775: #if defined(PETSC_USE_DEBUG)
776:   int info;
777: #endif
778:   cusolverDnHandle_t handle;

780:   PetscFunctionBegin;
781:   if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
782:   PetscCall(PetscCUSOLVERDnGetHandle(&handle));
783:   PetscCall(MatDenseCUDAGetArray(A, &da));
784:   PetscCall(PetscCuBLASIntCast(A->cmap->n, &n));
785:   PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
786:   PetscCall(PetscCuBLASIntCast(a->lda, &lda));
787:   PetscCall(PetscInfo(A, "QR factor %d x %d on backend\n", m, n));
788:   max = PetscMax(m, n);
789:   min = PetscMin(m, n);
790:   if (!dA->d_fact_tau) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_tau, min * sizeof(*dA->d_fact_tau)));
791:   if (!dA->d_fact_ipiv) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
792:   if (!dA->fact_lwork) {
793:     PetscCallCUSOLVER(cusolverDnXgeqrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork));
794:     PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
795:   }
796:   if (!dA->d_fact_info) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
797:   if (!dA->workvec) PetscCall(VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec)));
798:   PetscCall(PetscLogGpuTimeBegin());
799:   PetscCallCUSOLVER(cusolverDnXgeqrf(handle, m, n, da, lda, dA->d_fact_tau, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
800:   PetscCall(PetscLogGpuTimeEnd());
801:   PetscCall(MatDenseCUDARestoreArray(A, &da));
802: #if defined(PETSC_USE_DEBUG)
803:   PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
804:   PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
805: #endif
806:   A->factortype = MAT_FACTOR_QR;
807:   a->rank       = min;
808:   PetscCall(PetscLogGpuFlops(2.0 * min * min * (max - min / 3.0)));

810:   A->ops->solve             = MatSolve_SeqDenseCUDA_QR;
811:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_QR;
812:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_QR;
813:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_QR;

815:   PetscCall(PetscFree(A->solvertype));
816:   PetscCall(PetscStrallocpy(MATSOLVERCUDA, &A->solvertype));
817:   PetscFunctionReturn(PETSC_SUCCESS);
818: }
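
/* Usage sketch (added illustration, not part of the original densecuda.cu): the QR factorization
   above enables least-squares solves, min_x ||A x - b||_2, through MatSolve(). ExampleQRLeastSquares
   is a hypothetical helper; it assumes an m x n (m >= n) MATSEQDENSECUDA matrix factored in place
   and that a NULL column permutation is acceptable to MatQRFactor(). */
#if 0
static PetscErrorCode ExampleQRLeastSquares(Mat A, Vec b /* length m */, Vec x /* length n */)
{
  MatFactorInfo info;

  PetscFunctionBegin;
  PetscCall(MatFactorInfoInitialize(&info));
  PetscCall(MatQRFactor(A, NULL, &info)); /* dispatches to MatQRFactor_SeqDenseCUDA() above */
  PetscCall(MatSolve(A, b, x));           /* applies Q^H then the triangular solve (MatSolve_SeqDenseCUDA_Internal_QR) */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif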

820: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
821: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A, Mat B, Mat C, PetscBool tA, PetscBool tB)
822: {
823:   const PetscScalar *da, *db;
824:   PetscScalar       *dc;
825:   PetscScalar        one = 1.0, zero = 0.0;
826:   PetscCuBLASInt     m, n, k;
827:   PetscInt           alda, blda, clda;
828:   cublasHandle_t     cublasv2handle;
829:   PetscBool          Aiscuda, Biscuda;

831:   PetscFunctionBegin;
832:   /* we may end up with SEQDENSE as one of the arguments */
833:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &Aiscuda));
834:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &Biscuda));
835:   if (!Aiscuda) PetscCall(MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A));
836:   if (!Biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
837:   PetscCall(PetscCuBLASIntCast(C->rmap->n, &m));
838:   PetscCall(PetscCuBLASIntCast(C->cmap->n, &n));
839:   if (tA) PetscCall(PetscCuBLASIntCast(A->rmap->n, &k));
840:   else PetscCall(PetscCuBLASIntCast(A->cmap->n, &k));
841:   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
842:   PetscCall(PetscInfo(C, "Matrix-Matrix product %d x %d x %d on backend\n", m, k, n));
843:   PetscCall(MatDenseCUDAGetArrayRead(A, &da));
844:   PetscCall(MatDenseCUDAGetArrayRead(B, &db));
845:   PetscCall(MatDenseCUDAGetArrayWrite(C, &dc));
846:   PetscCall(MatDenseGetLDA(A, &alda));
847:   PetscCall(MatDenseGetLDA(B, &blda));
848:   PetscCall(MatDenseGetLDA(C, &clda));
849:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
850:   PetscCall(PetscLogGpuTimeBegin());
851:   PetscCallCUBLAS(cublasXgemm(cublasv2handle, tA ? CUBLAS_OP_T : CUBLAS_OP_N, tB ? CUBLAS_OP_T : CUBLAS_OP_N, m, n, k, &one, da, alda, db, blda, &zero, dc, clda));
852:   PetscCall(PetscLogGpuTimeEnd());
853:   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
854:   PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
855:   PetscCall(MatDenseCUDARestoreArrayRead(B, &db));
856:   PetscCall(MatDenseCUDARestoreArrayWrite(C, &dc));
857:   if (!Aiscuda) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
858:   if (!Biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
859:   PetscFunctionReturn(PETSC_SUCCESS);
860: }
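
/* Usage sketch (added illustration, not part of the original densecuda.cu): the GEMM kernel above
   backs the standard product interfaces, so dense products stay on the GPU with the usual calls.
   ExampleDenseCUDAProducts is a hypothetical helper. */
#if 0
static PetscErrorCode ExampleDenseCUDAProducts(Mat A, Mat B, Mat *C, Mat *D)
{
  PetscFunctionBegin;
  PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, C));          /* C = A*B,   cublasXgemm with no transposes */
  PetscCall(MatTransposeMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, D)); /* D = A^T*B, cublasXgemm with op(A) = A^T   */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif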

862: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
863: {
864:   PetscFunctionBegin;
865:   PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_TRUE, PETSC_FALSE));
866:   PetscFunctionReturn(PETSC_SUCCESS);
867: }

869: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
870: {
871:   PetscFunctionBegin;
872:   PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_FALSE, PETSC_FALSE));
873:   PetscFunctionReturn(PETSC_SUCCESS);
874: }

876: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
877: {
878:   PetscFunctionBegin;
879:   PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_FALSE, PETSC_TRUE));
880:   PetscFunctionReturn(PETSC_SUCCESS);
881: }

883: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
884: {
885:   PetscFunctionBegin;
886:   PetscCall(MatProductSetFromOptions_SeqDense(C));
887:   PetscFunctionReturn(PETSC_SUCCESS);
888: }

890: /* zz = op(A)*xx + yy
891:    if yy == NULL, only MatMult */
892: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans)
893: {
894:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
895:   const PetscScalar *xarray, *da;
896:   PetscScalar       *zarray;
897:   PetscScalar        one = 1.0, zero = 0.0;
898:   PetscCuBLASInt     m, n, lda;
899:   cublasHandle_t     cublasv2handle;

901:   PetscFunctionBegin;
902:   if (yy && yy != zz) PetscCall(VecSeq_CUDA::copy(yy, zz)); /* mult add */
903:   if (!A->rmap->n || !A->cmap->n) {
904:     /* mult only */
905:     if (!yy) PetscCall(VecSeq_CUDA::set(zz, 0.0));
906:     PetscFunctionReturn(PETSC_SUCCESS);
907:   }
908:   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", A->rmap->n, A->cmap->n));
909:   PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
910:   PetscCall(PetscCuBLASIntCast(A->cmap->n, &n));
911:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
912:   PetscCall(MatDenseCUDAGetArrayRead(A, &da));
913:   PetscCall(PetscCuBLASIntCast(mat->lda, &lda));
914:   PetscCall(VecCUDAGetArrayRead(xx, &xarray));
915:   PetscCall(VecCUDAGetArray(zz, &zarray));
916:   PetscCall(PetscLogGpuTimeBegin());
917:   PetscCallCUBLAS(cublasXgemv(cublasv2handle, trans ? CUBLAS_OP_T : CUBLAS_OP_N, m, n, &one, da, lda, xarray, 1, (yy ? &one : &zero), zarray, 1));
918:   PetscCall(PetscLogGpuTimeEnd());
919:   PetscCall(PetscLogGpuFlops(2.0 * A->rmap->n * A->cmap->n - (yy ? 0 : A->rmap->n)));
920:   PetscCall(VecCUDARestoreArrayRead(xx, &xarray));
921:   PetscCall(VecCUDARestoreArray(zz, &zarray));
922:   PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
923:   PetscFunctionReturn(PETSC_SUCCESS);
924: }

926: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A, Vec xx, Vec yy, Vec zz)
927: {
928:   PetscFunctionBegin;
929:   PetscCall(MatMultAdd_SeqDenseCUDA_Private(A, xx, yy, zz, PETSC_FALSE));
930:   PetscFunctionReturn(PETSC_SUCCESS);
931: }

933: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A, Vec xx, Vec yy, Vec zz)
934: {
935:   PetscFunctionBegin;
936:   PetscCall(MatMultAdd_SeqDenseCUDA_Private(A, xx, yy, zz, PETSC_TRUE));
937:   PetscFunctionReturn(PETSC_SUCCESS);
938: }

940: PetscErrorCode MatMult_SeqDenseCUDA(Mat A, Vec xx, Vec yy)
941: {
942:   PetscFunctionBegin;
943:   PetscCall(MatMultAdd_SeqDenseCUDA_Private(A, xx, NULL, yy, PETSC_FALSE));
944:   PetscFunctionReturn(PETSC_SUCCESS);
945: }

947: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A, Vec xx, Vec yy)
948: {
949:   PetscFunctionBegin;
950:   PetscCall(MatMultAdd_SeqDenseCUDA_Private(A, xx, NULL, yy, PETSC_TRUE));
951:   PetscFunctionReturn(PETSC_SUCCESS);
952: }
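
/* Usage sketch (added illustration, not part of the original densecuda.cu): the four wrappers above
   are reached through the usual MatMult()/MatMultAdd() interfaces. ExampleMatVec is a hypothetical
   helper; vectors obtained from MatCreateVecs() on this matrix type are normally CUDA vectors. */
#if 0
static PetscErrorCode ExampleMatVec(Mat A, Vec *x, Vec *y)
{
  PetscFunctionBegin;
  PetscCall(MatCreateVecs(A, x, y)); /* x conforms to the columns, y to the rows */
  PetscCall(VecSet(*x, 1.0));
  PetscCall(MatMult(A, *x, *y));        /* cublasXgemv with beta = 0 */
  PetscCall(MatMultAdd(A, *x, *y, *y)); /* cublasXgemv with beta = 1 */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif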

954: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **array)
955: {
956:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

958:   PetscFunctionBegin;
959:   PetscCall(MatSeqDenseCUDACopyFromGPU(A));
960:   *array = mat->v;
961:   PetscFunctionReturn(PETSC_SUCCESS);
962: }

964: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **array)
965: {
966:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

968:   PetscFunctionBegin;
969:   /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
970:   if (!mat->v) PetscCall(MatSeqDenseSetPreallocation(A, NULL));
971:   *array         = mat->v;
972:   A->offloadmask = PETSC_OFFLOAD_CPU;
973:   PetscFunctionReturn(PETSC_SUCCESS);
974: }

976: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A, PetscScalar **array)
977: {
978:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

980:   PetscFunctionBegin;
981:   PetscCall(MatSeqDenseCUDACopyFromGPU(A));
982:   *array         = mat->v;
983:   A->offloadmask = PETSC_OFFLOAD_CPU;
984:   PetscFunctionReturn(PETSC_SUCCESS);
985: }

987: static PetscErrorCode MatDenseGetArrayAndMemType_SeqDenseCUDA(Mat A, PetscScalar **array, PetscMemType *mtype)
988: {
989:   const auto dA = static_cast<Mat_SeqDenseCUDA *>(A->spptr);

991:   PetscFunctionBegin;
992:   PetscCall(MatSeqDenseCUDACopyToGPU(A)); // Since we will read the array on device, we sync the GPU data if necessary
993:   *array = dA->d_v;
994:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
995:   PetscFunctionReturn(PETSC_SUCCESS);
996: }

998: static PetscErrorCode MatDenseRestoreArrayAndMemType_SeqDenseCUDA(Mat A, PetscScalar **array)
999: {
1000:   PetscFunctionBegin;
1001:   *array         = nullptr;
1002:   A->offloadmask = PETSC_OFFLOAD_GPU; // Since we've written to the array on device
1003:   PetscFunctionReturn(PETSC_SUCCESS);
1004: }

1006: static PetscErrorCode (*MatDenseGetArrayReadAndMemType_SeqDenseCUDA)(Mat, PetscScalar **, PetscMemType *) = MatDenseGetArrayAndMemType_SeqDenseCUDA;
1007: static PetscErrorCode (*MatDenseRestoreArrayReadAndMemType_SeqDenseCUDA)(Mat, PetscScalar **)             = nullptr; // Keep the offload mask as is

1009: static PetscErrorCode MatDenseGetArrayWriteAndMemType_SeqDenseCUDA(Mat A, PetscScalar **array, PetscMemType *mtype)
1010: {
1011:   const auto dA = static_cast<Mat_SeqDenseCUDA *>(A->spptr);

1013:   PetscFunctionBegin;
1014:   if (!dA->d_v) PetscCall(MatSeqDenseCUDASetPreallocation(A, nullptr)); // Allocate GPU memory if not present
1015:   *array = dA->d_v;
1016:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1017:   PetscFunctionReturn(PETSC_SUCCESS);
1018: }

1020: static PetscErrorCode (*MatDenseRestoreArrayWriteAndMemType_SeqDenseCUDA)(Mat, PetscScalar **) = MatDenseRestoreArrayAndMemType_SeqDenseCUDA; // Since we've written to the array on device

1022: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y, PetscScalar alpha)
1023: {
1024:   Mat_SeqDense  *y = (Mat_SeqDense *)Y->data;
1025:   PetscScalar   *dy;
1026:   PetscCuBLASInt j, N, m, lday, one = 1;
1027:   cublasHandle_t cublasv2handle;

1029:   PetscFunctionBegin;
1030:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1031:   PetscCall(MatDenseCUDAGetArray(Y, &dy));
1032:   PetscCall(PetscCuBLASIntCast(Y->rmap->n * Y->cmap->n, &N));
1033:   PetscCall(PetscCuBLASIntCast(Y->rmap->n, &m));
1034:   PetscCall(PetscCuBLASIntCast(y->lda, &lday));
1035:   PetscCall(PetscInfo(Y, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n));
1036:   PetscCall(PetscLogGpuTimeBegin());
1037:   if (lday > m) {
1038:     for (j = 0; j < Y->cmap->n; j++) PetscCallCUBLAS(cublasXscal(cublasv2handle, m, &alpha, dy + lday * j, one));
1039:   } else PetscCallCUBLAS(cublasXscal(cublasv2handle, N, &alpha, dy, one));
1040:   PetscCall(PetscLogGpuTimeEnd());
1041:   PetscCall(PetscLogGpuFlops(N));
1042:   PetscCall(MatDenseCUDARestoreArray(Y, &dy));
1043:   PetscFunctionReturn(PETSC_SUCCESS);
1044: }

1046: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1047:   const PetscScalar shift_;
1048:   petscshift(PetscScalar shift) : shift_(shift) { }
1049:   __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1050: };

1052: template <typename Iterator>
1053: class strided_range {
1054: public:
1055:   typedef typename thrust::iterator_difference<Iterator>::type difference_type;
1056:   struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
1057:     difference_type stride;
1058:     stride_functor(difference_type stride) : stride(stride) { }
1059:     __device__ difference_type operator()(const difference_type &i) const { return stride * i; }
1060:   };
1061:   typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
1062:   typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
1063:   typedef typename thrust::permutation_iterator<Iterator, TransformIterator>    PermutationIterator;
1064:   typedef PermutationIterator                                                   iterator; // type of the strided_range iterator
1065:   // construct strided_range for the range [first,last)
1066:   strided_range(Iterator first, Iterator last, difference_type stride) : first(first), last(last), stride(stride) { }
1067:   iterator begin(void) const { return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); }
1068:   iterator end(void) const { return begin() + ((last - first) + (stride - 1)) / stride; }

1070: protected:
1071:   Iterator        first;
1072:   Iterator        last;
1073:   difference_type stride;
1074: };
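
/* Usage sketch (added illustration, not part of the original densecuda.cu): with a stride of
   lda + 1, strided_range visits the diagonal of a column-major lda x n block, which is exactly how
   MatShift_DenseCUDA_Private() below uses it. ExampleStridedDiagonalShift is a hypothetical,
   standalone helper (it assumes lda >= n). */
#if 0
static void ExampleStridedDiagonalShift(PetscInt n, PetscInt lda, PetscScalar alpha)
{
  thrust::device_vector<PetscScalar>                          v(lda * n, 0.);
  strided_range<thrust::device_vector<PetscScalar>::iterator> diag(v.begin(), v.begin() + (n - 1) * (lda + 1) + 1, lda + 1);

  /* adds alpha to v[i*(lda+1)], i = 0,...,n-1, i.e. to the diagonal entries */
  thrust::transform(diag.begin(), diag.end(), diag.begin(), petscshift(alpha));
}
#endif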

1076: PetscErrorCode MatShift_DenseCUDA_Private(PetscScalar *da, PetscScalar alpha, PetscInt lda, PetscInt rstart, PetscInt rend, PetscInt cols)
1077: {
1078:   PetscFunctionBegin;
1079:   PetscInt rend2 = PetscMin(rend, cols);
1080:   if (rend2 > rstart) {
1081:     PetscCall(PetscLogGpuTimeBegin());
1082:     try {
1083:       const auto                                                  dptr  = thrust::device_pointer_cast(da);
1084:       size_t                                                      begin = rstart * lda;
1085:       size_t                                                      end   = rend2 - rstart + rend2 * lda;
1086:       strided_range<thrust::device_vector<PetscScalar>::iterator> diagonal(dptr + begin, dptr + end, lda + 1);
1087:       thrust::transform(diagonal.begin(), diagonal.end(), diagonal.begin(), petscshift(alpha));
1088:     } catch (char *ex) {
1089:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1090:     }
1091:     PetscCall(PetscLogGpuTimeEnd());
1092:     PetscCall(PetscLogGpuFlops(rend2 - rstart));
1093:   }
1094:   PetscFunctionReturn(PETSC_SUCCESS);
1095: }

1097: PetscErrorCode MatShift_SeqDenseCUDA(Mat A, PetscScalar alpha)
1098: {
1099:   PetscScalar *da;
1100:   PetscInt     m = A->rmap->n, n = A->cmap->n, lda;

1102:   PetscFunctionBegin;
1103:   PetscCall(MatDenseCUDAGetArray(A, &da));
1104:   PetscCall(MatDenseGetLDA(A, &lda));
1105:   PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1106:   PetscCall(MatShift_DenseCUDA_Private(da, alpha, lda, 0, m, n));
1107:   PetscCall(MatDenseCUDARestoreArray(A, &da));
1108:   PetscFunctionReturn(PETSC_SUCCESS);
1109: }

1111: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y, PetscScalar alpha, Mat X, MatStructure)
1112: {
1113:   Mat_SeqDense      *x = (Mat_SeqDense *)X->data;
1114:   Mat_SeqDense      *y = (Mat_SeqDense *)Y->data;
1115:   const PetscScalar *dx;
1116:   PetscScalar       *dy;
1117:   PetscCuBLASInt     j, N, m, ldax, lday, one = 1;
1118:   cublasHandle_t     cublasv2handle;

1120:   PetscFunctionBegin;
1121:   if (!X->rmap->n || !X->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1122:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1123:   PetscCall(MatDenseCUDAGetArrayRead(X, &dx));
1124:   if (alpha == 0.0) PetscCall(MatDenseCUDAGetArrayWrite(Y, &dy));
1125:   else PetscCall(MatDenseCUDAGetArray(Y, &dy));
1126:   PetscCall(PetscCuBLASIntCast(X->rmap->n * X->cmap->n, &N));
1127:   PetscCall(PetscCuBLASIntCast(X->rmap->n, &m));
1128:   PetscCall(PetscCuBLASIntCast(x->lda, &ldax));
1129:   PetscCall(PetscCuBLASIntCast(y->lda, &lday));
1130:   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n));
1131:   PetscCall(PetscLogGpuTimeBegin());
1132:   if (ldax > m || lday > m) {
1133:     for (j = 0; j < X->cmap->n; j++) PetscCallCUBLAS(cublasXaxpy(cublasv2handle, m, &alpha, dx + j * ldax, one, dy + j * lday, one));
1134:   } else PetscCallCUBLAS(cublasXaxpy(cublasv2handle, N, &alpha, dx, one, dy, one));
1135:   PetscCall(PetscLogGpuTimeEnd());
1136:   PetscCall(PetscLogGpuFlops(PetscMax(2. * N - 1, 0)));
1137:   PetscCall(MatDenseCUDARestoreArrayRead(X, &dx));
1138:   if (alpha == 0.0) PetscCall(MatDenseCUDARestoreArrayWrite(Y, &dy));
1139:   else PetscCall(MatDenseCUDARestoreArray(Y, &dy));
1140:   PetscFunctionReturn(PETSC_SUCCESS);
1141: }
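
/* Usage sketch (added illustration, not part of the original densecuda.cu): the scaling, shift and
   AXPY kernels above are reached through the usual interfaces and keep the data on the device once
   it has been offloaded. ExampleAXPYScaleShift is a hypothetical helper. */
#if 0
static PetscErrorCode ExampleAXPYScaleShift(Mat Y, Mat X, PetscScalar alpha)
{
  PetscFunctionBegin;
  PetscCall(MatScale(Y, alpha));                         /* cublasXscal                 */
  PetscCall(MatShift(Y, alpha));                         /* thrust-based diagonal shift */
  PetscCall(MatAXPY(Y, alpha, X, SAME_NONZERO_PATTERN)); /* cublasXaxpy                 */
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif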

1143: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
1144: {
1145:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

1147:   PetscFunctionBegin;
1148:   if (dA) {
1149:     PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseCUDAResetArray() must be called first");
1150:     if (!dA->user_alloc) PetscCallCUDA(cudaFree(dA->d_v));
1151:     PetscCallCUDA(cudaFree(dA->d_fact_tau));
1152:     PetscCallCUDA(cudaFree(dA->d_fact_ipiv));
1153:     PetscCallCUDA(cudaFree(dA->d_fact_info));
1154:     PetscCallCUDA(cudaFree(dA->d_fact_work));
1155:     PetscCall(VecDestroy(&dA->workvec));
1156:   }
1157:   PetscCall(PetscFree(A->spptr));
1158:   PetscFunctionReturn(PETSC_SUCCESS);
1159: }

1161: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
1162: {
1163:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1165:   PetscFunctionBegin;
1166:   /* prevent copying the data back if we own the data pointer */
1167:   if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1168:   PetscCall(MatConvert_SeqDenseCUDA_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1169:   PetscCall(MatDestroy_SeqDense(A));
1170:   PetscFunctionReturn(PETSC_SUCCESS);
1171: }

1173: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
1174: {
1175:   MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;

1177:   PetscFunctionBegin;
1178:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1179:   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
1180:   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
1181:   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hcpvalues));
1182:   if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) PetscCall(MatCopy_SeqDenseCUDA(A, *B, SAME_NONZERO_PATTERN));
1183:   if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
1184:     Mat_SeqDenseCUDA *dB = (Mat_SeqDenseCUDA *)(*B)->spptr;
1185:     if (!dB->d_v) PetscCall(MatSeqDenseCUDASetPreallocation(*B, NULL));
1186:   }
1187:   PetscFunctionReturn(PETSC_SUCCESS);
1188: }

1190: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A, Vec v, PetscInt col)
1191: {
1192:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
1193:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1194:   PetscScalar      *x;
1195:   PetscBool         viscuda;

1197:   PetscFunctionBegin;
1198:   PetscCall(PetscObjectTypeCompareAny((PetscObject)v, &viscuda, VECSEQCUDA, VECMPICUDA, VECCUDA, ""));
1199:   if (viscuda && !v->boundtocpu) { /* update device data */
1200:     PetscCall(VecCUDAGetArrayWrite(v, &x));
1201:     if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1202:       PetscCallCUDA(cudaMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice));
1203:     } else {
1204:       PetscCallCUDA(cudaMemcpy(x, a->v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
1205:     }
1206:     PetscCall(VecCUDARestoreArrayWrite(v, &x));
1207:   } else { /* update host data */
1208:     PetscCall(VecGetArrayWrite(v, &x));
1209:     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1210:       PetscCall(PetscArraycpy(x, a->v + col * a->lda, A->rmap->n));
1211:     } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1212:       PetscCallCUDA(cudaMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1213:     }
1214:     PetscCall(VecRestoreArrayWrite(v, &x));
1215:   }
1216:   PetscFunctionReturn(PETSC_SUCCESS);
1217: }

1219: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A, MatFactorType ftype, Mat *fact)
1220: {
1221:   PetscFunctionBegin;
1222:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), fact));
1223:   PetscCall(MatSetSizes(*fact, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
1224:   PetscCall(MatSetType(*fact, MATSEQDENSECUDA));
1225:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1226:     (*fact)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1227:     (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1228:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1229:     (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1230:   } else if (ftype == MAT_FACTOR_QR) {
1231:     PetscCall(PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactor_C", MatQRFactor_SeqDense));
1232:     PetscCall(PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1233:   }
1234:   (*fact)->factortype = ftype;
1235:   PetscCall(PetscFree((*fact)->solvertype));
1236:   PetscCall(PetscStrallocpy(MATSOLVERCUDA, &(*fact)->solvertype));
1237:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_LU]));
1238:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ILU]));
1239:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]));
1240:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ICC]));
1241:   PetscFunctionReturn(PETSC_SUCCESS);
1242: }

1244: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1245: {
1246:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1248:   PetscFunctionBegin;
1249:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1250:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1251:   PetscCall(MatDenseCUDAGetArray(A, (PetscScalar **)&a->ptrinuse));
1252:   if (!a->cvec) { /* pass the data of A to avoid allocating needless GPU memory the first time VecCUDAPlaceArray() is called */
1253:     PetscCall(VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1254:   }
1255:   a->vecinuse = col + 1;
1256:   PetscCall(VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1257:   *v = a->cvec;
1258:   PetscFunctionReturn(PETSC_SUCCESS);
1259: }

1261: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A, PetscInt, Vec *v)
1262: {
1263:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1265:   PetscFunctionBegin;
1266:   PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1267:   PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1268:   a->vecinuse = 0;
1269:   PetscCall(VecCUDAResetArray(a->cvec));
1270:   PetscCall(MatDenseCUDARestoreArray(A, (PetscScalar **)&a->ptrinuse));
1271:   if (v) *v = NULL;
1272:   PetscFunctionReturn(PETSC_SUCCESS);
1273: }

1275: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1276: {
1277:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1279:   PetscFunctionBegin;
1280:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1281:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1282:   PetscCall(MatDenseCUDAGetArrayRead(A, &a->ptrinuse));
1283:   if (!a->cvec) { /* pass the data of A to avoid allocating needless GPU memory the first time VecCUDAPlaceArray() is called */
1284:     PetscCall(VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1285:   }
1286:   a->vecinuse = col + 1;
1287:   PetscCall(VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1288:   PetscCall(VecLockReadPush(a->cvec));
1289:   *v = a->cvec;
1290:   PetscFunctionReturn(PETSC_SUCCESS);
1291: }

1293: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A, PetscInt, Vec *v)
1294: {
1295:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1297:   PetscFunctionBegin;
1298:   PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1299:   PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1300:   a->vecinuse = 0;
1301:   PetscCall(VecLockReadPop(a->cvec));
1302:   PetscCall(VecCUDAResetArray(a->cvec));
1303:   PetscCall(MatDenseCUDARestoreArrayRead(A, &a->ptrinuse));
1304:   if (v) *v = NULL;
1305:   PetscFunctionReturn(PETSC_SUCCESS);
1306: }

1308: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1309: {
1310:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1312:   PetscFunctionBegin;
1313:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1314:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1315:   PetscCall(MatDenseCUDAGetArrayWrite(A, (PetscScalar **)&a->ptrinuse));
1316:   if (!a->cvec) { /* pass the data of A to avoid allocating needless GPU memory the first time VecCUDAPlaceArray() is called */
1317:     PetscCall(VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1318:   }
1319:   a->vecinuse = col + 1;
1320:   PetscCall(VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1321:   *v = a->cvec;
1322:   PetscFunctionReturn(PETSC_SUCCESS);
1323: }

1325: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A, PetscInt, Vec *v)
1326: {
1327:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1329:   PetscFunctionBegin;
1330:   PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1331:   PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1332:   a->vecinuse = 0;
1333:   PetscCall(VecCUDAResetArray(a->cvec));
1334:   PetscCall(MatDenseCUDARestoreArrayWrite(A, (PetscScalar **)&a->ptrinuse));
1335:   if (v) *v = NULL;
1336:   PetscFunctionReturn(PETSC_SUCCESS);
1337: }

1339: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
1340: {
1341:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
1342:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

1344:   PetscFunctionBegin;
1345:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1346:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1347:   if (a->cmat && (cend - cbegin != a->cmat->cmap->N || rend - rbegin != a->cmat->rmap->N)) PetscCall(MatDestroy(&a->cmat));
1348:   PetscCall(MatSeqDenseCUDACopyToGPU(A));
1349:   if (!a->cmat) {
1350:     PetscCall(MatCreateDenseCUDA(PetscObjectComm((PetscObject)A), rend - rbegin, PETSC_DECIDE, rend - rbegin, cend - cbegin, dA->d_v + rbegin + (size_t)cbegin * a->lda, &a->cmat));
1351:   } else {
1352:     PetscCall(MatDenseCUDAPlaceArray(a->cmat, dA->d_v + rbegin + (size_t)cbegin * a->lda));
1353:   }
1354:   PetscCall(MatDenseSetLDA(a->cmat, a->lda));
1355:   /* Place the CPU array if present, but do not copy any data */
1356:   a->cmat->offloadmask = PETSC_OFFLOAD_GPU;
1357:   if (a->v) PetscCall(MatDensePlaceArray(a->cmat, a->v + rbegin + (size_t)cbegin * a->lda));
1358:   a->cmat->offloadmask = A->offloadmask;
1359:   a->matinuse          = cbegin + 1;
1360:   *v                   = a->cmat;
1361:   PetscFunctionReturn(PETSC_SUCCESS);
1362: }

1364: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A, Mat *v)
1365: {
1366:   Mat_SeqDense    *a    = (Mat_SeqDense *)A->data;
1367:   PetscBool        copy = PETSC_FALSE, reset;
1368:   PetscOffloadMask suboff;

1370:   PetscFunctionBegin;
1371:   PetscCheck(a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1372:   PetscCheck(a->cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1373:   PetscCheck(*v == a->cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1374:   a->matinuse = 0;
1375:   reset       = a->v ? PETSC_TRUE : PETSC_FALSE;
1376:   suboff      = a->cmat->offloadmask; /* calls to ResetArray may change it, so save it here */
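  /* the submatrix is up to date only on the CPU and A has no host array yet: allocate one so MatSeqDenseCUDACopyFromGPU() below has a destination */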
1377:   if (suboff == PETSC_OFFLOAD_CPU && !a->v) {
1378:     copy = PETSC_TRUE;
1379:     PetscCall(MatSeqDenseSetPreallocation(A, NULL));
1380:   }
1381:   PetscCall(MatDenseCUDAResetArray(a->cmat));
1382:   if (reset) PetscCall(MatDenseResetArray(a->cmat));
1383:   if (copy) {
1384:     PetscCall(MatSeqDenseCUDACopyFromGPU(A));
1385:   } else A->offloadmask = (suboff == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1386:   a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1387:   if (v) *v = NULL;
1388:   PetscFunctionReturn(PETSC_SUCCESS);
1389: }

1391: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A, PetscInt lda)
1392: {
1393:   Mat_SeqDense     *cA = (Mat_SeqDense *)A->data;
1394:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1395:   PetscBool         data;

1397:   PetscFunctionBegin;
1398:   data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1399:   PetscCheck(dA->user_alloc || !data || cA->lda == lda, PETSC_COMM_SELF, PETSC_ERR_ORDER, "LDA cannot be changed after allocation of internal storage");
1400:   PetscCheck(lda >= A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "LDA %" PetscInt_FMT " must be at least matrix dimension %" PetscInt_FMT, lda, A->rmap->n);
1401:   cA->lda = lda;
1402:   PetscFunctionReturn(PETSC_SUCCESS);
1403: }

1405: static PetscErrorCode MatSetUp_SeqDenseCUDA(Mat A)
1406: {
1407:   PetscFunctionBegin;
1408:   PetscCall(PetscLayoutSetUp(A->rmap));
1409:   PetscCall(PetscLayoutSetUp(A->cmap));
1410:   if (!A->preallocated) PetscCall(MatSeqDenseCUDASetPreallocation(A, NULL));
1411:   PetscFunctionReturn(PETSC_SUCCESS);
1412: }

1414: static PetscErrorCode MatSetRandom_SeqDenseCUDA(Mat A, PetscRandom r)
1415: {
1416:   PetscBool iscurand;

1418:   PetscFunctionBegin;
1419:   PetscCall(PetscObjectTypeCompare((PetscObject)r, PETSCCURAND, &iscurand));
1420:   if (iscurand) {
1421:     PetscScalar *a;
1422:     PetscInt     lda, m, n, mn = 0;

1424:     PetscCall(MatGetSize(A, &m, &n));
1425:     PetscCall(MatDenseGetLDA(A, &lda));
1426:     PetscCall(MatDenseCUDAGetArrayWrite(A, &a));
1427:     if (lda > m) {
1428:       for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(r, m, a + i * lda));
1429:     } else {
1430:       PetscCall(PetscIntMultError(m, n, &mn));
1431:       PetscCall(PetscRandomGetValues(r, mn, a));
1432:     }
1433:     PetscCall(MatDenseCUDARestoreArrayWrite(A, &a));
1434:   } else PetscCall(MatSetRandom_SeqDense(A, r));
1435:   PetscFunctionReturn(PETSC_SUCCESS);
1436: }

1438: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A, PetscBool flg)
1439: {
1440:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1442:   PetscFunctionBegin;
1443:   PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1444:   PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1445:   A->boundtocpu = flg;
1446:   if (!flg) {
1447:     PetscBool iscuda;

1449:     PetscCall(PetscFree(A->defaultrandtype));
1450:     PetscCall(PetscStrallocpy(PETSCCURAND, &A->defaultrandtype));
1451:     PetscCall(PetscObjectTypeCompare((PetscObject)a->cvec, VECSEQCUDA, &iscuda));
1452:     if (!iscuda) PetscCall(VecDestroy(&a->cvec));
1453:     PetscCall(PetscObjectTypeCompare((PetscObject)a->cmat, MATSEQDENSECUDA, &iscuda));
1454:     if (!iscuda) PetscCall(MatDestroy(&a->cmat));
1455:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDenseCUDA));
1456:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseCUDA));
1457:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_SeqDenseCUDA));
1458:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDenseCUDA));
1459:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDenseCUDA));
1460:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDenseCUDA));
1461:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDenseCUDA));
1462:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDenseCUDA));
1463:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDenseCUDA));
1464:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDenseCUDA));
1465:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDenseCUDA));
1466:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDenseCUDA));
1467:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDenseCUDA));

1469:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayAndMemType_C", MatDenseGetArrayAndMemType_SeqDenseCUDA));
1470:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayAndMemType_C", MatDenseRestoreArrayAndMemType_SeqDenseCUDA));
1471:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayReadAndMemType_C", MatDenseGetArrayReadAndMemType_SeqDenseCUDA));
1472:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayReadAndMemType_C", MatDenseRestoreArrayReadAndMemType_SeqDenseCUDA));
1473:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWriteAndMemType_C", MatDenseGetArrayWriteAndMemType_SeqDenseCUDA));
1474:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayWriteAndMemType_C", MatDenseRestoreArrayWriteAndMemType_SeqDenseCUDA));

1476:     A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
1477:     A->ops->mult                    = MatMult_SeqDenseCUDA;
1478:     A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
1479:     A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
1480:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
1481:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1482:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1483:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1484:     A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
1485:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
1486:     A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
1487:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseCUDA;
1488:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseCUDA;
1489:     A->ops->scale                   = MatScale_SeqDenseCUDA;
1490:     A->ops->shift                   = MatShift_SeqDenseCUDA;
1491:     A->ops->copy                    = MatCopy_SeqDenseCUDA;
1492:     A->ops->zeroentries             = MatZeroEntries_SeqDenseCUDA;
1493:     A->ops->setup                   = MatSetUp_SeqDenseCUDA;
1494:     A->ops->setrandom               = MatSetRandom_SeqDenseCUDA;
1495:   } else {
1496:     /* make sure we have an up-to-date copy on the CPU */
1497:     PetscCall(MatSeqDenseCUDACopyFromGPU(A));
1498:     PetscCall(PetscFree(A->defaultrandtype));
1499:     PetscCall(PetscStrallocpy(PETSCRANDER48, &A->defaultrandtype));
1500:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDense));
1501:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense));
1502:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense));
1503:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense));
1504:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense));
1505:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense));
1506:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense));
1507:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense));
1508:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense));
1509:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense));
1510:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense));
1511:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1512:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDense));

1514:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayAndMemType_C", NULL));
1515:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayAndMemType_C", NULL));
1516:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayReadAndMemType_C", NULL));
1517:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayReadAndMemType_C", NULL));
1518:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWriteAndMemType_C", NULL));
1519:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayWriteAndMemType_C", NULL));

1521:     A->ops->duplicate               = MatDuplicate_SeqDense;
1522:     A->ops->mult                    = MatMult_SeqDense;
1523:     A->ops->multadd                 = MatMultAdd_SeqDense;
1524:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1525:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1526:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1527:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1528:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1529:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1530:     A->ops->axpy                    = MatAXPY_SeqDense;
1531:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1532:     A->ops->lufactor                = MatLUFactor_SeqDense;
1534:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1535:     A->ops->scale                   = MatScale_SeqDense;
1536:     A->ops->shift                   = MatShift_SeqDense;
1537:     A->ops->copy                    = MatCopy_SeqDense;
1538:     A->ops->zeroentries             = MatZeroEntries_SeqDense;
1539:     A->ops->setup                   = MatSetUp_SeqDense;
1540:     A->ops->setrandom               = MatSetRandom_SeqDense;
1541:   }
1542:   if (a->cmat) PetscCall(MatBindToCPU(a->cmat, flg));
1543:   PetscFunctionReturn(PETSC_SUCCESS);
1544: }

1546: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1547: {
1548:   Mat           B;
1549:   Mat_SeqDense *a;

1551:   PetscFunctionBegin;
1552:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1553:     /* TODO these cases should be optimized */
1554:     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
1555:     PetscFunctionReturn(PETSC_SUCCESS);
1556:   }
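
  /* only MAT_INPLACE_MATRIX remains: convert M itself back into a host-only MATSEQDENSE */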

1558:   B = *newmat;
1559:   PetscCall(MatBindToCPU_SeqDenseCUDA(B, PETSC_TRUE));
1560:   PetscCall(MatReset_SeqDenseCUDA(B));
1561:   PetscCall(PetscFree(B->defaultvectype));
1562:   PetscCall(PetscStrallocpy(VECSTANDARD, &B->defaultvectype));
1563:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSE));
1564:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensecuda_seqdense_C", NULL));
1565:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArray_C", NULL));
1566:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayRead_C", NULL));
1567:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayWrite_C", NULL));
1568:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArray_C", NULL));
1569:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayRead_C", NULL));
1570:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayWrite_C", NULL));
1571:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAPlaceArray_C", NULL));
1572:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAResetArray_C", NULL));
1573:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAReplaceArray_C", NULL));
1574:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensecuda_C", NULL));
1575:   a = (Mat_SeqDense *)B->data;
1576:   PetscCall(VecDestroy(&a->cvec)); /* cvec might be VECSEQCUDA. Destroy it and rebuild a VECSEQ when needed */
1577:   B->ops->bindtocpu = NULL;
1578:   B->ops->destroy   = MatDestroy_SeqDense;
1579:   B->offloadmask    = PETSC_OFFLOAD_CPU;
1580:   PetscFunctionReturn(PETSC_SUCCESS);
1581: }

1583: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1584: {
1585:   Mat_SeqDenseCUDA *dB;
1586:   Mat               B;
1587:   Mat_SeqDense     *a;

1589:   PetscFunctionBegin;
1590:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1591:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1592:     /* TODO these cases should be optimized */
1593:     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
1594:     PetscFunctionReturn(PETSC_SUCCESS);
1595:   }
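
  /* only MAT_INPLACE_MATRIX remains: promote M itself to MATSEQDENSECUDA */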

1597:   B = *newmat;
1598:   PetscCall(PetscFree(B->defaultvectype));
1599:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
1600:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSECUDA));
1601:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense));
1602:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA));
1603:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA));
1604:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA));
1605:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA));
1606:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA));
1607:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA));
1608:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA));
1609:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA));
1610:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA));
1611:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensecuda_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
1612:   a = (Mat_SeqDense *)B->data;
1613:   PetscCall(VecDestroy(&a->cvec)); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQCUDA when needed */
1614:   PetscCall(PetscNew(&dB));

1616:   B->spptr       = dB;
1617:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1619:   PetscCall(MatBindToCPU_SeqDenseCUDA(B, PETSC_FALSE));
1620:   B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1621:   B->ops->destroy   = MatDestroy_SeqDenseCUDA;
1622:   PetscFunctionReturn(PETSC_SUCCESS);
1623: }

1625: /*@C
1626:    MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.

1628:    Collective

1630:    Input Parameters:
1631: +  comm - MPI communicator
1632: .  m - number of rows
1633: .  n - number of columns
1634: -  data - optional location of GPU matrix data.  Set data=NULL for PETSc
1635:    to control matrix memory allocation.

1637:    Output Parameter:
1638: .  A - the matrix

1640:    Level: intermediate
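
   Example Usage:
   A minimal illustrative sketch; the sizes used below are placeholders, not values taken from this file.
.vb
   Mat A;

   PetscCall(MatCreateSeqDenseCUDA(PETSC_COMM_SELF, 10, 10, NULL, &A));
   PetscCall(MatSetValue(A, 0, 0, 1.0, INSERT_VALUES));
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatDestroy(&A));
.ve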

1642: .seealso: `MATSEQDENSE`, `MatCreate()`, `MatCreateSeqDense()`
1643: @*/
1644: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A)
1645: {
1646:   PetscMPIInt size;

1648:   PetscFunctionBegin;
1649:   PetscCallMPI(MPI_Comm_size(comm, &size));
1650:   PetscCheck(size <= 1, comm, PETSC_ERR_ARG_WRONG, "Invalid communicator size %d", size);
1651:   PetscCall(MatCreate(comm, A));
1652:   PetscCall(MatSetSizes(*A, m, n, m, n));
1653:   PetscCall(MatSetType(*A, MATSEQDENSECUDA));
1654:   PetscCall(MatSeqDenseCUDASetPreallocation(*A, data));
1655:   PetscFunctionReturn(PETSC_SUCCESS);
1656: }

1658: /*MC
1659:    MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

1661:    Options Database Keys:
1662: . -mat_type seqdensecuda - sets the matrix type to `MATSEQDENSECUDA` during a call to MatSetFromOptions()
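
   Example Usage:
   A minimal sketch of selecting this type from the options database; it assumes the program was launched with -mat_type seqdensecuda.
.vb
   Mat A;

   PetscCall(MatCreate(PETSC_COMM_SELF, &A));
   PetscCall(MatSetSizes(A, 10, 10, 10, 10));
   PetscCall(MatSetFromOptions(A));
   PetscCall(MatSetUp(A));
   PetscCall(MatDestroy(&A));
.ve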

1664:   Level: beginner

1666: .seealso: `MATSEQDENSE`
1667: M*/
1668: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1669: {
1670:   PetscFunctionBegin;
1671:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1672:   PetscCall(MatCreate_SeqDense(B));
1673:   PetscCall(MatConvert_SeqDense_SeqDenseCUDA(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
1674:   PetscFunctionReturn(PETSC_SUCCESS);
1675: }