Actual source code: densecuda.cu
1: /*
2: Defines the matrix operations for sequential dense matrices with CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
8: #include <petsc/private/petsclegacycupmblas.h>
9: #include <thrust/device_ptr.h>
10: #include <thrust/functional.h>
11: #include <thrust/iterator/counting_iterator.h>
12: #include <thrust/iterator/transform_iterator.h>
13: #include <thrust/iterator/permutation_iterator.h>
14: #include <thrust/transform.h>
15: #include <thrust/device_vector.h>
17: using VecSeq_CUDA = Petsc::vec::cupm::impl::VecSeq_CUPM<Petsc::device::cupm::DeviceType::CUDA>;
19: typedef struct {
20: PetscScalar *d_v; /* pointer to the matrix on the GPU */
21: PetscBool user_alloc;
22: PetscScalar *unplacedarray; /* if one called MatDenseCUDAPlaceArray(), this is where it stashed the original */
23: PetscBool unplaced_user_alloc;
24: /* factorization support */
25: PetscCuBLASInt *d_fact_ipiv; /* device pivots */
26: PetscScalar *d_fact_tau; /* device QR tau vector */
27: PetscScalar *d_fact_work; /* device workspace */
28: PetscCuBLASInt fact_lwork;
29: PetscCuBLASInt *d_fact_info; /* device info */
30: /* workspace */
31: Vec workvec;
32: } Mat_SeqDenseCUDA;
34: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
35: {
36: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
37: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
38: PetscBool iscuda;
40: PetscFunctionBegin;
41: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &iscuda));
42: if (!iscuda) PetscFunctionReturn(PETSC_SUCCESS);
43: PetscCall(PetscLayoutSetUp(A->rmap));
44: PetscCall(PetscLayoutSetUp(A->cmap));
45: /* it may happen that CPU preallocation has not been performed */
46: if (cA->lda <= 0) cA->lda = A->rmap->n;
47: if (!dA->user_alloc) PetscCallCUDA(cudaFree(dA->d_v));
48: if (!d_data) { /* petsc-allocated storage */
49: size_t sz;
51: PetscCall(PetscIntMultError(cA->lda, A->cmap->n, NULL));
52: sz = cA->lda * A->cmap->n * sizeof(PetscScalar);
53: PetscCallCUDA(cudaMalloc((void **)&dA->d_v, sz));
54: PetscCallCUDA(cudaMemset(dA->d_v, 0, sz));
55: dA->user_alloc = PETSC_FALSE;
56: } else { /* user-allocated storage */
57: dA->d_v = d_data;
58: dA->user_alloc = PETSC_TRUE;
59: }
60: A->offloadmask = PETSC_OFFLOAD_GPU;
61: A->preallocated = PETSC_TRUE;
62: A->assembled = PETSC_TRUE;
63: PetscFunctionReturn(PETSC_SUCCESS);
64: }
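/* Usage sketch (illustrative, not part of this file; m, n and d_user are placeholders, error
   handling abbreviated): the two preallocation branches above are reached by passing either
   NULL or a user device pointer when the matrix is created.

     Mat          A, B;
     PetscScalar *d_user;

     PetscCallCUDA(cudaMalloc((void **)&d_user, (size_t)m * n * sizeof(PetscScalar)));
     PetscCall(MatCreateSeqDenseCUDA(PETSC_COMM_SELF, m, n, NULL, &A));   // PETSc allocates and zeros the device storage
     PetscCall(MatCreateSeqDenseCUDA(PETSC_COMM_SELF, m, n, d_user, &B)); // user-provided device storage, user_alloc = PETSC_TRUE
*/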
66: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
67: {
68: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
69: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
71: PetscFunctionBegin;
72: PetscCheckTypeName(A, MATSEQDENSECUDA);
73: PetscCall(PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing", A->rmap->n, A->cmap->n));
74: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
75: if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
76: PetscCall(MatSeqDenseSetPreallocation(A, NULL));
77: }
78: PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, A, 0, 0, 0));
79: if (cA->lda > A->rmap->n) {
80: PetscCallCUDA(cudaMemcpy2D(cA->v, cA->lda * sizeof(PetscScalar), dA->d_v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, cudaMemcpyDeviceToHost));
81: } else {
82: PetscCallCUDA(cudaMemcpy(cA->v, dA->d_v, cA->lda * sizeof(PetscScalar) * A->cmap->n, cudaMemcpyDeviceToHost));
83: }
84: PetscCall(PetscLogGpuToCpu(cA->lda * sizeof(PetscScalar) * A->cmap->n));
85: PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, A, 0, 0, 0));
87: A->offloadmask = PETSC_OFFLOAD_BOTH;
88: }
89: PetscFunctionReturn(PETSC_SUCCESS);
90: }
92: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
93: {
94: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
95: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
96: PetscBool copy;
98: PetscFunctionBegin;
99: PetscCheckTypeName(A, MATSEQDENSECUDA);
100: if (A->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
101: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
102: PetscCall(PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", A->rmap->n, A->cmap->n));
103: if (copy) {
104: if (!dA->d_v) { /* Allocate GPU memory if not present */
105: PetscCall(MatSeqDenseCUDASetPreallocation(A, NULL));
106: }
107: PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, A, 0, 0, 0));
108: if (cA->lda > A->rmap->n) {
109: PetscCallCUDA(cudaMemcpy2D(dA->d_v, cA->lda * sizeof(PetscScalar), cA->v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, cudaMemcpyHostToDevice));
110: } else {
111: PetscCallCUDA(cudaMemcpy(dA->d_v, cA->v, cA->lda * sizeof(PetscScalar) * A->cmap->n, cudaMemcpyHostToDevice));
112: }
113: PetscCall(PetscLogCpuToGpu(cA->lda * sizeof(PetscScalar) * A->cmap->n));
114: PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, A, 0, 0, 0));
116: A->offloadmask = PETSC_OFFLOAD_BOTH;
117: }
118: PetscFunctionReturn(PETSC_SUCCESS);
119: }
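/* Note on the pitched copies above (in both MatSeqDenseCUDACopyFromGPU and MatSeqDenseCUDACopyToGPU):
   when lda > m only the leading m entries of each column hold matrix data, so cudaMemcpy2D transfers
   n columns of m*sizeof(PetscScalar) bytes each, advancing by lda*sizeof(PetscScalar) bytes between
   consecutive columns on both source and destination; a flat cudaMemcpy would also move the
   (lda - m) padding entries of every column. */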
121: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A, Mat B, MatStructure str)
122: {
123: const PetscScalar *va;
124: PetscScalar *vb;
125: PetscInt lda1, lda2, m = A->rmap->n, n = A->cmap->n;
127: PetscFunctionBegin;
128: /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
129: if (A->ops->copy != B->ops->copy) {
130: PetscCall(MatCopy_Basic(A, B, str));
131: PetscFunctionReturn(PETSC_SUCCESS);
132: }
133: PetscCheck(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
134: PetscCall(MatDenseCUDAGetArrayRead(A, &va));
135: PetscCall(MatDenseCUDAGetArrayWrite(B, &vb));
136: PetscCall(MatDenseGetLDA(A, &lda1));
137: PetscCall(MatDenseGetLDA(B, &lda2));
138: PetscCall(PetscLogGpuTimeBegin());
139: if (lda1 > m || lda2 > m) {
140: PetscCallCUDA(cudaMemcpy2D(vb, lda2 * sizeof(PetscScalar), va, lda1 * sizeof(PetscScalar), m * sizeof(PetscScalar), n, cudaMemcpyDeviceToDevice));
141: } else {
142: PetscCallCUDA(cudaMemcpy(vb, va, m * (n * sizeof(PetscScalar)), cudaMemcpyDeviceToDevice));
143: }
144: PetscCall(PetscLogGpuTimeEnd());
145: PetscCall(MatDenseCUDARestoreArrayWrite(B, &vb));
146: PetscCall(MatDenseCUDARestoreArrayRead(A, &va));
147: PetscFunctionReturn(PETSC_SUCCESS);
148: }
150: static PetscErrorCode MatZeroEntries_SeqDenseCUDA(Mat A)
151: {
152: PetscScalar *va;
153: PetscInt lda, m = A->rmap->n, n = A->cmap->n;
155: PetscFunctionBegin;
156: PetscCall(MatDenseCUDAGetArrayWrite(A, &va));
157: PetscCall(MatDenseGetLDA(A, &lda));
158: PetscCall(PetscLogGpuTimeBegin());
159: if (lda > m) {
160: PetscCallCUDA(cudaMemset2D(va, lda * sizeof(PetscScalar), 0, m * sizeof(PetscScalar), n));
161: } else {
162: PetscCallCUDA(cudaMemset(va, 0, m * (n * sizeof(PetscScalar))));
163: }
164: PetscCall(PetscLogGpuTimeEnd());
165: PetscCall(MatDenseCUDARestoreArrayWrite(A, &va));
166: PetscFunctionReturn(PETSC_SUCCESS);
167: }
169: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
170: {
171: Mat_SeqDense *aa = (Mat_SeqDense *)A->data;
172: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
174: PetscFunctionBegin;
175: PetscCheck(!aa->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
176: PetscCheck(!aa->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
177: PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseCUDAResetArray() must be called first");
178: if (aa->v) PetscCall(MatSeqDenseCUDACopyToGPU(A));
179: dA->unplacedarray = dA->d_v;
180: dA->unplaced_user_alloc = dA->user_alloc;
181: dA->d_v = (PetscScalar *)a;
182: dA->user_alloc = PETSC_TRUE;
183: PetscFunctionReturn(PETSC_SUCCESS);
184: }
186: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
187: {
188: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
189: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
191: PetscFunctionBegin;
192: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
193: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
194: if (a->v) PetscCall(MatSeqDenseCUDACopyToGPU(A));
195: dA->d_v = dA->unplacedarray;
196: dA->user_alloc = dA->unplaced_user_alloc;
197: dA->unplacedarray = NULL;
198: PetscFunctionReturn(PETSC_SUCCESS);
199: }
201: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
202: {
203: Mat_SeqDense *aa = (Mat_SeqDense *)A->data;
204: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
206: PetscFunctionBegin;
207: PetscCheck(!aa->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
208: PetscCheck(!aa->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
209: PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseCUDAResetArray() must be called first");
210: if (!dA->user_alloc) PetscCallCUDA(cudaFree(dA->d_v));
211: dA->d_v = (PetscScalar *)a;
212: dA->user_alloc = PETSC_FALSE;
213: PetscFunctionReturn(PETSC_SUCCESS);
214: }
216: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
217: {
218: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
220: PetscFunctionBegin;
221: if (!dA->d_v) PetscCall(MatSeqDenseCUDASetPreallocation(A, NULL));
222: *a = dA->d_v;
223: PetscFunctionReturn(PETSC_SUCCESS);
224: }
226: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat, PetscScalar **a)
227: {
228: PetscFunctionBegin;
229: if (a) *a = NULL;
230: PetscFunctionReturn(PETSC_SUCCESS);
231: }
233: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
234: {
235: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
237: PetscFunctionBegin;
238: PetscCall(MatSeqDenseCUDACopyToGPU(A));
239: *a = dA->d_v;
240: PetscFunctionReturn(PETSC_SUCCESS);
241: }
243: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat, const PetscScalar **a)
244: {
245: PetscFunctionBegin;
246: if (a) *a = NULL;
247: PetscFunctionReturn(PETSC_SUCCESS);
248: }
250: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
251: {
252: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
254: PetscFunctionBegin;
255: PetscCall(MatSeqDenseCUDACopyToGPU(A));
256: *a = dA->d_v;
257: PetscFunctionReturn(PETSC_SUCCESS);
258: }
260: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat, PetscScalar **a)
261: {
262: PetscFunctionBegin;
263: if (a) *a = NULL;
264: PetscFunctionReturn(PETSC_SUCCESS);
265: }
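/* Usage sketch (illustrative, not part of this file): the accessor pairs above back the public
   MatDenseCUDAGetArray()/Read/Write and matching Restore functions. A typical read-only access of
   the device data, assuming A has type MATSEQDENSECUDA:

     const PetscScalar *d_a;
     PetscScalar        a00;
     PetscCall(MatDenseCUDAGetArrayRead(A, &d_a));                                      // host-to-device sync only if needed
     PetscCallCUDA(cudaMemcpy(&a00, d_a, sizeof(PetscScalar), cudaMemcpyDeviceToHost)); // read A(0,0)
     PetscCall(MatDenseCUDARestoreArrayRead(A, &d_a));
*/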
267: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
268: {
269: #if PETSC_PKG_CUDA_VERSION_GE(10, 1, 0)
270: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
271: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
272: PetscScalar *da;
273: cusolverDnHandle_t handle;
274: PetscCuBLASInt n, lda;
275: #if defined(PETSC_USE_DEBUG)
276: PetscCuBLASInt info;
277: #endif
279: PetscFunctionBegin;
280: if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
281: PetscCall(PetscCUSOLVERDnGetHandle(&handle));
282: PetscCall(PetscCuBLASIntCast(A->cmap->n, &n));
283: PetscCall(PetscCuBLASIntCast(a->lda, &lda));
284: PetscCheck(A->factortype != MAT_FACTOR_LU, PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDngetri not implemented");
285: if (A->factortype == MAT_FACTOR_CHOLESKY) {
286: if (!dA->d_fact_ipiv) { /* spd */
287: PetscCuBLASInt il;
289: PetscCall(MatDenseCUDAGetArray(A, &da));
290: PetscCallCUSOLVER(cusolverDnXpotri_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, &il));
291: if (il > dA->fact_lwork) {
292: dA->fact_lwork = il;
294: PetscCallCUDA(cudaFree(dA->d_fact_work));
295: PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
296: }
297: PetscCall(PetscLogGpuTimeBegin());
298: PetscCallCUSOLVER(cusolverDnXpotri(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
299: PetscCall(PetscLogGpuTimeEnd());
300: PetscCall(MatDenseCUDARestoreArray(A, &da));
301: /* TODO (write cuda kernel) */
302: PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
303: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDnsytri not implemented");
304: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Not implemented");
305: #if defined(PETSC_USE_DEBUG)
306: PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
307: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: leading minor of order %d is zero", info);
308: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
309: #endif
310: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
311: A->ops->solve = NULL;
312: A->ops->solvetranspose = NULL;
313: A->ops->matsolve = NULL;
314: A->factortype = MAT_FACTOR_NONE;
316: PetscCall(PetscFree(A->solvertype));
317: PetscFunctionReturn(PETSC_SUCCESS);
318: #else
319: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
320: #endif
321: }
323: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscBool))
324: {
325: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
326: PetscScalar *y;
327: PetscCuBLASInt m = 0, k = 0;
328: PetscBool xiscuda, yiscuda, aiscuda;
330: PetscFunctionBegin;
331: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
332: PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
333: PetscCall(PetscCuBLASIntCast(A->cmap->n, &k));
334: PetscCall(PetscObjectTypeCompare((PetscObject)xx, VECSEQCUDA, &xiscuda));
335: PetscCall(PetscObjectTypeCompare((PetscObject)yy, VECSEQCUDA, &yiscuda));
336: {
337: const PetscScalar *x;
338: PetscBool xishost = PETSC_TRUE;
340: /* The logic here is to minimize the amount of memory copying:
341: if xiscuda is true but the data has not been offloaded to the GPU yet,
342: calling VecCUDAGetArrayRead(xx,&x) would first copy the data to the
343: GPU. Since we only need the data in order to copy it into the y
344: array, we take x from wherever the data already resides, so that
345: only one memcpy is performed */
346: if (xiscuda && xx->offloadmask & PETSC_OFFLOAD_GPU) {
347: PetscCall(VecCUDAGetArrayRead(xx, &x));
348: xishost = PETSC_FALSE;
349: } else {
350: PetscCall(VecGetArrayRead(xx, &x));
351: }
352: if (k < m || !yiscuda) {
353: if (!dA->workvec) PetscCall(VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec)));
354: PetscCall(VecCUDAGetArrayWrite(dA->workvec, &y));
355: } else {
356: PetscCall(VecCUDAGetArrayWrite(yy, &y));
357: }
358: PetscCallCUDA(cudaMemcpy(y, x, m * sizeof(PetscScalar), xishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice));
359: }
360: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &aiscuda));
361: if (!aiscuda) PetscCall(MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A));
362: PetscCall((*matsolve)(A, y, m, m, 1, k, transpose));
363: if (!aiscuda) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
364: if (k < m || !yiscuda) {
365: PetscScalar *yv;
367: /* The logic here is that the data is not yet in either yy's GPU array or its
368: CPU array. There is nothing in the interface to say where the user would like
369: it to end up. So we choose the GPU, because it is the faster option */
370: if (yiscuda) {
371: PetscCall(VecCUDAGetArrayWrite(yy, &yv));
372: } else {
373: PetscCall(VecGetArray(yy, &yv));
374: }
375: PetscCallCUDA(cudaMemcpy(yv, y, k * sizeof(PetscScalar), yiscuda ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
376: if (yiscuda) {
377: PetscCall(VecCUDARestoreArrayWrite(yy, &yv));
378: } else {
379: PetscCall(VecRestoreArray(yy, &yv));
380: }
381: PetscCall(VecCUDARestoreArrayWrite(dA->workvec, &y));
382: } else {
383: PetscCall(VecCUDARestoreArrayWrite(yy, &y));
384: }
385: PetscFunctionReturn(PETSC_SUCCESS);
386: }
388: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Internal(Mat A, Mat B, Mat X, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscBool))
389: {
390: PetscScalar *y;
391: PetscInt n, _ldb, _ldx;
392: PetscBool biscuda, xiscuda, aiscuda;
393: PetscCuBLASInt nrhs = 0, m = 0, k = 0, ldb = 0, ldx = 0, ldy = 0;
395: PetscFunctionBegin;
396: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
397: PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
398: PetscCall(PetscCuBLASIntCast(A->cmap->n, &k));
399: PetscCall(MatGetSize(B, NULL, &n));
400: PetscCall(PetscCuBLASIntCast(n, &nrhs));
401: PetscCall(MatDenseGetLDA(B, &_ldb));
402: PetscCall(PetscCuBLASIntCast(_ldb, &ldb));
403: PetscCall(MatDenseGetLDA(X, &_ldx));
404: PetscCall(PetscCuBLASIntCast(_ldx, &ldx));
406: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
407: PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSECUDA, &xiscuda));
408: {
409: /* The logic here is to minimize the amount of memory copying:
410: if biscuda is true but the data has not been offloaded to the GPU yet,
411: calling MatDenseCUDAGetArrayRead(B,&b) would first copy the data to the
412: GPU. Since we only need the data in order to copy it into the y
413: array, we take b from wherever the data already resides, so that
414: only one memcpy is performed */
415: const PetscScalar *b;
417: /* some copying from B will be involved */
418: PetscBool bishost = PETSC_TRUE;
420: if (biscuda && B->offloadmask & PETSC_OFFLOAD_GPU) {
421: PetscCall(MatDenseCUDAGetArrayRead(B, &b));
422: bishost = PETSC_FALSE;
423: } else {
424: PetscCall(MatDenseGetArrayRead(B, &b));
425: }
426: if (ldx < m || !xiscuda) {
427: /* X's array cannot be used as the work array (too small or not on the device), and B's
428: * array cannot be used either (it is const), so allocate a new array */
429: ldy = m;
430: PetscCallCUDA(cudaMalloc((void **)&y, nrhs * m * sizeof(PetscScalar)));
431: } else {
432: /* X's array can be used directly as the work array */
433: ldy = ldx;
434: PetscCall(MatDenseCUDAGetArrayWrite(X, &y));
435: }
436: PetscCallCUDA(cudaMemcpy2D(y, ldy * sizeof(PetscScalar), b, ldb * sizeof(PetscScalar), m * sizeof(PetscScalar), nrhs, bishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice));
437: if (bishost) {
438: PetscCall(MatDenseRestoreArrayRead(B, &b));
439: } else {
440: PetscCall(MatDenseCUDARestoreArrayRead(B, &b));
441: }
442: }
443: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &aiscuda));
444: if (!aiscuda) PetscCall(MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A));
445: PetscCall((*matsolve)(A, y, ldy, m, nrhs, k, transpose));
446: if (!aiscuda) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
447: if (ldx < m || !xiscuda) {
448: PetscScalar *x;
450: /* The logic here is that the data is not yet in either X's GPU array or its
451: CPU array. There is nothing in the interface to say where the user would like
452: it to end up. So we choose the GPU, because it is the faster option */
453: if (xiscuda) {
454: PetscCall(MatDenseCUDAGetArrayWrite(X, &x));
455: } else {
456: PetscCall(MatDenseGetArray(X, &x));
457: }
458: PetscCallCUDA(cudaMemcpy2D(x, ldx * sizeof(PetscScalar), y, ldy * sizeof(PetscScalar), k * sizeof(PetscScalar), nrhs, xiscuda ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
459: if (xiscuda) {
460: PetscCall(MatDenseCUDARestoreArrayWrite(X, &x));
461: } else {
462: PetscCall(MatDenseRestoreArray(X, &x));
463: }
464: PetscCallCUDA(cudaFree(y));
465: } else {
466: PetscCall(MatDenseCUDARestoreArrayWrite(X, &y));
467: }
468: PetscFunctionReturn(PETSC_SUCCESS);
469: }
471: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_LU(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
472: {
473: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
474: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
475: const PetscScalar *da;
476: PetscCuBLASInt lda;
477: cusolverDnHandle_t handle;
478: int info;
480: PetscFunctionBegin;
481: PetscCall(MatDenseCUDAGetArrayRead(A, &da));
482: PetscCall(PetscCuBLASIntCast(mat->lda, &lda));
483: PetscCall(PetscCUSOLVERDnGetHandle(&handle));
484: PetscCall(PetscLogGpuTimeBegin());
485: PetscCall(PetscInfo(A, "LU solve %d x %d on backend\n", m, k));
486: PetscCallCUSOLVER(cusolverDnXgetrs(handle, T ? CUBLAS_OP_T : CUBLAS_OP_N, m, nrhs, da, lda, dA->d_fact_ipiv, x, ldx, dA->d_fact_info));
487: PetscCall(PetscLogGpuTimeEnd());
488: PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
489: if (PetscDefined(USE_DEBUG)) {
490: PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
491: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
492: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
493: }
494: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
495: PetscFunctionReturn(PETSC_SUCCESS);
496: }
498: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_Cholesky(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool)
499: {
500: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
501: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
502: const PetscScalar *da;
503: PetscCuBLASInt lda;
504: cusolverDnHandle_t handle;
505: int info;
507: PetscFunctionBegin;
508: PetscCall(MatDenseCUDAGetArrayRead(A, &da));
509: PetscCall(PetscCuBLASIntCast(mat->lda, &lda));
510: PetscCall(PetscCUSOLVERDnGetHandle(&handle));
511: PetscCall(PetscLogGpuTimeBegin());
512: PetscCall(PetscInfo(A, "Cholesky solve %d x %d on backend\n", m, k));
513: if (!dA->d_fact_ipiv) { /* spd */
514: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
515: PetscCallCUSOLVER(cusolverDnXpotrs(handle, CUBLAS_FILL_MODE_LOWER, m, nrhs, da, lda, x, ldx, dA->d_fact_info));
516: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDnsytrs not implemented");
517: PetscCall(PetscLogGpuTimeEnd());
518: PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
519: if (PetscDefined(USE_DEBUG)) {
520: PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
521: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
522: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
523: }
524: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
525: PetscFunctionReturn(PETSC_SUCCESS);
526: }
528: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_QR(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
529: {
530: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
531: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
532: const PetscScalar *da;
533: PetscCuBLASInt lda, rank;
534: cusolverDnHandle_t handle;
535: cublasHandle_t bhandle;
536: int info;
537: cublasOperation_t trans;
538: PetscScalar one = 1.;
540: PetscFunctionBegin;
541: PetscCall(PetscCuBLASIntCast(mat->rank, &rank));
542: PetscCall(MatDenseCUDAGetArrayRead(A, &da));
543: PetscCall(PetscCuBLASIntCast(mat->lda, &lda));
544: PetscCall(PetscCUSOLVERDnGetHandle(&handle));
545: PetscCall(PetscCUBLASGetHandle(&bhandle));
546: PetscCall(PetscLogGpuTimeBegin());
547: PetscCall(PetscInfo(A, "QR solve %d x %d on backend\n", m, k));
548: if (!T) {
549: if (PetscDefined(USE_COMPLEX)) {
550: trans = CUBLAS_OP_C;
551: } else {
552: trans = CUBLAS_OP_T;
553: }
554: PetscCallCUSOLVER(cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
555: if (PetscDefined(USE_DEBUG)) {
556: PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
557: PetscCheck(info == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
558: }
559: PetscCallCUBLAS(cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx));
560: } else {
561: PetscCallCUBLAS(cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx));
562: PetscCallCUSOLVER(cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
563: if (PetscDefined(USE_DEBUG)) {
564: PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
565: PetscCheck(info == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
566: }
567: }
568: PetscCall(PetscLogGpuTimeEnd());
569: PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
570: PetscCall(PetscLogGpuFlops(nrhs * (4.0 * m * mat->rank - PetscSqr(mat->rank))));
571: PetscFunctionReturn(PETSC_SUCCESS);
572: }
574: static PetscErrorCode MatSolve_SeqDenseCUDA_LU(Mat A, Vec xx, Vec yy)
575: {
576: PetscFunctionBegin;
577: PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU));
578: PetscFunctionReturn(PETSC_SUCCESS);
579: }
581: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_LU(Mat A, Vec xx, Vec yy)
582: {
583: PetscFunctionBegin;
584: PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU));
585: PetscFunctionReturn(PETSC_SUCCESS);
586: }
588: static PetscErrorCode MatSolve_SeqDenseCUDA_Cholesky(Mat A, Vec xx, Vec yy)
589: {
590: PetscFunctionBegin;
591: PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky));
592: PetscFunctionReturn(PETSC_SUCCESS);
593: }
595: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A, Vec xx, Vec yy)
596: {
597: PetscFunctionBegin;
598: PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky));
599: PetscFunctionReturn(PETSC_SUCCESS);
600: }
602: static PetscErrorCode MatSolve_SeqDenseCUDA_QR(Mat A, Vec xx, Vec yy)
603: {
604: PetscFunctionBegin;
605: PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR));
606: PetscFunctionReturn(PETSC_SUCCESS);
607: }
609: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_QR(Mat A, Vec xx, Vec yy)
610: {
611: PetscFunctionBegin;
612: PetscCall(MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR));
613: PetscFunctionReturn(PETSC_SUCCESS);
614: }
616: static PetscErrorCode MatMatSolve_SeqDenseCUDA_LU(Mat A, Mat B, Mat X)
617: {
618: PetscFunctionBegin;
619: PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU));
620: PetscFunctionReturn(PETSC_SUCCESS);
621: }
623: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_LU(Mat A, Mat B, Mat X)
624: {
625: PetscFunctionBegin;
626: PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU));
627: PetscFunctionReturn(PETSC_SUCCESS);
628: }
630: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Cholesky(Mat A, Mat B, Mat X)
631: {
632: PetscFunctionBegin;
633: PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky));
634: PetscFunctionReturn(PETSC_SUCCESS);
635: }
637: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A, Mat B, Mat X)
638: {
639: PetscFunctionBegin;
640: PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky));
641: PetscFunctionReturn(PETSC_SUCCESS);
642: }
644: static PetscErrorCode MatMatSolve_SeqDenseCUDA_QR(Mat A, Mat B, Mat X)
645: {
646: PetscFunctionBegin;
647: PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR));
648: PetscFunctionReturn(PETSC_SUCCESS);
649: }
651: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_QR(Mat A, Mat B, Mat X)
652: {
653: PetscFunctionBegin;
654: PetscCall(MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR));
655: PetscFunctionReturn(PETSC_SUCCESS);
656: }
658: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A, IS, IS, const MatFactorInfo *)
659: {
660: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
661: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
662: PetscScalar *da;
663: PetscCuBLASInt m, n, lda;
664: #if defined(PETSC_USE_DEBUG)
665: int info;
666: #endif
667: cusolverDnHandle_t handle;
669: PetscFunctionBegin;
670: if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
671: PetscCall(PetscCUSOLVERDnGetHandle(&handle));
672: PetscCall(MatDenseGetArrayAndMemType(A, &da, nullptr));
673: PetscCall(PetscCuBLASIntCast(A->cmap->n, &n));
674: PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
675: PetscCall(PetscCuBLASIntCast(a->lda, &lda));
676: PetscCall(PetscInfo(A, "LU factor %d x %d on backend\n", m, n));
677: if (!dA->d_fact_ipiv) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
678: if (!dA->fact_lwork) {
679: PetscCallCUSOLVER(cusolverDnXgetrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork));
680: PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
681: }
682: if (!dA->d_fact_info) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
683: PetscCall(PetscLogGpuTimeBegin());
684: PetscCallCUSOLVER(cusolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->d_fact_ipiv, dA->d_fact_info));
685: PetscCall(PetscLogGpuTimeEnd());
686: PetscCall(MatDenseRestoreArrayAndMemType(A, &da));
687: #if defined(PETSC_USE_DEBUG)
688: PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
689: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
690: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
691: #endif
692: A->factortype = MAT_FACTOR_LU;
693: PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
695: A->ops->solve = MatSolve_SeqDenseCUDA_LU;
696: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_LU;
697: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_LU;
698: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_LU;
700: PetscCall(PetscFree(A->solvertype));
701: PetscCall(PetscStrallocpy(MATSOLVERCUDA, &A->solvertype));
702: PetscFunctionReturn(PETSC_SUCCESS);
703: }
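/* Usage sketch (illustrative, not part of this file; b and x are assumed conforming vectors,
   error handling abbreviated): an in-place LU factorization and solve dispatch to the routines
   above when A has type MATSEQDENSECUDA.

     MatFactorInfo info;
     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatLUFactor(A, NULL, NULL, &info)); // -> MatLUFactor_SeqDenseCUDA
     PetscCall(MatSolve(A, b, x));                 // -> MatSolve_SeqDenseCUDA_LU
*/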
705: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A, IS, const MatFactorInfo *)
706: {
707: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
708: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
709: PetscScalar *da;
710: PetscCuBLASInt n, lda;
711: #if defined(PETSC_USE_DEBUG)
712: int info;
713: #endif
714: cusolverDnHandle_t handle;
716: PetscFunctionBegin;
717: if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
718: PetscCall(PetscCUSOLVERDnGetHandle(&handle));
719: PetscCall(PetscCuBLASIntCast(A->rmap->n, &n));
720: PetscCall(PetscInfo(A, "Cholesky factor %d x %d on backend\n", n, n));
721: if (A->spd == PETSC_BOOL3_TRUE) {
722: PetscCall(MatDenseCUDAGetArray(A, &da));
723: PetscCall(PetscCuBLASIntCast(a->lda, &lda));
724: if (!dA->fact_lwork) {
725: PetscCallCUSOLVER(cusolverDnXpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, &dA->fact_lwork));
726: PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
727: }
728: if (!dA->d_fact_info) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
729: PetscCall(PetscLogGpuTimeBegin());
730: PetscCallCUSOLVER(cusolverDnXpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
731: PetscCall(PetscLogGpuTimeEnd());
733: PetscCall(MatDenseCUDARestoreArray(A, &da));
734: #if defined(PETSC_USE_DEBUG)
735: PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
736: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
737: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
738: #endif
739: A->factortype = MAT_FACTOR_CHOLESKY;
740: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
741: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
742: #if 0
743: /* At the time this interface was written (CUDA 10.0), cusolverDn did not implement the *sytrs and *hetr* routines.
744: The code below should work and can be activated once the *sytrs routines become available */
745: if (!dA->d_fact_ipiv) {
746: PetscCallCUDA(cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv)));
747: }
748: if (!dA->fact_lwork) {
749: PetscCallCUSOLVER(cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork));
750: PetscCallCUDA(cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work)));
751: }
752: if (!dA->d_fact_info) {
753: PetscCallCUDA(cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info)));
754: }
755: PetscCall(PetscLogGpuTimeBegin());
756: PetscCallCUSOLVER(cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info));
757: PetscCall(PetscLogGpuTimeEnd());
758: #endif
760: A->ops->solve = MatSolve_SeqDenseCUDA_Cholesky;
761: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_Cholesky;
762: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_Cholesky;
763: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_Cholesky;
764: PetscCall(PetscFree(A->solvertype));
765: PetscCall(PetscStrallocpy(MATSOLVERCUDA, &A->solvertype));
766: PetscFunctionReturn(PETSC_SUCCESS);
767: }
769: static PetscErrorCode MatQRFactor_SeqDenseCUDA(Mat A, IS, const MatFactorInfo *)
770: {
771: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
772: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
773: PetscScalar *da;
774: PetscCuBLASInt m, min, max, n, lda;
775: #if defined(PETSC_USE_DEBUG)
776: int info;
777: #endif
778: cusolverDnHandle_t handle;
780: PetscFunctionBegin;
781: if (!A->rmap->n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
782: PetscCall(PetscCUSOLVERDnGetHandle(&handle));
783: PetscCall(MatDenseCUDAGetArray(A, &da));
784: PetscCall(PetscCuBLASIntCast(A->cmap->n, &n));
785: PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
786: PetscCall(PetscCuBLASIntCast(a->lda, &lda));
787: PetscCall(PetscInfo(A, "QR factor %d x %d on backend\n", m, n));
788: max = PetscMax(m, n);
789: min = PetscMin(m, n);
790: if (!dA->d_fact_tau) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_tau, min * sizeof(*dA->d_fact_tau)));
791: if (!dA->d_fact_ipiv) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv)));
792: if (!dA->fact_lwork) {
793: PetscCallCUSOLVER(cusolverDnXgeqrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork));
794: PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work)));
795: }
796: if (!dA->d_fact_info) PetscCallCUDA(cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info)));
797: if (!dA->workvec) PetscCall(VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec)));
798: PetscCall(PetscLogGpuTimeBegin());
799: PetscCallCUSOLVER(cusolverDnXgeqrf(handle, m, n, da, lda, dA->d_fact_tau, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info));
800: PetscCall(PetscLogGpuTimeEnd());
801: PetscCall(MatDenseCUDARestoreArray(A, &da));
802: #if defined(PETSC_USE_DEBUG)
803: PetscCallCUDA(cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost));
804: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cuSolver %d", -info);
805: #endif
806: A->factortype = MAT_FACTOR_QR;
807: a->rank = min;
808: PetscCall(PetscLogGpuFlops(2.0 * min * min * (max - min / 3.0)));
810: A->ops->solve = MatSolve_SeqDenseCUDA_QR;
811: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_QR;
812: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_QR;
813: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_QR;
815: PetscCall(PetscFree(A->solvertype));
816: PetscCall(PetscStrallocpy(MATSOLVERCUDA, &A->solvertype));
817: PetscFunctionReturn(PETSC_SUCCESS);
818: }
820: /* GEMM kernel: C = op(A)*op(B); tA and tB flag whether A and B are transposed */
821: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A, Mat B, Mat C, PetscBool tA, PetscBool tB)
822: {
823: const PetscScalar *da, *db;
824: PetscScalar *dc;
825: PetscScalar one = 1.0, zero = 0.0;
826: PetscCuBLASInt m, n, k;
827: PetscInt alda, blda, clda;
828: cublasHandle_t cublasv2handle;
829: PetscBool Aiscuda, Biscuda;
831: PetscFunctionBegin;
832: /* we may end up with SEQDENSE as one of the arguments */
833: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &Aiscuda));
834: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &Biscuda));
835: if (!Aiscuda) PetscCall(MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A));
836: if (!Biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
837: PetscCall(PetscCuBLASIntCast(C->rmap->n, &m));
838: PetscCall(PetscCuBLASIntCast(C->cmap->n, &n));
839: if (tA) PetscCall(PetscCuBLASIntCast(A->rmap->n, &k));
840: else PetscCall(PetscCuBLASIntCast(A->cmap->n, &k));
841: if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
842: PetscCall(PetscInfo(C, "Matrix-Matrix product %d x %d x %d on backend\n", m, k, n));
843: PetscCall(MatDenseCUDAGetArrayRead(A, &da));
844: PetscCall(MatDenseCUDAGetArrayRead(B, &db));
845: PetscCall(MatDenseCUDAGetArrayWrite(C, &dc));
846: PetscCall(MatDenseGetLDA(A, &alda));
847: PetscCall(MatDenseGetLDA(B, &blda));
848: PetscCall(MatDenseGetLDA(C, &clda));
849: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
850: PetscCall(PetscLogGpuTimeBegin());
851: PetscCallCUBLAS(cublasXgemm(cublasv2handle, tA ? CUBLAS_OP_T : CUBLAS_OP_N, tB ? CUBLAS_OP_T : CUBLAS_OP_N, m, n, k, &one, da, alda, db, blda, &zero, dc, clda));
852: PetscCall(PetscLogGpuTimeEnd());
853: PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
854: PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
855: PetscCall(MatDenseCUDARestoreArrayRead(B, &db));
856: PetscCall(MatDenseCUDARestoreArrayWrite(C, &dc));
857: if (!Aiscuda) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
858: if (!Biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
859: PetscFunctionReturn(PETSC_SUCCESS);
860: }
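/* Note on the cublasXgemm call above: C is m x n (m = C->rmap->n, n = C->cmap->n) and k is the
   contraction length, i.e. the number of rows of A when tA is true and the number of columns of A
   otherwise, so that op(A) is m x k and op(B) is k x n; alda, blda and clda are the leading
   dimensions of the column-major storage of A, B and C. */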
862: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
863: {
864: PetscFunctionBegin;
865: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_TRUE, PETSC_FALSE));
866: PetscFunctionReturn(PETSC_SUCCESS);
867: }
869: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
870: {
871: PetscFunctionBegin;
872: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_FALSE, PETSC_FALSE));
873: PetscFunctionReturn(PETSC_SUCCESS);
874: }
876: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
877: {
878: PetscFunctionBegin;
879: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_FALSE, PETSC_TRUE));
880: PetscFunctionReturn(PETSC_SUCCESS);
881: }
883: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
884: {
885: PetscFunctionBegin;
886: PetscCall(MatProductSetFromOptions_SeqDense(C));
887: PetscFunctionReturn(PETSC_SUCCESS);
888: }
890: /* zz = op(A)*xx + yy
891: if yy == NULL, this reduces to a plain MatMult */
892: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans)
893: {
894: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
895: const PetscScalar *xarray, *da;
896: PetscScalar *zarray;
897: PetscScalar one = 1.0, zero = 0.0;
898: PetscCuBLASInt m, n, lda;
899: cublasHandle_t cublasv2handle;
901: PetscFunctionBegin;
902: if (yy && yy != zz) PetscCall(VecSeq_CUDA::copy(yy, zz)); /* mult add */
903: if (!A->rmap->n || !A->cmap->n) {
904: /* mult only */
905: if (!yy) PetscCall(VecSeq_CUDA::set(zz, 0.0));
906: PetscFunctionReturn(PETSC_SUCCESS);
907: }
908: PetscCall(PetscInfo(A, "Matrix-vector product %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", A->rmap->n, A->cmap->n));
909: PetscCall(PetscCuBLASIntCast(A->rmap->n, &m));
910: PetscCall(PetscCuBLASIntCast(A->cmap->n, &n));
911: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
912: PetscCall(MatDenseCUDAGetArrayRead(A, &da));
913: PetscCall(PetscCuBLASIntCast(mat->lda, &lda));
914: PetscCall(VecCUDAGetArrayRead(xx, &xarray));
915: PetscCall(VecCUDAGetArray(zz, &zarray));
916: PetscCall(PetscLogGpuTimeBegin());
917: PetscCallCUBLAS(cublasXgemv(cublasv2handle, trans ? CUBLAS_OP_T : CUBLAS_OP_N, m, n, &one, da, lda, xarray, 1, (yy ? &one : &zero), zarray, 1));
918: PetscCall(PetscLogGpuTimeEnd());
919: PetscCall(PetscLogGpuFlops(2.0 * A->rmap->n * A->cmap->n - (yy ? 0 : A->rmap->n)));
920: PetscCall(VecCUDARestoreArrayRead(xx, &xarray));
921: PetscCall(VecCUDARestoreArray(zz, &zarray));
922: PetscCall(MatDenseCUDARestoreArrayRead(A, &da));
923: PetscFunctionReturn(PETSC_SUCCESS);
924: }
926: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A, Vec xx, Vec yy, Vec zz)
927: {
928: PetscFunctionBegin;
929: PetscCall(MatMultAdd_SeqDenseCUDA_Private(A, xx, yy, zz, PETSC_FALSE));
930: PetscFunctionReturn(PETSC_SUCCESS);
931: }
933: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A, Vec xx, Vec yy, Vec zz)
934: {
935: PetscFunctionBegin;
936: PetscCall(MatMultAdd_SeqDenseCUDA_Private(A, xx, yy, zz, PETSC_TRUE));
937: PetscFunctionReturn(PETSC_SUCCESS);
938: }
940: PetscErrorCode MatMult_SeqDenseCUDA(Mat A, Vec xx, Vec yy)
941: {
942: PetscFunctionBegin;
943: PetscCall(MatMultAdd_SeqDenseCUDA_Private(A, xx, NULL, yy, PETSC_FALSE));
944: PetscFunctionReturn(PETSC_SUCCESS);
945: }
947: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A, Vec xx, Vec yy)
948: {
949: PetscFunctionBegin;
950: PetscCall(MatMultAdd_SeqDenseCUDA_Private(A, xx, NULL, yy, PETSC_TRUE));
951: PetscFunctionReturn(PETSC_SUCCESS);
952: }
954: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **array)
955: {
956: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
958: PetscFunctionBegin;
959: PetscCall(MatSeqDenseCUDACopyFromGPU(A));
960: *array = mat->v;
961: PetscFunctionReturn(PETSC_SUCCESS);
962: }
964: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **array)
965: {
966: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
968: PetscFunctionBegin;
969: /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
970: if (!mat->v) PetscCall(MatSeqDenseSetPreallocation(A, NULL));
971: *array = mat->v;
972: A->offloadmask = PETSC_OFFLOAD_CPU;
973: PetscFunctionReturn(PETSC_SUCCESS);
974: }
976: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A, PetscScalar **array)
977: {
978: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
980: PetscFunctionBegin;
981: PetscCall(MatSeqDenseCUDACopyFromGPU(A));
982: *array = mat->v;
983: A->offloadmask = PETSC_OFFLOAD_CPU;
984: PetscFunctionReturn(PETSC_SUCCESS);
985: }
987: static PetscErrorCode MatDenseGetArrayAndMemType_SeqDenseCUDA(Mat A, PetscScalar **array, PetscMemType *mtype)
988: {
989: const auto dA = static_cast<Mat_SeqDenseCUDA *>(A->spptr);
991: PetscFunctionBegin;
992: PetscCall(MatSeqDenseCUDACopyToGPU(A)); // Since we will read the array on device, we sync the GPU data if necessary
993: *array = dA->d_v;
994: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
995: PetscFunctionReturn(PETSC_SUCCESS);
996: }
998: static PetscErrorCode MatDenseRestoreArrayAndMemType_SeqDenseCUDA(Mat A, PetscScalar **array)
999: {
1000: PetscFunctionBegin;
1001: *array = nullptr;
1002: A->offloadmask = PETSC_OFFLOAD_GPU; // Since we've written to the array on device
1003: PetscFunctionReturn(PETSC_SUCCESS);
1004: }
1006: static PetscErrorCode (*MatDenseGetArrayReadAndMemType_SeqDenseCUDA)(Mat, PetscScalar **, PetscMemType *) = MatDenseGetArrayAndMemType_SeqDenseCUDA;
1007: static PetscErrorCode (*MatDenseRestoreArrayReadAndMemType_SeqDenseCUDA)(Mat, PetscScalar **) = nullptr; // Keep the offload mask as is
1009: static PetscErrorCode MatDenseGetArrayWriteAndMemType_SeqDenseCUDA(Mat A, PetscScalar **array, PetscMemType *mtype)
1010: {
1011: const auto dA = static_cast<Mat_SeqDenseCUDA *>(A->spptr);
1013: PetscFunctionBegin;
1014: if (!dA->d_v) PetscCall(MatSeqDenseCUDASetPreallocation(A, nullptr)); // Allocate GPU memory if not present
1015: *array = dA->d_v;
1016: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1017: PetscFunctionReturn(PETSC_SUCCESS);
1018: }
1020: static PetscErrorCode (*MatDenseRestoreArrayWriteAndMemType_SeqDenseCUDA)(Mat, PetscScalar **) = MatDenseRestoreArrayAndMemType_SeqDenseCUDA; // Since we've written to the array on device
1022: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y, PetscScalar alpha)
1023: {
1024: Mat_SeqDense *y = (Mat_SeqDense *)Y->data;
1025: PetscScalar *dy;
1026: PetscCuBLASInt j, N, m, lday, one = 1;
1027: cublasHandle_t cublasv2handle;
1029: PetscFunctionBegin;
1030: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1031: PetscCall(MatDenseCUDAGetArray(Y, &dy));
1032: PetscCall(PetscCuBLASIntCast(Y->rmap->n * Y->cmap->n, &N));
1033: PetscCall(PetscCuBLASIntCast(Y->rmap->n, &m));
1034: PetscCall(PetscCuBLASIntCast(y->lda, &lday));
1035: PetscCall(PetscInfo(Y, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n));
1036: PetscCall(PetscLogGpuTimeBegin());
1037: if (lday > m) {
1038: for (j = 0; j < Y->cmap->n; j++) PetscCallCUBLAS(cublasXscal(cublasv2handle, m, &alpha, dy + lday * j, one));
1039: } else PetscCallCUBLAS(cublasXscal(cublasv2handle, N, &alpha, dy, one));
1040: PetscCall(PetscLogGpuTimeEnd());
1041: PetscCall(PetscLogGpuFlops(N));
1042: PetscCall(MatDenseCUDARestoreArray(Y, &dy));
1043: PetscFunctionReturn(PETSC_SUCCESS);
1044: }
1046: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1047: const PetscScalar shift_;
1048: petscshift(PetscScalar shift) : shift_(shift) { }
1049: __host__ __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1050: };
1052: template <typename Iterator>
1053: class strided_range {
1054: public:
1055: typedef typename thrust::iterator_difference<Iterator>::type difference_type;
1056: struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
1057: difference_type stride;
1058: stride_functor(difference_type stride) : stride(stride) { }
1059: __host__ __device__ difference_type operator()(const difference_type &i) const { return stride * i; }
1060: };
1061: typedef typename thrust::counting_iterator<difference_type> CountingIterator;
1062: typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
1063: typedef typename thrust::permutation_iterator<Iterator, TransformIterator> PermutationIterator;
1064: typedef PermutationIterator iterator; // type of the strided_range iterator
1065: // construct strided_range for the range [first,last)
1066: strided_range(Iterator first, Iterator last, difference_type stride) : first(first), last(last), stride(stride) { }
1067: iterator begin(void) const { return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); }
1068: iterator end(void) const { return begin() + ((last - first) + (stride - 1)) / stride; }
1070: protected:
1071: Iterator first;
1072: Iterator last;
1073: difference_type stride;
1074: };
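/* Minimal sketch of how the helpers above combine (illustrative; da, lda, n and alpha are
   placeholders): for an n x n matrix stored column-major with leading dimension lda, the diagonal
   entries sit at offsets 0, lda+1, 2*(lda+1), ..., so a strided_range with stride lda+1 visits
   exactly those entries and thrust::transform shifts them in place. This is what
   MatShift_DenseCUDA_Private below does with rstart = 0 and rend = cols = n.

     const auto dptr = thrust::device_pointer_cast(da);
     strided_range<thrust::device_vector<PetscScalar>::iterator> diag(dptr, dptr + n + n * lda, lda + 1);
     thrust::transform(diag.begin(), diag.end(), diag.begin(), petscshift(alpha));
*/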
1076: PetscErrorCode MatShift_DenseCUDA_Private(PetscScalar *da, PetscScalar alpha, PetscInt lda, PetscInt rstart, PetscInt rend, PetscInt cols)
1077: {
1078: PetscFunctionBegin;
1079: PetscInt rend2 = PetscMin(rend, cols);
1080: if (rend2 > rstart) {
1081: PetscCall(PetscLogGpuTimeBegin());
1082: try {
1083: const auto dptr = thrust::device_pointer_cast(da);
1084: size_t begin = rstart * lda;
1085: size_t end = rend2 - rstart + rend2 * lda;
1086: strided_range<thrust::device_vector<PetscScalar>::iterator> diagonal(dptr + begin, dptr + end, lda + 1);
1087: thrust::transform(diagonal.begin(), diagonal.end(), diagonal.begin(), petscshift(alpha));
1088: } catch (char *ex) {
1089: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1090: }
1091: PetscCall(PetscLogGpuTimeEnd());
1092: PetscCall(PetscLogGpuFlops(rend2 - rstart));
1093: }
1094: PetscFunctionReturn(PETSC_SUCCESS);
1095: }
1097: PetscErrorCode MatShift_SeqDenseCUDA(Mat A, PetscScalar alpha)
1098: {
1099: PetscScalar *da;
1100: PetscInt m = A->rmap->n, n = A->cmap->n, lda;
1102: PetscFunctionBegin;
1103: PetscCall(MatDenseCUDAGetArray(A, &da));
1104: PetscCall(MatDenseGetLDA(A, &lda));
1105: PetscCall(PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1106: PetscCall(MatShift_DenseCUDA_Private(da, alpha, lda, 0, m, n));
1107: PetscCall(MatDenseCUDARestoreArray(A, &da));
1108: PetscFunctionReturn(PETSC_SUCCESS);
1109: }
1111: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y, PetscScalar alpha, Mat X, MatStructure)
1112: {
1113: Mat_SeqDense *x = (Mat_SeqDense *)X->data;
1114: Mat_SeqDense *y = (Mat_SeqDense *)Y->data;
1115: const PetscScalar *dx;
1116: PetscScalar *dy;
1117: PetscCuBLASInt j, N, m, ldax, lday, one = 1;
1118: cublasHandle_t cublasv2handle;
1120: PetscFunctionBegin;
1121: if (!X->rmap->n || !X->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1122: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
1123: PetscCall(MatDenseCUDAGetArrayRead(X, &dx));
1124: if (alpha == 0.0) PetscCall(MatDenseCUDAGetArrayWrite(Y, &dy));
1125: else PetscCall(MatDenseCUDAGetArray(Y, &dy));
1126: PetscCall(PetscCuBLASIntCast(X->rmap->n * X->cmap->n, &N));
1127: PetscCall(PetscCuBLASIntCast(X->rmap->n, &m));
1128: PetscCall(PetscCuBLASIntCast(x->lda, &ldax));
1129: PetscCall(PetscCuBLASIntCast(y->lda, &lday));
1130: PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n));
1131: PetscCall(PetscLogGpuTimeBegin());
1132: if (ldax > m || lday > m) {
1133: for (j = 0; j < X->cmap->n; j++) PetscCallCUBLAS(cublasXaxpy(cublasv2handle, m, &alpha, dx + j * ldax, one, dy + j * lday, one));
1134: } else PetscCallCUBLAS(cublasXaxpy(cublasv2handle, N, &alpha, dx, one, dy, one));
1135: PetscCall(PetscLogGpuTimeEnd());
1136: PetscCall(PetscLogGpuFlops(PetscMax(2. * N - 1, 0)));
1137: PetscCall(MatDenseCUDARestoreArrayRead(X, &dx));
1138: if (alpha == 0.0) PetscCall(MatDenseCUDARestoreArrayWrite(Y, &dy));
1139: else PetscCall(MatDenseCUDARestoreArray(Y, &dy));
1140: PetscFunctionReturn(PETSC_SUCCESS);
1141: }
1143: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
1144: {
1145: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1147: PetscFunctionBegin;
1148: if (dA) {
1149: PetscCheck(!dA->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDenseCUDAResetArray() must be called first");
1150: if (!dA->user_alloc) PetscCallCUDA(cudaFree(dA->d_v));
1151: PetscCallCUDA(cudaFree(dA->d_fact_tau));
1152: PetscCallCUDA(cudaFree(dA->d_fact_ipiv));
1153: PetscCallCUDA(cudaFree(dA->d_fact_info));
1154: PetscCallCUDA(cudaFree(dA->d_fact_work));
1155: PetscCall(VecDestroy(&dA->workvec));
1156: }
1157: PetscCall(PetscFree(A->spptr));
1158: PetscFunctionReturn(PETSC_SUCCESS);
1159: }
1161: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
1162: {
1163: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1165: PetscFunctionBegin;
1166: /* prevent copying the data back if we own the data pointer */
1167: if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1168: PetscCall(MatConvert_SeqDenseCUDA_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1169: PetscCall(MatDestroy_SeqDense(A));
1170: PetscFunctionReturn(PETSC_SUCCESS);
1171: }
1173: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
1174: {
1175: MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;
1177: PetscFunctionBegin;
1178: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1179: PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
1180: PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
1181: PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hcpvalues));
1182: if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) PetscCall(MatCopy_SeqDenseCUDA(A, *B, SAME_NONZERO_PATTERN));
1183: if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
1184: Mat_SeqDenseCUDA *dB = (Mat_SeqDenseCUDA *)(*B)->spptr;
1185: if (!dB->d_v) PetscCall(MatSeqDenseCUDASetPreallocation(*B, NULL));
1186: }
1187: PetscFunctionReturn(PETSC_SUCCESS);
1188: }
1190: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A, Vec v, PetscInt col)
1191: {
1192: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1193: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1194: PetscScalar *x;
1195: PetscBool viscuda;
1197: PetscFunctionBegin;
1198: PetscCall(PetscObjectTypeCompareAny((PetscObject)v, &viscuda, VECSEQCUDA, VECMPICUDA, VECCUDA, ""));
1199: if (viscuda && !v->boundtocpu) { /* update device data */
1200: PetscCall(VecCUDAGetArrayWrite(v, &x));
1201: if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1202: PetscCallCUDA(cudaMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice));
1203: } else {
1204: PetscCallCUDA(cudaMemcpy(x, a->v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
1205: }
1206: PetscCall(VecCUDARestoreArrayWrite(v, &x));
1207: } else { /* update host data */
1208: PetscCall(VecGetArrayWrite(v, &x));
1209: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1210: PetscCall(PetscArraycpy(x, a->v + col * a->lda, A->rmap->n));
1211: } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1212: PetscCallCUDA(cudaMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1213: }
1214: PetscCall(VecRestoreArrayWrite(v, &x));
1215: }
1216: PetscFunctionReturn(PETSC_SUCCESS);
1217: }
1219: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A, MatFactorType ftype, Mat *fact)
1220: {
1221: PetscFunctionBegin;
1222: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), fact));
1223: PetscCall(MatSetSizes(*fact, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
1224: PetscCall(MatSetType(*fact, MATSEQDENSECUDA));
1225: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1226: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1227: (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1228: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1229: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1230: } else if (ftype == MAT_FACTOR_QR) {
1231: PetscCall(PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactor_C", MatQRFactor_SeqDense));
1232: PetscCall(PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1233: }
1234: (*fact)->factortype = ftype;
1235: PetscCall(PetscFree((*fact)->solvertype));
1236: PetscCall(PetscStrallocpy(MATSOLVERCUDA, &(*fact)->solvertype));
1237: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_LU]));
1238: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ILU]));
1239: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]));
1240: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ICC]));
1241: PetscFunctionReturn(PETSC_SUCCESS);
1242: }
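/* Usage sketch (illustrative, not part of this file; error handling abbreviated): obtaining a
   MATSOLVERCUDA factor through the generic interface, here for Cholesky. A is assumed SPD
   (e.g. set with MatSetOption(A, MAT_SPD, PETSC_TRUE)); b and x are conforming vectors.

     Mat           F;
     MatFactorInfo info;
     PetscCall(MatGetFactor(A, MATSOLVERCUDA, MAT_FACTOR_CHOLESKY, &F));
     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatCholeskyFactorSymbolic(F, A, NULL, &info));
     PetscCall(MatCholeskyFactorNumeric(F, A, &info));
     PetscCall(MatSolve(F, b, x));
*/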
1244: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1245: {
1246: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1248: PetscFunctionBegin;
1249: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1250: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1251: PetscCall(MatDenseCUDAGetArray(A, (PetscScalar **)&a->ptrinuse));
1252: if (!a->cvec) { /* pass the data of A to avoid allocating unneeded GPU memory the first time VecCUDAPlaceArray() is called */
1253: PetscCall(VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1254: }
1255: a->vecinuse = col + 1;
1256: PetscCall(VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1257: *v = a->cvec;
1258: PetscFunctionReturn(PETSC_SUCCESS);
1259: }
1261: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A, PetscInt, Vec *v)
1262: {
1263: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1265: PetscFunctionBegin;
1266: PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1267: PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1268: a->vecinuse = 0;
1269: PetscCall(VecCUDAResetArray(a->cvec));
1270: PetscCall(MatDenseCUDARestoreArray(A, (PetscScalar **)&a->ptrinuse));
1271: if (v) *v = NULL;
1272: PetscFunctionReturn(PETSC_SUCCESS);
1273: }
1275: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1276: {
1277: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1279: PetscFunctionBegin;
1280: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1281: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1282: PetscCall(MatDenseCUDAGetArrayRead(A, &a->ptrinuse));
1283: if (!a->cvec) { /* pass the data of A to avoid allocating unneeded GPU memory the first time VecCUDAPlaceArray() is called */
1284: PetscCall(VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1285: }
1286: a->vecinuse = col + 1;
1287: PetscCall(VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1288: PetscCall(VecLockReadPush(a->cvec));
1289: *v = a->cvec;
1290: PetscFunctionReturn(PETSC_SUCCESS);
1291: }
1293: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A, PetscInt, Vec *v)
1294: {
1295: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1297: PetscFunctionBegin;
1298: PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVecRead() first");
1299: PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1300: a->vecinuse = 0;
1301: PetscCall(VecLockReadPop(a->cvec));
1302: PetscCall(VecCUDAResetArray(a->cvec));
1303: PetscCall(MatDenseCUDARestoreArrayRead(A, &a->ptrinuse));
1304: if (v) *v = NULL;
1305: PetscFunctionReturn(PETSC_SUCCESS);
1306: }
1308: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1309: {
1310: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1312: PetscFunctionBegin;
1313: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1314: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1315: PetscCall(MatDenseCUDAGetArrayWrite(A, (PetscScalar **)&a->ptrinuse));
1316: if (!a->cvec) { /* pass the data of A to avoid allocating unneeded GPU memory the first time VecCUDAPlaceArray() is called */
1317: PetscCall(VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec));
1318: }
1319: a->vecinuse = col + 1;
1320: PetscCall(VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda));
1321: *v = a->cvec;
1322: PetscFunctionReturn(PETSC_SUCCESS);
1323: }
1325: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A, PetscInt, Vec *v)
1326: {
1327: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1329: PetscFunctionBegin;
1330: PetscCheck(a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVecWrite() first");
1331: PetscCheck(a->cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1332: a->vecinuse = 0;
1333: PetscCall(VecCUDAResetArray(a->cvec));
1334: PetscCall(MatDenseCUDARestoreArrayWrite(A, (PetscScalar **)&a->ptrinuse));
1335: if (v) *v = NULL;
1336: PetscFunctionReturn(PETSC_SUCCESS);
1337: }
1339: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
1340: {
1341: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1342: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1344: PetscFunctionBegin;
1345: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1346: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1347: if (a->cmat && (cend - cbegin != a->cmat->cmap->N || rend - rbegin != a->cmat->rmap->N)) PetscCall(MatDestroy(&a->cmat));
1348: PetscCall(MatSeqDenseCUDACopyToGPU(A));
1349: if (!a->cmat) {
1350: PetscCall(MatCreateDenseCUDA(PetscObjectComm((PetscObject)A), rend - rbegin, PETSC_DECIDE, rend - rbegin, cend - cbegin, dA->d_v + rbegin + (size_t)cbegin * a->lda, &a->cmat));
1351: } else {
1352: PetscCall(MatDenseCUDAPlaceArray(a->cmat, dA->d_v + rbegin + (size_t)cbegin * a->lda));
1353: }
1354: PetscCall(MatDenseSetLDA(a->cmat, a->lda));
1355: /* Place the CPU array if present, but do not copy any data */
1356: a->cmat->offloadmask = PETSC_OFFLOAD_GPU;
1357: if (a->v) PetscCall(MatDensePlaceArray(a->cmat, a->v + rbegin + (size_t)cbegin * a->lda));
1358: a->cmat->offloadmask = A->offloadmask;
1359: a->matinuse = cbegin + 1;
1360: *v = a->cmat;
1361: PetscFunctionReturn(PETSC_SUCCESS);
1362: }
1364: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A, Mat *v)
1365: {
1366: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1367: PetscBool copy = PETSC_FALSE, reset;
1368: PetscOffloadMask suboff;
1370: PetscFunctionBegin;
1371: PetscCheck(a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1372: PetscCheck(a->cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1373: PetscCheck(*v == a->cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1374: a->matinuse = 0;
1375: reset = a->v ? PETSC_TRUE : PETSC_FALSE;
1376: suboff = a->cmat->offloadmask; /* calls to ResetArray may change it, so save it here */
1377: if (suboff == PETSC_OFFLOAD_CPU && !a->v) {
1378: copy = PETSC_TRUE;
1379: PetscCall(MatSeqDenseSetPreallocation(A, NULL));
1380: }
1381: PetscCall(MatDenseCUDAResetArray(a->cmat));
1382: if (reset) PetscCall(MatDenseResetArray(a->cmat));
1383: if (copy) {
1384: PetscCall(MatSeqDenseCUDACopyFromGPU(A));
1385: } else A->offloadmask = (suboff == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1386: a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1387: if (v) *v = NULL;
1388: PetscFunctionReturn(PETSC_SUCCESS);
1389: }
1391: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A, PetscInt lda)
1392: {
1393: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
1394: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1395: PetscBool data;
1397: PetscFunctionBegin;
1398: data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1399: PetscCheck(dA->user_alloc || !data || cA->lda == lda, PETSC_COMM_SELF, PETSC_ERR_ORDER, "LDA cannot be changed after allocation of internal storage");
1400: PetscCheck(lda >= A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "LDA %" PetscInt_FMT " must be at least matrix dimension %" PetscInt_FMT, lda, A->rmap->n);
1401: cA->lda = lda;
1402: PetscFunctionReturn(PETSC_SUCCESS);
1403: }
1405: static PetscErrorCode MatSetUp_SeqDenseCUDA(Mat A)
1406: {
1407: PetscFunctionBegin;
1408: PetscCall(PetscLayoutSetUp(A->rmap));
1409: PetscCall(PetscLayoutSetUp(A->cmap));
1410: if (!A->preallocated) PetscCall(MatSeqDenseCUDASetPreallocation(A, NULL));
1411: PetscFunctionReturn(PETSC_SUCCESS);
1412: }
1414: static PetscErrorCode MatSetRandom_SeqDenseCUDA(Mat A, PetscRandom r)
1415: {
1416: PetscBool iscurand;
1418: PetscFunctionBegin;
1419: PetscCall(PetscObjectTypeCompare((PetscObject)r, PETSCCURAND, &iscurand));
1420: if (iscurand) {
1421: PetscScalar *a;
1422: PetscInt lda, m, n, mn = 0;
1424: PetscCall(MatGetSize(A, &m, &n));
1425: PetscCall(MatDenseGetLDA(A, &lda));
1426: PetscCall(MatDenseCUDAGetArrayWrite(A, &a));
1427: if (lda > m) {
1428: for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(r, m, a + i * lda));
1429: } else {
1430: PetscCall(PetscIntMultError(m, n, &mn));
1431: PetscCall(PetscRandomGetValues(r, mn, a));
1432: }
1433: PetscCall(MatDenseCUDARestoreArrayWrite(A, &a));
1434: } else PetscCall(MatSetRandom_SeqDense(A, r));
1435: PetscFunctionReturn(PETSC_SUCCESS);
1436: }
1438: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A, PetscBool flg)
1439: {
1440: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1442: PetscFunctionBegin;
1443: PetscCheck(!a->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1444: PetscCheck(!a->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1445: A->boundtocpu = flg;
1446: if (!flg) { /* not bound to the CPU: install the CUDA implementations of the MatDense query functions and matrix operations */
1447: PetscBool iscuda;
1449: PetscCall(PetscFree(A->defaultrandtype));
1450: PetscCall(PetscStrallocpy(PETSCCURAND, &A->defaultrandtype));
1451: PetscCall(PetscObjectTypeCompare((PetscObject)a->cvec, VECSEQCUDA, &iscuda));
1452: if (!iscuda) PetscCall(VecDestroy(&a->cvec));
1453: PetscCall(PetscObjectTypeCompare((PetscObject)a->cmat, MATSEQDENSECUDA, &iscuda));
1454: if (!iscuda) PetscCall(MatDestroy(&a->cmat));
1455: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDenseCUDA));
1456: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseCUDA));
1457: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_SeqDenseCUDA));
1458: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDenseCUDA));
1459: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDenseCUDA));
1460: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDenseCUDA));
1461: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDenseCUDA));
1462: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDenseCUDA));
1463: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDenseCUDA));
1464: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDenseCUDA));
1465: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDenseCUDA));
1466: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDenseCUDA));
1467: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDenseCUDA));
1469: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayAndMemType_C", MatDenseGetArrayAndMemType_SeqDenseCUDA));
1470: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayAndMemType_C", MatDenseRestoreArrayAndMemType_SeqDenseCUDA));
1471: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayReadAndMemType_C", MatDenseGetArrayReadAndMemType_SeqDenseCUDA));
1472: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayReadAndMemType_C", MatDenseRestoreArrayReadAndMemType_SeqDenseCUDA));
1473: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWriteAndMemType_C", MatDenseGetArrayWriteAndMemType_SeqDenseCUDA));
1474: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayWriteAndMemType_C", MatDenseRestoreArrayWriteAndMemType_SeqDenseCUDA));
1476: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
1477: A->ops->mult = MatMult_SeqDenseCUDA;
1478: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
1479: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
1480: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
1481: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1482: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1483: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1484: A->ops->axpy = MatAXPY_SeqDenseCUDA;
1485: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
1486: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
1487: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseCUDA;
1488: A->ops->getcolumnvector = MatGetColumnVector_SeqDenseCUDA;
1489: A->ops->scale = MatScale_SeqDenseCUDA;
1490: A->ops->shift = MatShift_SeqDenseCUDA;
1491: A->ops->copy = MatCopy_SeqDenseCUDA;
1492: A->ops->zeroentries = MatZeroEntries_SeqDenseCUDA;
1493: A->ops->setup = MatSetUp_SeqDenseCUDA;
1494: A->ops->setrandom = MatSetRandom_SeqDenseCUDA;
1495: } else { /* bound to the CPU: fall back to the host (SeqDense) implementations */
1496: /* make sure we have an up-to-date copy on the CPU */
1497: PetscCall(MatSeqDenseCUDACopyFromGPU(A));
1498: PetscCall(PetscFree(A->defaultrandtype));
1499: PetscCall(PetscStrallocpy(PETSCRANDER48, &A->defaultrandtype));
1500: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDense));
1501: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense));
1502: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense));
1503: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense));
1504: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense));
1505: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense));
1506: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense));
1507: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense));
1508: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense));
1509: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense));
1510: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense));
1511: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1512: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDense));
1514: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayAndMemType_C", NULL));
1515: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayAndMemType_C", NULL));
1516: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayReadAndMemType_C", NULL));
1517: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayReadAndMemType_C", NULL));
1518: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWriteAndMemType_C", NULL));
1519: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreArrayWriteAndMemType_C", NULL));
1521: A->ops->duplicate = MatDuplicate_SeqDense;
1522: A->ops->mult = MatMult_SeqDense;
1523: A->ops->multadd = MatMultAdd_SeqDense;
1524: A->ops->multtranspose = MatMultTranspose_SeqDense;
1525: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
1526: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1527: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
1528: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1529: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1530: A->ops->axpy = MatAXPY_SeqDense;
1531: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
1532: A->ops->lufactor = MatLUFactor_SeqDense;
1533: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1534: A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
1535: A->ops->scale = MatScale_SeqDense;
1536: A->ops->shift = MatShift_SeqDense;
1537: A->ops->copy = MatCopy_SeqDense;
1538: A->ops->zeroentries = MatZeroEntries_SeqDense;
1539: A->ops->setup = MatSetUp_SeqDense;
1540: A->ops->setrandom = MatSetRandom_SeqDense;
1541: }
1542: if (a->cmat) PetscCall(MatBindToCPU(a->cmat, flg));
1543: PetscFunctionReturn(PETSC_SUCCESS);
1544: }
1546: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1547: {
1548: Mat B;
1549: Mat_SeqDense *a;
1551: PetscFunctionBegin;
1552: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1553: /* TODO these cases should be optimized */
1554: PetscCall(MatConvert_Basic(M, type, reuse, newmat));
1555: PetscFunctionReturn(PETSC_SUCCESS);
1556: }
1558: B = *newmat;
1559: PetscCall(MatBindToCPU_SeqDenseCUDA(B, PETSC_TRUE));
1560: PetscCall(MatReset_SeqDenseCUDA(B));
1561: PetscCall(PetscFree(B->defaultvectype));
1562: PetscCall(PetscStrallocpy(VECSTANDARD, &B->defaultvectype));
1563: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSE));
1564: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensecuda_seqdense_C", NULL));
1565: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArray_C", NULL));
1566: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayRead_C", NULL));
1567: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayWrite_C", NULL));
1568: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArray_C", NULL));
1569: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayRead_C", NULL));
1570: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayWrite_C", NULL));
1571: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAPlaceArray_C", NULL));
1572: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAResetArray_C", NULL));
1573: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAReplaceArray_C", NULL));
1574: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensecuda_C", NULL));
1575: a = (Mat_SeqDense *)B->data;
1576: PetscCall(VecDestroy(&a->cvec)); /* cvec might be VECSEQCUDA. Destroy it and rebuild a VECSEQ when needed */
1577: B->ops->bindtocpu = NULL;
1578: B->ops->destroy = MatDestroy_SeqDense;
1579: B->offloadmask = PETSC_OFFLOAD_CPU;
1580: PetscFunctionReturn(PETSC_SUCCESS);
1581: }
1583: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1584: {
1585: Mat_SeqDenseCUDA *dB;
1586: Mat B;
1587: Mat_SeqDense *a;
1589: PetscFunctionBegin;
1590: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1591: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1592: /* TODO these cases should be optimized */
1593: PetscCall(MatConvert_Basic(M, type, reuse, newmat));
1594: PetscFunctionReturn(PETSC_SUCCESS);
1595: }
1597: B = *newmat;
1598: PetscCall(PetscFree(B->defaultvectype));
1599: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
1600: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSECUDA));
1601: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense));
1602: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA));
1603: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA));
1604: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA));
1605: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA));
1606: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA));
1607: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA));
1608: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA));
1609: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA));
1610: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA));
1611: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensecuda_C", MatProductSetFromOptions_SeqAIJ_SeqDense));
1612: a = (Mat_SeqDense *)B->data;
1613: PetscCall(VecDestroy(&a->cvec)); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQCUDA when needed */
1614: PetscCall(PetscNew(&dB));
1616: B->spptr = dB;
1617: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1619: PetscCall(MatBindToCPU_SeqDenseCUDA(B, PETSC_FALSE));
1620: B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1621: B->ops->destroy = MatDestroy_SeqDenseCUDA;
1622: PetscFunctionReturn(PETSC_SUCCESS);
1623: }
1625: /*@C
1626: MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.
1628: Collective
1630: Input Parameters:
1631: + comm - MPI communicator
1632: . m - number of rows
1633: . n - number of columns
1634: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1635: to control matrix memory allocation.
1637: Output Parameter:
1638: . A - the matrix
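 Example Usage:
 A minimal sketch of typical usage (error checking with `PetscCall()` omitted for brevity; the sizes 4 and 2 and the inserted value are placeholders):
.vb
   Mat A;

   MatCreateSeqDenseCUDA(PETSC_COMM_SELF, 4, 2, NULL, &A);
   MatSetValue(A, 0, 0, 1.0, INSERT_VALUES);
   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
   MatDestroy(&A);
.ve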
1640: Level: intermediate
1642: .seealso: `MATSEQDENSE`, `MatCreate()`, `MatCreateSeqDense()`
1643: @*/
1644: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A)
1645: {
1646: PetscMPIInt size;
1648: PetscFunctionBegin;
1649: PetscCallMPI(MPI_Comm_size(comm, &size));
1650: PetscCheck(size <= 1, comm, PETSC_ERR_ARG_WRONG, "Invalid communicator size %d", size);
1651: PetscCall(MatCreate(comm, A));
1652: PetscCall(MatSetSizes(*A, m, n, m, n));
1653: PetscCall(MatSetType(*A, MATSEQDENSECUDA));
1654: PetscCall(MatSeqDenseCUDASetPreallocation(*A, data));
1655: PetscFunctionReturn(PETSC_SUCCESS);
1656: }
1658: /*MC
1659: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
1661: Options Database Keys:
1662: . -mat_type seqdensecuda - sets the matrix type to `MATSEQDENSECUDA` during a call to MatSetFromOptions()
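 Example Usage:
 A minimal sketch (error checking omitted; m and n are placeholder sizes); running the program with -mat_type seqdensecuda then selects this type:
.vb
   Mat      A;
   PetscInt m = 4, n = 2;

   MatCreate(PETSC_COMM_SELF, &A);
   MatSetSizes(A, m, n, m, n);
   MatSetFromOptions(A);
   MatSetUp(A);
.ve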
1664: Level: beginner
1666: .seealso: `MATSEQDENSE`
1667: M*/
1668: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1669: {
1670: PetscFunctionBegin;
1671: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1672: PetscCall(MatCreate_SeqDense(B));
1673: PetscCall(MatConvert_SeqDense_SeqDenseCUDA(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
1674: PetscFunctionReturn(PETSC_SUCCESS);
1675: }