Actual source code: aijcusparse.cu
petsc-master 2020-08-25
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9: #include <petscconf.h>
10: #include <../src/mat/impls/aij/seq/aij.h>
11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
12: #include <../src/vec/vec/impls/dvecimpl.h>
13: #include <petsc/private/vecimpl.h>
14: #undef VecType
15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
17: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
27: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
37: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
38: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
40: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
41: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
42: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
43: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
44: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
45: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
47: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
48: {
49: cusparseStatus_t stat;
50: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
53: cusparsestruct->stream = stream;
54: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
55: return(0);
56: }
58: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
59: {
60: cusparseStatus_t stat;
61: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
64: if (cusparsestruct->handle != handle) {
65: if (cusparsestruct->handle) {
66: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
67: }
68: cusparsestruct->handle = handle;
69: }
70: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
71: return(0);
72: }
74: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
75: {
76: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
79: if (cusparsestruct->handle) cusparsestruct->handle = 0;
80: return(0);
81: }
83: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
84: {
86: *type = MATSOLVERCUSPARSE;
87: return(0);
88: }
90: /*MC
91: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
92: on a single GPU, of type seqaijcusparse/aijcusparse or seqaijcusp/aijcusp. The currently supported
93: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
94: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
95: CUSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
96: algorithms are not recommended. This class does NOT support direct solver operations.
98: Level: beginner
100: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
101: M*/
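/* Illustrative usage sketch (not part of the original file): selecting this solver
   for an ILU preconditioner from application code. A is assumed to be an assembled
   MATSEQAIJCUSPARSE matrix; b and x are assumed to be existing CUDA vectors.

     KSP            ksp;
     PC             pc;
     PetscErrorCode ierr;
     ierr = KSPCreate(PETSC_COMM_SELF,&ksp);CHKERRQ(ierr);
     ierr = KSPSetOperators(ksp,A,A);CHKERRQ(ierr);
     ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
     ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);
     ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr);
     ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);
     ierr = KSPDestroy(&ksp);CHKERRQ(ierr);

   The same selection is available at runtime with -pc_factor_mat_solver_type cusparse. */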
103: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
104: {
106: PetscInt n = A->rmap->n;
109: MatCreate(PetscObjectComm((PetscObject)A),B);
110: MatSetSizes(*B,n,n,n,n);
111: (*B)->factortype = ftype;
112: (*B)->useordering = PETSC_TRUE;
113: MatSetType(*B,MATSEQAIJCUSPARSE);
115: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
116: MatSetBlockSizesFromMats(*B,A,A);
117: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
118: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
119: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
120: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
121: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
122: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
124: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
125: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
126: return(0);
127: }
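/* Illustrative sketch (assumptions: A is an assembled MATSEQAIJCUSPARSE matrix,
   b and x are CUDA vectors): driving the factorization directly through the
   generic MatGetFactor() interface, which dispatches to the routine above.

     Mat            F;
     IS             row,col;
     MatFactorInfo  info;
     PetscErrorCode ierr;
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);CHKERRQ(ierr);
     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     ierr = MatGetOrdering(A,MATORDERINGNATURAL,&row,&col);CHKERRQ(ierr);
     ierr = MatILUFactorSymbolic(F,A,row,col,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);
*/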
129: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
130: {
131: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
134: switch (op) {
135: case MAT_CUSPARSE_MULT:
136: cusparsestruct->format = format;
137: break;
138: case MAT_CUSPARSE_ALL:
139: cusparsestruct->format = format;
140: break;
141: default:
142: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
143: }
144: return(0);
145: }
147: /*@
148: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149: operation. Only the MatMult operation can use different GPU storage formats
150: for MPIAIJCUSPARSE matrices.
151: Not Collective
153: Input Parameters:
154: + A - Matrix of type SEQAIJCUSPARSE
155: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
156: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)
160: Level: intermediate
162: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163: @*/
164: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
165: {
170: PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
171: return(0);
172: }
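/* Illustrative sketch: switching the SpMV storage format of an assembled
   MATSEQAIJCUSPARSE matrix A to HYB (assumes a CUDA version that still ships
   the cuSPARSE HYB format):

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_HYB);CHKERRQ(ierr);

   The runtime equivalent is -mat_cusparse_mult_storage_format hyb. */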
174: /*@
175: MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
177: Collective on mat
179: Input Parameters:
180: + A - Matrix of type SEQAIJCUSPARSE
181: - transgen - the boolean flag
183: Level: intermediate
185: .seealso: MATSEQAIJCUSPARSE
186: @*/
187: PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
188: {
190: PetscBool flg;
194: PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);
195: if (flg) {
196: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
198: if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
199: cusp->transgen = transgen;
200: if (!transgen) { /* destroy the transpose matrix if present, to prevent logic errors if transgen is set to PETSC_TRUE later */
201: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
202: }
203: }
204: return(0);
205: }
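/* Illustrative sketch: requesting an explicit transpose so that repeated
   MatMultTranspose() calls reuse it instead of rebuilding it:

     ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

   The runtime equivalent is -mat_cusparse_transgen. */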
207: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
208: {
209: PetscErrorCode ierr;
210: MatCUSPARSEStorageFormat format;
211: PetscBool flg;
212: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
215: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
216: if (A->factortype == MAT_FACTOR_NONE) {
217: PetscBool transgen = cusparsestruct->transgen;
219: PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);
220: if (flg) {
221: MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);
222: }
223: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
224: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
225: if (flg) {
226: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);
227: }
228: }
229: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
230: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
231: if (flg) {
232: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
233: }
234: PetscOptionsTail();
235: return(0);
236: }
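/* Illustrative option-database sketch (program name hypothetical): the handlers
   above enable runtime configuration such as

     ./app -mat_type seqaijcusparse -mat_cusparse_transgen -mat_cusparse_mult_storage_format ell

   where -mat_cusparse_mult_storage_format affects only SpMV, while
   -mat_cusparse_storage_format sets the format for both SpMV and TriSolve. */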
238: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
239: {
243: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
244: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
245: return(0);
246: }
248: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
249: {
253: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
254: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
255: return(0);
256: }
258: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
259: {
263: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
264: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
265: return(0);
266: }
268: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
269: {
273: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
274: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
275: return(0);
276: }
278: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
279: {
280: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
281: PetscInt n = A->rmap->n;
282: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
283: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
284: cusparseStatus_t stat;
285: const PetscInt *ai = a->i,*aj = a->j,*vi;
286: const MatScalar *aa = a->a,*v;
287: PetscInt *AiLo, *AjLo;
288: PetscScalar *AALo;
289: PetscInt i,nz, nzLower, offset, rowOffset;
290: PetscErrorCode ierr;
291: cudaError_t cerr;
294: if (!n) return(0);
295: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
296: try {
297: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
298: nzLower=n+ai[n]-ai[1];
300: /* Allocate Space for the lower triangular matrix */
301: cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
302: cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
303: cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
305: /* Fill the lower triangular matrix */
306: AiLo[0] = (PetscInt) 0;
307: AiLo[n] = nzLower;
308: AjLo[0] = (PetscInt) 0;
309: AALo[0] = (MatScalar) 1.0;
310: v = aa;
311: vi = aj;
312: offset = 1;
313: rowOffset= 1;
314: for (i=1; i<n; i++) {
315: nz = ai[i+1] - ai[i];
316: /* additional 1 for the term on the diagonal */
317: AiLo[i] = rowOffset;
318: rowOffset += nz+1;
320: PetscArraycpy(&(AjLo[offset]), vi, nz);
321: PetscArraycpy(&(AALo[offset]), v, nz);
323: offset += nz;
324: AjLo[offset] = (PetscInt) i;
325: AALo[offset] = (MatScalar) 1.0;
326: offset += 1;
328: v += nz;
329: vi += nz;
330: }
332: /* allocate space for the triangular factor information */
333: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
335: /* Create the matrix description */
336: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
337: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
338: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
339: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
340: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
342: /* Create the solve analysis information */
343: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
345: /* set the operation */
346: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
348: /* set the matrix */
349: loTriFactor->csrMat = new CsrMatrix;
350: loTriFactor->csrMat->num_rows = n;
351: loTriFactor->csrMat->num_cols = n;
352: loTriFactor->csrMat->num_entries = nzLower;
354: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
355: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
357: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
358: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
360: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
361: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
363: /* perform the solve analysis */
364: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
365: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
366: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
367: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
369: /* assign the pointer. Is this really necessary? */
370: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
372: cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
373: cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
374: cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
375: PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
376: } catch(char *ex) {
377: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
378: }
379: }
380: return(0);
381: }
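/* Worked example (illustrative) of the layout built above: for n = 3 with a
   single strictly lower factor entry L(2,0) = 4.0, i.e.

       [ 1 0 0 ]
   L = [ 0 1 0 ]
       [ 4 0 1 ]

   the factor's L part has ai = {0,0,0,1}, so nzLower = n + ai[n] - ai[1] = 4 and
   the host arrays become

     AiLo = {0, 1, 2, 4}           (row offsets)
     AjLo = {0, 1, 0, 2}           (each row: strictly lower entries first, then the unit diagonal)
     AALo = {1.0, 1.0, 4.0, 1.0}

   before they are copied into the Mat_SeqAIJCUSPARSETriFactorStruct. */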
383: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
384: {
385: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
386: PetscInt n = A->rmap->n;
387: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
388: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
389: cusparseStatus_t stat;
390: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
391: const MatScalar *aa = a->a,*v;
392: PetscInt *AiUp, *AjUp;
393: PetscScalar *AAUp;
394: PetscInt i,nz, nzUpper, offset;
395: PetscErrorCode ierr;
396: cudaError_t cerr;
399: if (!n) return(0);
400: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
401: try {
402: /* figure out the number of nonzeros in the upper triangular matrix. */
403: nzUpper = adiag[0]-adiag[n];
405: /* Allocate Space for the upper triangular matrix */
406: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
407: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
408: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
410: /* Fill the upper triangular matrix */
411: AiUp[0]=(PetscInt) 0;
412: AiUp[n]=nzUpper;
413: offset = nzUpper;
414: for (i=n-1; i>=0; i--) {
415: v = aa + adiag[i+1] + 1;
416: vi = aj + adiag[i+1] + 1;
418: /* number of elements NOT on the diagonal */
419: nz = adiag[i] - adiag[i+1]-1;
421: /* decrement the offset */
422: offset -= (nz+1);
424: /* first, set the diagonal elements */
425: AjUp[offset] = (PetscInt) i;
426: AAUp[offset] = (MatScalar)1./v[nz];
427: AiUp[i] = AiUp[i+1] - (nz+1);
429: PetscArraycpy(&(AjUp[offset+1]), vi, nz);
430: PetscArraycpy(&(AAUp[offset+1]), v, nz);
431: }
433: /* allocate space for the triangular factor information */
434: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
436: /* Create the matrix description */
437: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
438: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
439: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
440: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
441: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
443: /* Create the solve analysis information */
444: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
446: /* set the operation */
447: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
449: /* set the matrix */
450: upTriFactor->csrMat = new CsrMatrix;
451: upTriFactor->csrMat->num_rows = n;
452: upTriFactor->csrMat->num_cols = n;
453: upTriFactor->csrMat->num_entries = nzUpper;
455: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
456: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
458: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
459: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
461: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
462: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
464: /* perform the solve analysis */
465: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
466: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
467: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
468: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
470: /* assign the pointer. Is this really necessary? */
471: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
473: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
474: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
475: cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
476: PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
477: } catch(char *ex) {
478: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
479: }
480: }
481: return(0);
482: }
484: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
485: {
486: PetscErrorCode ierr;
487: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
488: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
489: IS isrow = a->row,iscol = a->icol;
490: PetscBool row_identity,col_identity;
491: const PetscInt *r,*c;
492: PetscInt n = A->rmap->n;
495: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
496: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
497: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
499: cusparseTriFactors->workVector = new THRUSTARRAY(n);
500: cusparseTriFactors->nnz=a->nz;
502: A->offloadmask = PETSC_OFFLOAD_BOTH;
503: /* lower triangular indices */
504: ISGetIndices(isrow,&r);
505: ISIdentity(isrow,&row_identity);
506: if (!row_identity) {
507: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
508: cusparseTriFactors->rpermIndices->assign(r, r+n);
509: }
510: ISRestoreIndices(isrow,&r);
512: /* upper triangular indices */
513: ISGetIndices(iscol,&c);
514: ISIdentity(iscol,&col_identity);
515: if (!col_identity) {
516: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
517: cusparseTriFactors->cpermIndices->assign(c, c+n);
518: }
520: if (!row_identity && !col_identity) {
521: PetscLogCpuToGpu(2*n*sizeof(PetscInt));
522: } else if (!row_identity) {
523: PetscLogCpuToGpu(n*sizeof(PetscInt));
524: } else if (!col_identity) {
525: PetscLogCpuToGpu(n*sizeof(PetscInt));
526: }
528: ISRestoreIndices(iscol,&c);
529: return(0);
530: }
532: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
533: {
534: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
535: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
536: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
537: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
538: cusparseStatus_t stat;
539: PetscErrorCode ierr;
540: cudaError_t cerr;
541: PetscInt *AiUp, *AjUp;
542: PetscScalar *AAUp;
543: PetscScalar *AALo;
544: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
545: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
546: const PetscInt *ai = b->i,*aj = b->j,*vj;
547: const MatScalar *aa = b->a,*v;
550: if (!n) return(0);
551: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
552: try {
553: /* Allocate Space for the upper triangular matrix */
554: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
555: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
556: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
557: cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
559: /* Fill the upper triangular matrix */
560: AiUp[0]=(PetscInt) 0;
561: AiUp[n]=nzUpper;
562: offset = 0;
563: for (i=0; i<n; i++) {
564: /* set the pointers */
565: v = aa + ai[i];
566: vj = aj + ai[i];
567: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
569: /* first, set the diagonal elements */
570: AjUp[offset] = (PetscInt) i;
571: AAUp[offset] = (MatScalar)1.0/v[nz];
572: AiUp[i] = offset;
573: AALo[offset] = (MatScalar)1.0/v[nz];
575: offset+=1;
576: if (nz>0) {
577: PetscArraycpy(&(AjUp[offset]), vj, nz);
578: PetscArraycpy(&(AAUp[offset]), v, nz);
579: for (j=offset; j<offset+nz; j++) {
580: AAUp[j] = -AAUp[j];
581: AALo[j] = AAUp[j]/v[nz];
582: }
583: offset+=nz;
584: }
585: }
587: /* allocate space for the triangular factor information */
588: upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
590: /* Create the matrix description */
591: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
592: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
593: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
594: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
595: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
597: /* Create the solve analysis information */
598: stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
600: /* set the operation */
601: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
603: /* set the matrix */
604: upTriFactor->csrMat = new CsrMatrix;
605: upTriFactor->csrMat->num_rows = A->rmap->n;
606: upTriFactor->csrMat->num_cols = A->cmap->n;
607: upTriFactor->csrMat->num_entries = a->nz;
609: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
610: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
612: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
613: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
615: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
616: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
618: /* perform the solve analysis */
619: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
620: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
621: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
622: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
624: /* assign the pointer. Is this really necessary? */
625: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
627: /* allocate space for the triangular factor information */
628: loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
630: /* Create the matrix description */
631: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
632: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
633: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
634: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
635: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
637: /* Create the solve analysis information */
638: stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
640: /* set the operation */
641: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
643: /* set the matrix */
644: loTriFactor->csrMat = new CsrMatrix;
645: loTriFactor->csrMat->num_rows = A->rmap->n;
646: loTriFactor->csrMat->num_cols = A->cmap->n;
647: loTriFactor->csrMat->num_entries = a->nz;
649: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
650: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
652: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
653: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
655: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
656: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
657: PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
659: /* perform the solve analysis */
660: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
661: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
662: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
663: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
665: /* assign the pointer. Is this really necessary? */
666: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
668: A->offloadmask = PETSC_OFFLOAD_BOTH;
669: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
670: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
671: cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
672: cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
673: } catch(char *ex) {
674: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
675: }
676: }
677: return(0);
678: }
680: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
681: {
682: PetscErrorCode ierr;
683: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
684: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
685: IS ip = a->row;
686: const PetscInt *rip;
687: PetscBool perm_identity;
688: PetscInt n = A->rmap->n;
691: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
692: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
693: cusparseTriFactors->workVector = new THRUSTARRAY(n);
694: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
696: /* lower triangular indices */
697: ISGetIndices(ip,&rip);
698: ISIdentity(ip,&perm_identity);
699: if (!perm_identity) {
700: IS iip;
701: const PetscInt *irip;
703: ISInvertPermutation(ip,PETSC_DECIDE,&iip);
704: ISGetIndices(iip,&irip);
705: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
706: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
707: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
708: cusparseTriFactors->cpermIndices->assign(irip, irip+n);
709: ISRestoreIndices(iip,&irip);
710: ISDestroy(&iip);
711: PetscLogCpuToGpu(2*n*sizeof(PetscInt));
712: }
713: ISRestoreIndices(ip,&rip);
714: return(0);
715: }
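/* Note on the nnz count above (illustrative arithmetic): ICC stores only the
   upper triangle, so for a factor with a->nz = 5 stored entries and n = 3 rows,
   the implicit symmetric factor has (5-3)*2 + 3 = 7 nonzeros: each of the two
   strictly upper entries also appears transposed in the lower factor, while the
   three diagonal entries are counted once. */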
717: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
718: {
719: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
720: IS isrow = b->row,iscol = b->col;
721: PetscBool row_identity,col_identity;
725: MatLUFactorNumeric_SeqAIJ(B,A,info);
726: B->offloadmask = PETSC_OFFLOAD_CPU;
727: /* determine which version of MatSolve needs to be used. */
728: ISIdentity(isrow,&row_identity);
729: ISIdentity(iscol,&col_identity);
730: if (row_identity && col_identity) {
731: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
732: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
733: B->ops->matsolve = NULL;
734: B->ops->matsolvetranspose = NULL;
735: } else {
736: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
737: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
738: B->ops->matsolve = NULL;
739: B->ops->matsolvetranspose = NULL;
740: }
742: /* get the triangular factors */
743: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
744: return(0);
745: }
747: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
748: {
749: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
750: IS ip = b->row;
751: PetscBool perm_identity;
755: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
756: B->offloadmask = PETSC_OFFLOAD_CPU;
757: /* determine which version of MatSolve needs to be used. */
758: ISIdentity(ip,&perm_identity);
759: if (perm_identity) {
760: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
761: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
762: B->ops->matsolve = NULL;
763: B->ops->matsolvetranspose = NULL;
764: } else {
765: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
766: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
767: B->ops->matsolve = NULL;
768: B->ops->matsolvetranspose = NULL;
769: }
771: /* get the triangular factors */
772: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
773: return(0);
774: }
776: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
777: {
778: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
779: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
780: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
781: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
782: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
783: cusparseStatus_t stat;
784: cusparseIndexBase_t indexBase;
785: cusparseMatrixType_t matrixType;
786: cusparseFillMode_t fillMode;
787: cusparseDiagType_t diagType;
791: /*********************************************/
792: /* Now the Transpose of the Lower Tri Factor */
793: /*********************************************/
795: /* allocate space for the transpose of the lower triangular factor */
796: loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
798: /* set the matrix descriptors of the lower triangular factor */
799: matrixType = cusparseGetMatType(loTriFactor->descr);
800: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
801: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
802: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
803: diagType = cusparseGetMatDiagType(loTriFactor->descr);
805: /* Create the matrix description */
806: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
807: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
808: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
809: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
810: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
812: /* Create the solve analysis information */
813: stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
815: /* set the operation */
816: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
818: /* allocate GPU space for the CSC of the lower triangular factor*/
819: loTriFactorT->csrMat = new CsrMatrix;
820: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
821: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
822: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
823: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
824: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
825: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
827: /* compute the transpose of the lower triangular factor, i.e. the CSC */
828: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
829: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
830: loTriFactor->csrMat->values->data().get(),
831: loTriFactor->csrMat->row_offsets->data().get(),
832: loTriFactor->csrMat->column_indices->data().get(),
833: loTriFactorT->csrMat->values->data().get(),
834: loTriFactorT->csrMat->column_indices->data().get(),
835: loTriFactorT->csrMat->row_offsets->data().get(),
836: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
838: /* perform the solve analysis on the transposed matrix */
839: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
840: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
841: loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
842: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
843: loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
845: /* assign the pointer. Is this really necessary? */
846: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
848: /*********************************************/
849: /* Now the Transpose of the Upper Tri Factor */
850: /*********************************************/
852: /* allocate space for the transpose of the upper triangular factor */
853: upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
855: /* set the matrix descriptors of the upper triangular factor */
856: matrixType = cusparseGetMatType(upTriFactor->descr);
857: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
858: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
859: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
860: diagType = cusparseGetMatDiagType(upTriFactor->descr);
862: /* Create the matrix description */
863: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
864: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
865: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
866: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
867: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
869: /* Create the solve analysis information */
870: stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
872: /* set the operation */
873: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
875: /* allocate GPU space for the CSC of the upper triangular factor*/
876: upTriFactorT->csrMat = new CsrMatrix;
877: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
878: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
879: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
880: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
881: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
882: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
884: /* compute the transpose of the upper triangular factor, i.e. the CSC */
885: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
886: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
887: upTriFactor->csrMat->values->data().get(),
888: upTriFactor->csrMat->row_offsets->data().get(),
889: upTriFactor->csrMat->column_indices->data().get(),
890: upTriFactorT->csrMat->values->data().get(),
891: upTriFactorT->csrMat->column_indices->data().get(),
892: upTriFactorT->csrMat->row_offsets->data().get(),
893: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
895: /* perform the solve analysis on the transposed matrix */
896: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
897: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
898: upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
899: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
900: upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
902: /* assign the pointer. Is this really necessary? */
903: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
904: return(0);
905: }
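/* Note (illustrative): converting CSR to CSC with CUSPARSE_ACTION_NUMERIC, as
   done above, yields exactly the CSR representation of the transpose. For the
   2x2 matrix A = [a b; 0 c],

     CSR(A):    row_offsets = {0,2,3}, col_indices = {0,1,1}, values = {a,b,c}
     CSR(A^T):  row_offsets = {0,1,3}, col_indices = {0,0,1}, values = {a,b,c}

   which is why the transposed factors can then be solved with
   CUSPARSE_OPERATION_NON_TRANSPOSE. */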
907: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
908: {
909: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
910: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
911: Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
912: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
913: cusparseStatus_t stat;
914: cusparseIndexBase_t indexBase;
915: cudaError_t err;
916: PetscErrorCode ierr;
919: if (!cusparsestruct->transgen || cusparsestruct->matTranspose) return(0);
920: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
921: PetscLogGpuTimeBegin();
922: /* create cusparse matrix */
923: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
924: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
925: indexBase = cusparseGetMatIndexBase(matstruct->descr);
926: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
927: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
929: /* set alpha and beta */
930: err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
931: err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
932: err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
933: err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
934: err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
935: err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
936: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
938: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
939: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
940: CsrMatrix *matrixT= new CsrMatrix;
941: matrixT->num_rows = A->cmap->n;
942: matrixT->num_cols = A->rmap->n;
943: matrixT->num_entries = a->nz;
944: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
945: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
946: matrixT->values = new THRUSTARRAY(a->nz);
948: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
949: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
950: /* compute the transpose, i.e. the CSC */
951: indexBase = cusparseGetMatIndexBase(matstruct->descr);
952: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
953: A->cmap->n, matrix->num_entries,
954: matrix->values->data().get(),
955: cusparsestruct->rowoffsets_gpu->data().get(),
956: matrix->column_indices->data().get(),
957: matrixT->values->data().get(),
958: matrixT->column_indices->data().get(),
959: matrixT->row_offsets->data().get(),
960: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
961: matstructT->mat = matrixT;
962: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
963: CsrMatrix *temp = new CsrMatrix;
964: CsrMatrix *tempT = new CsrMatrix;
966: /* First convert HYB to CSR */
967: temp->num_rows = A->rmap->n;
968: temp->num_cols = A->cmap->n;
969: temp->num_entries = a->nz;
970: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
971: temp->column_indices = new THRUSTINTARRAY32(a->nz);
972: temp->values = new THRUSTARRAY(a->nz);
974: stat = cusparse_hyb2csr(cusparsestruct->handle,
975: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
976: temp->values->data().get(),
977: temp->row_offsets->data().get(),
978: temp->column_indices->data().get());CHKERRCUSPARSE(stat);
980: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
981: tempT->num_rows = A->rmap->n;
982: tempT->num_cols = A->cmap->n;
983: tempT->num_entries = a->nz;
984: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
985: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
986: tempT->values = new THRUSTARRAY(a->nz);
988: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
989: temp->num_cols, temp->num_entries,
990: temp->values->data().get(),
991: temp->row_offsets->data().get(),
992: temp->column_indices->data().get(),
993: tempT->values->data().get(),
994: tempT->column_indices->data().get(),
995: tempT->row_offsets->data().get(),
996: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
998: /* Last, convert CSC to HYB */
999: cusparseHybMat_t hybMat;
1000: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1001: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1002: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1003: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1004: matstructT->descr, tempT->values->data().get(),
1005: tempT->row_offsets->data().get(),
1006: tempT->column_indices->data().get(),
1007: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1009: /* assign the pointer */
1010: matstructT->mat = hybMat;
1011: /* delete temporaries */
1012: if (tempT) {
1013: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1014: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1015: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1016: delete (CsrMatrix*) tempT;
1017: }
1018: if (temp) {
1019: if (temp->values) delete (THRUSTARRAY*) temp->values;
1020: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1021: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1022: delete (CsrMatrix*) temp;
1023: }
1024: }
1025: err = WaitForCUDA();CHKERRCUDA(err);
1026: PetscLogGpuTimeEnd();
1027: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1028: /* the compressed row indices are not used for matTranspose */
1029: matstructT->cprowIndices = NULL;
1030: /* assign the pointer */
1031: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1032: return(0);
1033: }
1035: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1036: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1037: {
1038: PetscInt n = xx->map->n;
1039: const PetscScalar *barray;
1040: PetscScalar *xarray;
1041: thrust::device_ptr<const PetscScalar> bGPU;
1042: thrust::device_ptr<PetscScalar> xGPU;
1043: cusparseStatus_t stat;
1044: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1045: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1046: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1047: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1048: PetscErrorCode ierr;
1049: cudaError_t cerr;
1052: /* Analyze the matrix and create the transpose ... on the fly */
1053: if (!loTriFactorT && !upTriFactorT) {
1054: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1055: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1056: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1057: }
1059: /* Get the GPU pointers */
1060: VecCUDAGetArrayWrite(xx,&xarray);
1061: VecCUDAGetArrayRead(bb,&barray);
1062: xGPU = thrust::device_pointer_cast(xarray);
1063: bGPU = thrust::device_pointer_cast(barray);
1065: PetscLogGpuTimeBegin();
1066: /* First, reorder with the row permutation */
1067: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1068: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1069: xGPU);
1071: /* First, solve U */
1072: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1073: upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1074: upTriFactorT->csrMat->values->data().get(),
1075: upTriFactorT->csrMat->row_offsets->data().get(),
1076: upTriFactorT->csrMat->column_indices->data().get(),
1077: upTriFactorT->solveInfo,
1078: xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1080: /* Then, solve L */
1081: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1082: loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1083: loTriFactorT->csrMat->values->data().get(),
1084: loTriFactorT->csrMat->row_offsets->data().get(),
1085: loTriFactorT->csrMat->column_indices->data().get(),
1086: loTriFactorT->solveInfo,
1087: tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1089: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1090: thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1091: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1092: tempGPU->begin());
1094: /* Copy the temporary to the full solution. */
1095: thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1097: /* restore */
1098: VecCUDARestoreArrayRead(bb,&barray);
1099: VecCUDARestoreArrayWrite(xx,&xarray);
1100: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1101: PetscLogGpuTimeEnd();
1102: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1103: return(0);
1104: }
1106: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1107: {
1108: const PetscScalar *barray;
1109: PetscScalar *xarray;
1110: cusparseStatus_t stat;
1111: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1112: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1113: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1114: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1115: PetscErrorCode ierr;
1116: cudaError_t cerr;
1119: /* Analyze the matrix and create the transpose ... on the fly */
1120: if (!loTriFactorT && !upTriFactorT) {
1121: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1122: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1123: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1124: }
1126: /* Get the GPU pointers */
1127: VecCUDAGetArrayWrite(xx,&xarray);
1128: VecCUDAGetArrayRead(bb,&barray);
1130: PetscLogGpuTimeBegin();
1131: /* First, solve U */
1132: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1133: upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1134: upTriFactorT->csrMat->values->data().get(),
1135: upTriFactorT->csrMat->row_offsets->data().get(),
1136: upTriFactorT->csrMat->column_indices->data().get(),
1137: upTriFactorT->solveInfo,
1138: barray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1140: /* Then, solve L */
1141: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1142: loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1143: loTriFactorT->csrMat->values->data().get(),
1144: loTriFactorT->csrMat->row_offsets->data().get(),
1145: loTriFactorT->csrMat->column_indices->data().get(),
1146: loTriFactorT->solveInfo,
1147: tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1149: /* restore */
1150: VecCUDARestoreArrayRead(bb,&barray);
1151: VecCUDARestoreArrayWrite(xx,&xarray);
1152: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1153: PetscLogGpuTimeEnd();
1154: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1155: return(0);
1156: }
1158: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1159: {
1160: const PetscScalar *barray;
1161: PetscScalar *xarray;
1162: thrust::device_ptr<const PetscScalar> bGPU;
1163: thrust::device_ptr<PetscScalar> xGPU;
1164: cusparseStatus_t stat;
1165: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1166: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1167: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1168: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1169: PetscErrorCode ierr;
1170: cudaError_t cerr;
1174: /* Get the GPU pointers */
1175: VecCUDAGetArrayWrite(xx,&xarray);
1176: VecCUDAGetArrayRead(bb,&barray);
1177: xGPU = thrust::device_pointer_cast(xarray);
1178: bGPU = thrust::device_pointer_cast(barray);
1180: PetscLogGpuTimeBegin();
1181: /* First, reorder with the row permutation */
1182: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1183: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1184: tempGPU->begin());
1186: /* Next, solve L */
1187: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1188: loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1189: loTriFactor->csrMat->values->data().get(),
1190: loTriFactor->csrMat->row_offsets->data().get(),
1191: loTriFactor->csrMat->column_indices->data().get(),
1192: loTriFactor->solveInfo,
1193: tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1195: /* Then, solve U */
1196: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1197: upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1198: upTriFactor->csrMat->values->data().get(),
1199: upTriFactor->csrMat->row_offsets->data().get(),
1200: upTriFactor->csrMat->column_indices->data().get(),
1201: upTriFactor->solveInfo,
1202: xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1204: /* Last, reorder with the column permutation */
1205: thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1206: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1207: xGPU);
1209: VecCUDARestoreArrayRead(bb,&barray);
1210: VecCUDARestoreArrayWrite(xx,&xarray);
1211: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1212: PetscLogGpuTimeEnd();
1213: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1214: return(0);
1215: }
1217: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1218: {
1219: const PetscScalar *barray;
1220: PetscScalar *xarray;
1221: cusparseStatus_t stat;
1222: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1223: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1224: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1225: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1226: PetscErrorCode ierr;
1227: cudaError_t cerr;
1230: /* Get the GPU pointers */
1231: VecCUDAGetArrayWrite(xx,&xarray);
1232: VecCUDAGetArrayRead(bb,&barray);
1234: PetscLogGpuTimeBegin();
1235: /* First, solve L */
1236: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1237: loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1238: loTriFactor->csrMat->values->data().get(),
1239: loTriFactor->csrMat->row_offsets->data().get(),
1240: loTriFactor->csrMat->column_indices->data().get(),
1241: loTriFactor->solveInfo,
1242: barray, tempGPU->data().get());CHKERRCUSPARSE(stat);
1244: /* Next, solve U */
1245: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1246: upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1247: upTriFactor->csrMat->values->data().get(),
1248: upTriFactor->csrMat->row_offsets->data().get(),
1249: upTriFactor->csrMat->column_indices->data().get(),
1250: upTriFactor->solveInfo,
1251: tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);
1253: VecCUDARestoreArrayRead(bb,&barray);
1254: VecCUDARestoreArrayWrite(xx,&xarray);
1255: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1256: PetscLogGpuTimeEnd();
1257: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1258: return(0);
1259: }
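/* Note on the flop count logged above (illustrative): with nnz counting the
   combined factor (strict L plus U including its diagonal), the L solve costs
   2 flops (multiply, add) per strictly lower entry, and the U solve costs 2
   flops per off-diagonal entry plus one division per row, which sums to
   2.0*nnz - n with n = A->cmap->n. */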
1261: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1262: {
1263: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1264: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1265: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1266: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1267: PetscErrorCode ierr;
1268: cusparseStatus_t stat;
1269: cudaError_t err;
1272: if (A->boundtocpu) return(0);
1273: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1274: if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1275: /* Copy values only */
1276: CsrMatrix *mat,*matT;
1277: mat = (CsrMatrix*)cusparsestruct->mat->mat;
1279: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1280: PetscLogGpuTimeBegin();
1281: mat->values->assign(a->a, a->a+a->nz);
1282: err = WaitForCUDA();CHKERRCUDA(err);
1283: PetscLogGpuTimeEnd();
1284: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1285: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1287: /* Update matT if it has already been built */
1288: if (cusparsestruct->matTranspose) {
1289: cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1290: matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1291: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1292: PetscLogGpuTimeBegin();
1293: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1294: A->cmap->n, mat->num_entries,
1295: mat->values->data().get(),
1296: cusparsestruct->rowoffsets_gpu->data().get(),
1297: mat->column_indices->data().get(),
1298: matT->values->data().get(),
1299: matT->column_indices->data().get(),
1300: matT->row_offsets->data().get(),
1301: CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
1302: err = WaitForCUDA();CHKERRCUDA(err);
1303: PetscLogGpuTimeEnd();
1304: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1305: }
1306: } else {
1307: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1308: PetscLogGpuTimeBegin();
1309: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1310: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1311: delete cusparsestruct->workVector;
1312: delete cusparsestruct->rowoffsets_gpu;
1313: try {
1314: if (a->compressedrow.use) {
1315: m = a->compressedrow.nrows;
1316: ii = a->compressedrow.i;
1317: ridx = a->compressedrow.rindex;
1318: } else {
1319: m = A->rmap->n;
1320: ii = a->i;
1321: ridx = NULL;
1322: }
1323: cusparsestruct->nrows = m;
1325: /* create cusparse matrix */
1326: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1327: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1328: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1329: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1331: err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
1332: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1333: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1334: err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1335: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1336: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1337: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1339:         /* Build the matrix in the requested storage format: CSR, or ELL/HYB */
1340: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1341: /* set the matrix */
1342: CsrMatrix *matrix= new CsrMatrix;
1343: matrix->num_rows = m;
1344: matrix->num_cols = A->cmap->n;
1345: matrix->num_entries = a->nz;
1346: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1347: matrix->row_offsets->assign(ii, ii + m+1);
1349: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1350: matrix->column_indices->assign(a->j, a->j+a->nz);
1352: matrix->values = new THRUSTARRAY(a->nz);
1353: matrix->values->assign(a->a, a->a+a->nz);
1355: /* assign the pointer */
1356: matstruct->mat = matrix;
1358: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1359: CsrMatrix *matrix= new CsrMatrix;
1360: matrix->num_rows = m;
1361: matrix->num_cols = A->cmap->n;
1362: matrix->num_entries = a->nz;
1363: matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1364: matrix->row_offsets->assign(ii, ii + m+1);
1366: matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1367: matrix->column_indices->assign(a->j, a->j+a->nz);
1369: matrix->values = new THRUSTARRAY(a->nz);
1370: matrix->values->assign(a->a, a->a+a->nz);
1372: cusparseHybMat_t hybMat;
1373: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1374: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1375: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1376: stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1377: matstruct->descr, matrix->values->data().get(),
1378: matrix->row_offsets->data().get(),
1379: matrix->column_indices->data().get(),
1380: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1381: /* assign the pointer */
1382: matstruct->mat = hybMat;
1384: if (matrix) {
1385: if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1386: if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1387: if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1388: delete (CsrMatrix*)matrix;
1389: }
1390: }
1392: /* assign the compressed row indices */
1393: if (a->compressedrow.use) {
1394: cusparsestruct->workVector = new THRUSTARRAY(m);
1395: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1396: matstruct->cprowIndices->assign(ridx,ridx+m);
1397: tmp = m;
1398: } else {
1399: cusparsestruct->workVector = NULL;
1400: matstruct->cprowIndices = NULL;
1401: tmp = 0;
1402: }
1403: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1405: /* assign the pointer */
1406: cusparsestruct->mat = matstruct;
1407: } catch(char *ex) {
1408: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1409: }
1410: err = WaitForCUDA();CHKERRCUDA(err);
1411: PetscLogGpuTimeEnd();
1412: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1413: cusparsestruct->nonzerostate = A->nonzerostate;
1414: }
1415: A->offloadmask = PETSC_OFFLOAD_BOTH;
1416: }
1417: return(0);
1418: }
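/* A self-contained sketch of the host-to-device CSR copy pattern used above,
   assuming THRUSTARRAY and THRUSTINTARRAY32 are thrust::device_vector typedefs,
   as their use in this file suggests. Shown with plain Thrust types. */
#if 0
#include <thrust/device_vector.h>

static void CopyCsrToGpu(int m,int nz,const int *i,const int *j,const double *a)
{
  thrust::device_vector<int>    row_offsets(m+1);
  thrust::device_vector<int>    column_indices(nz);
  thrust::device_vector<double> values(nz);

  row_offsets.assign(i,i+m+1);    /* each assign() is one host-to-device copy */
  column_indices.assign(j,j+nz);
  values.assign(a,a+nz);          /* on value-only reassembly, just this copy is redone */
}
#endif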
1420: struct VecCUDAPlusEquals
1421: {
1422: template <typename Tuple>
1423: __host__ __device__
1424: void operator()(Tuple t)
1425: {
1426: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1427: }
1428: };
1430: struct VecCUDAEqualsReverse
1431: {
1432: template <typename Tuple>
1433: __host__ __device__
1434: void operator()(Tuple t)
1435: {
1436: thrust::get<0>(t) = thrust::get<1>(t);
1437: }
1438: };
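/* A minimal sketch of how the functors above are driven in MatMultAddKernel below:
   a zip iterator pairs each work-vector entry with the full-vector entry selected by
   a permutation iterator over the compressed-row indices, and VecCUDAPlusEquals
   performs the scatter-add z[rows[k]] += work[k]. Illustrative only; it relies on
   the Thrust headers this file already includes. */
#if 0
static void ScatterAdd(thrust::device_vector<PetscScalar> &work,
                       thrust::device_vector<PetscInt>    &rows,
                       PetscScalar                        *z) /* raw device pointer */
{
  thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(z);
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(work.begin(), thrust::make_permutation_iterator(zptr, rows.begin()))),
                   thrust::make_zip_iterator(thrust::make_tuple(work.begin(), thrust::make_permutation_iterator(zptr, rows.begin()))) + rows.size(),
                   VecCUDAPlusEquals());
}
#endif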
1440: typedef struct {
1441: PetscBool cisdense;
1442: PetscScalar *Bt;
1443: Mat X;
1444: } MatMatCusparse;
1446: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1447: {
1449: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1450: cudaError_t cerr;
1453: cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1454: MatDestroy(&mmdata->X);
1455: PetscFree(data);
1456: return(0);
1457: }
1459: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1461: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1462: {
1463: Mat_Product *product = C->product;
1464: Mat A,B;
1465: PetscInt m,n,k,blda,clda;
1466: PetscBool flg,biscuda;
1467: Mat_SeqAIJCUSPARSE *cusp;
1468: cusparseStatus_t stat;
1469: cusparseOperation_t opA;
1470: cusparseMatDescr_t matA;
1471: const PetscScalar *barray;
1472: PetscScalar *carray;
1473: PetscErrorCode ierr;
1474: MatMatCusparse *mmdata;
1475: Mat_SeqAIJCUSPARSEMultStruct *mat;
1476: CsrMatrix *csrmat;
1477: cudaError_t cuer;
1480: MatCheckProduct(C,1);
1481: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1482: mmdata = (MatMatCusparse*)product->data;
1483: A = product->A;
1484: B = product->B;
1485: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1486: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1487:   /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
1488:      Rather than silently accepting a wrong answer, we raise an error */
1489: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1490: MatSeqAIJCUSPARSECopyToGPU(A);
1491: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1492: switch (product->type) {
1493: case MATPRODUCT_AB:
1494: case MATPRODUCT_PtAP:
1495: mat = cusp->mat;
1496: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1497: m = A->rmap->n;
1498: k = A->cmap->n;
1499: n = B->cmap->n;
1500: break;
1501: case MATPRODUCT_AtB:
1502: if (!cusp->transgen) {
1503: mat = cusp->mat;
1504: opA = CUSPARSE_OPERATION_TRANSPOSE;
1505: } else {
1506: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1507: mat = cusp->matTranspose;
1508: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1509: }
1510: m = A->cmap->n;
1511: k = A->rmap->n;
1512: n = B->cmap->n;
1513: break;
1514: case MATPRODUCT_ABt:
1515: case MATPRODUCT_RARt:
1516: mat = cusp->mat;
1517: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1518: m = A->rmap->n;
1519: k = B->cmap->n;
1520: n = B->rmap->n;
1521: break;
1522: default:
1523: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1524: }
1525: if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1526: matA = mat->descr;
1527: csrmat = (CsrMatrix*)mat->mat;
1528: /* if the user passed a CPU matrix, copy the data to the GPU */
1529: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
1530: if (!biscuda) {
1531: MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1532: }
1533: MatDenseCUDAGetArrayRead(B,&barray);
1534: MatDenseGetLDA(B,&blda);
1535: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1536: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
1537: MatDenseGetLDA(mmdata->X,&clda);
1538: } else {
1539: MatDenseCUDAGetArrayWrite(C,&carray);
1540: MatDenseGetLDA(C,&clda);
1541: }
1543: PetscLogGpuTimeBegin();
1545: /* cusparse spmm does not support transpose on B */
1546: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1547: cublasHandle_t cublasv2handle;
1548: cublasStatus_t cerr;
1550: PetscCUBLASGetHandle(&cublasv2handle);
1551: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1552: B->cmap->n,B->rmap->n,
1553: &PETSC_CUSPARSE_ONE ,barray,blda,
1554: &PETSC_CUSPARSE_ZERO,barray,blda,
1555: mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1556: blda = B->cmap->n;
1557: }
1559: /* perform the MatMat operation */
1560: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1561: csrmat->num_entries,mat->alpha,matA,
1562: csrmat->values->data().get(),
1563: csrmat->row_offsets->data().get(),
1564: csrmat->column_indices->data().get(),
1565: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1566: carray,clda);CHKERRCUSPARSE(stat);
1567: cuer = WaitForCUDA();CHKERRCUDA(cuer);
1568: PetscLogGpuTimeEnd();
1569: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
1570: MatDenseCUDARestoreArrayRead(B,&barray);
1571: if (product->type == MATPRODUCT_RARt) {
1572: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1573: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
1574: } else if (product->type == MATPRODUCT_PtAP) {
1575: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1576: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
1577: } else {
1578: MatDenseCUDARestoreArrayWrite(C,&carray);
1579: }
1580: if (mmdata->cisdense) {
1581: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
1582: }
1583: if (!biscuda) {
1584: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1585: }
1586: return(0);
1587: }
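/* A sketch of the cublasXgeam workaround used above: the csrmm routine applies op()
   only to the sparse operand, so B^T is formed explicitly via C = alpha*op(A) + beta*op(B)
   with alpha=1, beta=0. Shown for double precision with the default
   CUBLAS_POINTER_MODE_HOST (the handles in this file use device pointer mode instead). */
#if 0
static void DenseTranspose(cublasHandle_t handle,int rows,int cols,const double *B,int ldb,double *Bt)
{
  const double one = 1.0, zero = 0.0;
  /* Bt is cols x rows with leading dimension cols */
  cublasDgeam(handle,CUBLAS_OP_T,CUBLAS_OP_T,cols,rows,&one,B,ldb,&zero,B,ldb,Bt,cols);
}
#endif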
1589: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1590: {
1591: Mat_Product *product = C->product;
1592: Mat A,B;
1593: PetscInt m,n;
1594: PetscBool cisdense,flg;
1595: PetscErrorCode ierr;
1596: MatMatCusparse *mmdata;
1597: Mat_SeqAIJCUSPARSE *cusp;
1600: MatCheckProduct(C,1);
1601: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1602: A = product->A;
1603: B = product->B;
1604: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1605: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1606: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1607: if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1608: switch (product->type) {
1609: case MATPRODUCT_AB:
1610: m = A->rmap->n;
1611: n = B->cmap->n;
1612: break;
1613: case MATPRODUCT_AtB:
1614: m = A->cmap->n;
1615: n = B->cmap->n;
1616: break;
1617: case MATPRODUCT_ABt:
1618: m = A->rmap->n;
1619: n = B->rmap->n;
1620: break;
1621: case MATPRODUCT_PtAP:
1622: m = B->cmap->n;
1623: n = B->cmap->n;
1624: break;
1625: case MATPRODUCT_RARt:
1626: m = B->rmap->n;
1627: n = B->rmap->n;
1628: break;
1629: default:
1630: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1631: }
1632: MatSetSizes(C,m,n,m,n);
1633:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
1634: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
1635: MatSetType(C,MATSEQDENSECUDA);
1637: /* product data */
1638: PetscNew(&mmdata);
1639: mmdata->cisdense = cisdense;
1640: /* cusparse spmm does not support transpose on B */
1641: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1642: cudaError_t cerr;
1644: cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1645: }
1646: /* for these products we need intermediate storage */
1647: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1648: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
1649: MatSetType(mmdata->X,MATSEQDENSECUDA);
1650: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1651: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
1652: } else {
1653: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
1654: }
1655: }
1656: C->product->data = mmdata;
1657: C->product->destroy = MatDestroy_MatMatCusparse;
1659: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1660: return(0);
1661: }
1663: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1665: /* handles dense B */
1666: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
1667: {
1668: Mat_Product *product = C->product;
1672: MatCheckProduct(C,1);
1673: if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1674: if (product->A->boundtocpu) {
1675: MatProductSetFromOptions_SeqAIJ_SeqDense(C);
1676: return(0);
1677: }
1678: switch (product->type) {
1679: case MATPRODUCT_AB:
1680: case MATPRODUCT_AtB:
1681: case MATPRODUCT_ABt:
1682: case MATPRODUCT_PtAP:
1683: case MATPRODUCT_RARt:
1684: C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
1685: default:
1686: break;
1687: }
1688: return(0);
1689: }
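/* A minimal usage sketch of the MatProduct phases wired up above, for a
   sparse-times-dense product; illustrative only, error checking omitted. */
#if 0
Mat C;
MatProductCreate(A,B,NULL,&C);      /* A is MATSEQAIJCUSPARSE, B is dense */
MatProductSetType(C,MATPRODUCT_AB);
MatProductSetFromOptions(C);        /* installs the symbolic routine above */
MatProductSymbolic(C);              /* sizes C and allocates any Bt/X workspace */
MatProductNumeric(C);               /* runs cusparse_csr_spmm */
#endif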
1691: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1692: {
1696: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
1697: return(0);
1698: }
1700: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
1701: {
1705: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
1706: return(0);
1707: }
1709: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1710: {
1714: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
1715: return(0);
1716: }
1718: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1719: {
1723: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
1724: return(0);
1725: }
1727: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1728: {
1732: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
1733: return(0);
1734: }
1736: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
1737: {
1738: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1739: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1740: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1741: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
1742: PetscErrorCode ierr;
1743: cudaError_t cerr;
1744: cusparseStatus_t stat;
1745: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1746: PetscBool compressed;
1749:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian without transpose is not supported");
1750: if (!a->nonzerorowcnt) {
1751: if (!yy) {
1752: VecSet_SeqCUDA(zz,0);
1753: }
1754: return(0);
1755: }
1756: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1757: MatSeqAIJCUSPARSECopyToGPU(A);
1758: if (!trans) {
1759: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1760: } else {
1761: if (herm || !cusparsestruct->transgen) {
1762: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
1763: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1764: } else {
1765: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1766: if (!matstruct) {
1767: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1768: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1769: }
1770: }
1771: }
1772: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
1773: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
1775: try {
1776: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
1777: if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
1778: else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */
1779: PetscLogGpuTimeBegin();
1780: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1781: xptr = xarray;
1782: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
1783: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
1784: } else {
1785: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
1786: dptr = zarray;
1787: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
1789: if (compressed) {
1790: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
1792: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
1793: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1794: VecCUDAEqualsReverse());
1795: }
1796: }
1798:     /* csr_spmv computes y = alpha*op(A)*x + beta*y */
1799: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1800: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1801: stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
1802: mat->num_rows, mat->num_cols,
1803: mat->num_entries, matstruct->alpha, matstruct->descr,
1804: mat->values->data().get(), mat->row_offsets->data().get(),
1805: mat->column_indices->data().get(), xptr, beta,
1806: dptr);CHKERRCUSPARSE(stat);
1807: } else {
1808: if (cusparsestruct->nrows) {
1809: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1810: stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
1811: matstruct->alpha, matstruct->descr, hybMat,
1812: xptr, beta,
1813: dptr);CHKERRCUSPARSE(stat);
1814: }
1815: }
1816: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1817: PetscLogGpuTimeEnd();
1819: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
1820: if (yy) { /* MatMultAdd: zz = A*xx + yy */
1821: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
1822: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
1823: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
1824: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
1825: }
1826: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
1827: VecSet_SeqCUDA(zz,0);
1828: }
1830: /* ScatterAdd the result from work vector into the full vector when A is compressed */
1831: if (compressed) {
1832: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
1834: PetscLogGpuTimeBegin();
1835: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1836: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
1837: VecCUDAPlusEquals());
1838: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1839: PetscLogGpuTimeEnd();
1840: }
1841: } else {
1842: if (yy && yy != zz) {
1843: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
1844: }
1845: }
1846: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
1847: if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
1848: else {VecCUDARestoreArrayWrite(zz,&zarray);}
1849: } catch(char *ex) {
1850: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1851: }
1852: if (yy) {
1853: PetscLogGpuFlops(2.0*a->nz);
1854: } else {
1855: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
1856: }
1857: return(0);
1858: }
1860: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1861: {
1865: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
1866: return(0);
1867: }
1869: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1870: {
1874: MatAssemblyEnd_SeqAIJ(A,mode);
1875: if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
1876: if (A->factortype == MAT_FACTOR_NONE) {
1877: MatSeqAIJCUSPARSECopyToGPU(A);
1878: }
1879: return(0);
1880: }
1882: /* --------------------------------------------------------------------------------*/
1883: /*@
1884: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1885:    (the default sequential PETSc format). This matrix will ultimately be pushed down
1886:    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
1887: assembly performance the user should preallocate the matrix storage by setting
1888: the parameter nz (or the array nnz). By setting these parameters accurately,
1889: performance during matrix assembly can be increased by more than a factor of 50.
1891: Collective
1893: Input Parameters:
1894: + comm - MPI communicator, set to PETSC_COMM_SELF
1895: . m - number of rows
1896: . n - number of columns
1897: . nz - number of nonzeros per row (same for all rows)
1898: - nnz - array containing the number of nonzeros in the various rows
1899: (possibly different for each row) or NULL
1901: Output Parameter:
1902: . A - the matrix
1904: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1905:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1906: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1908: Notes:
1909: If nnz is given then nz is ignored
1911: The AIJ format (also called the Yale sparse matrix format or
1912:    compressed row storage) is fully compatible with standard Fortran 77
1913: storage. That is, the stored row and column indices can begin at
1914: either one (as in Fortran) or zero. See the users' manual for details.
1916: Specify the preallocated storage with either nz or nnz (not both).
1917: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1918: allocation. For large problems you MUST preallocate memory or you
1919: will get TERRIBLE performance, see the users' manual chapter on matrices.
1921: By default, this format uses inodes (identical nodes) when possible, to
1922: improve numerical efficiency of matrix-vector products and solves. We
1923: search for consecutive rows with the same nonzero structure, thereby
1924: reusing matrix information to achieve increased efficiency.
1926: Level: intermediate
1928: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1929: @*/
1930: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1931: {
1935: MatCreate(comm,A);
1936: MatSetSizes(*A,m,n,m,n);
1937: MatSetType(*A,MATSEQAIJCUSPARSE);
1938: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1939: return(0);
1940: }
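/* A short sketch of the recommended creation paradigm from the manpage above,
   instead of calling MatCreateSeqAIJCUSPARSE() directly; illustrative only. */
#if 0
Mat A;
MatCreate(PETSC_COMM_SELF,&A);
MatSetSizes(A,m,n,m,n);
MatSetType(A,MATSEQAIJCUSPARSE);      /* or -mat_type aijcusparse at runtime */
MatSeqAIJSetPreallocation(A,nz,NULL); /* accurate nz (or nnz) speeds up assembly */
#endif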
1942: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1943: {
1947: if (A->factortype == MAT_FACTOR_NONE) {
1948: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
1949: } else {
1950: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
1951: }
1952: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
1953: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
1954: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
1955: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
1956: MatDestroy_SeqAIJ(A);
1957: return(0);
1958: }
1960: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1961: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
1962: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1963: {
1967: MatDuplicate_SeqAIJ(A,cpvalues,B);
1968: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
1969: return(0);
1970: }
1972: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
1973: {
1977: if (A->factortype != MAT_FACTOR_NONE) return(0);
1978:   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
1979: If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
1980: Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
1981:      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
1982: can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
1983: if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
1984: /* TODO: add support for this? */
1985: if (flg) {
1986: A->ops->mult = MatMult_SeqAIJ;
1987: A->ops->multadd = MatMultAdd_SeqAIJ;
1988: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
1989: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
1990: A->ops->multhermitiantranspose = NULL;
1991: A->ops->multhermitiantransposeadd = NULL;
1992: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
1993: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
1994: } else {
1995: A->ops->mult = MatMult_SeqAIJCUSPARSE;
1996: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
1997: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
1998: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1999: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2000: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2001: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2002: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2003: }
2004: A->boundtocpu = flg;
2005: return(0);
2006: }
2008: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2009: {
2010: PetscErrorCode ierr;
2011: cusparseStatus_t stat;
2012: Mat B;
2015: if (reuse == MAT_INITIAL_MATRIX) {
2016: MatDuplicate(A,MAT_COPY_VALUES,newmat);
2017: } else if (reuse == MAT_REUSE_MATRIX) {
2018: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
2019: }
2020: B = *newmat;
2022: PetscFree(B->defaultvectype);
2023: PetscStrallocpy(VECCUDA,&B->defaultvectype);
2025: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2026: if (B->factortype == MAT_FACTOR_NONE) {
2027: Mat_SeqAIJCUSPARSE *spptr;
2029: PetscNew(&spptr);
2030: spptr->format = MAT_CUSPARSE_CSR;
2031: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2032: B->spptr = spptr;
2033: } else {
2034: Mat_SeqAIJCUSPARSETriFactors *spptr;
2036: PetscNew(&spptr);
2037: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2038: B->spptr = spptr;
2039: }
2040: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2041: }
2042: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
2043: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
2044: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2045: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
2046: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
2048: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
2049: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
2050: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
2051: return(0);
2052: }
2054: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2055: {
2059: PetscCUDAInitializeCheck();
2060: MatCreate_SeqAIJ(B);
2061: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
2062: return(0);
2063: }
2065: /*MC
2066: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2068:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
2069:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2070:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
2072: Options Database Keys:
2073: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2074: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2075: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2077: Level: beginner
2079: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2080: M*/
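/* An illustrative invocation (hypothetical executable name) exercising the
   options database keys documented above:
     ./app -mat_type aijcusparse -mat_cusparse_storage_format hyb
   which makes MatMult take the cusparse_hyb_spmv path instead of cusparse_csr_spmv. */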
2082: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2085: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2086: {
2090: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
2091: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
2092: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
2093: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
2094: return(0);
2095: }
2097: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2098: {
2099: PetscErrorCode ierr;
2100: cusparseStatus_t stat;
2101: cusparseHandle_t handle;
2104: if (*cusparsestruct) {
2105: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
2106: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
2107: delete (*cusparsestruct)->workVector;
2108: delete (*cusparsestruct)->rowoffsets_gpu;
2109:     if ((handle = (*cusparsestruct)->handle)) {
2110: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2111: }
2112: PetscFree(*cusparsestruct);
2113: }
2114: return(0);
2115: }
2117: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2118: {
2120: if (*mat) {
2121: delete (*mat)->values;
2122: delete (*mat)->column_indices;
2123: delete (*mat)->row_offsets;
2124: delete *mat;
2125: *mat = 0;
2126: }
2127: return(0);
2128: }
2130: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2131: {
2132: cusparseStatus_t stat;
2133: PetscErrorCode ierr;
2136: if (*trifactor) {
2137: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2138: if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2139: CsrMatrix_Destroy(&(*trifactor)->csrMat);
2140: delete *trifactor;
2141: *trifactor = 0;
2142: }
2143: return(0);
2144: }
2146: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
2147: {
2148: CsrMatrix *mat;
2149: cusparseStatus_t stat;
2150: cudaError_t err;
2153: if (*matstruct) {
2154: if ((*matstruct)->mat) {
2155: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2156: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
2157: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2158: } else {
2159: mat = (CsrMatrix*)(*matstruct)->mat;
2160: CsrMatrix_Destroy(&mat);
2161: }
2162: }
2163: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
2164: delete (*matstruct)->cprowIndices;
2165: if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
2166: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
2167: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
2168: delete *matstruct;
2169: *matstruct = 0;
2170: }
2171: return(0);
2172: }
2174: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2175: {
2179: if (*trifactors) {
2180: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
2181: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
2182: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
2183: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
2184: delete (*trifactors)->rpermIndices;
2185: delete (*trifactors)->cpermIndices;
2186: delete (*trifactors)->workVector;
2187: (*trifactors)->rpermIndices = 0;
2188: (*trifactors)->cpermIndices = 0;
2189: (*trifactors)->workVector = 0;
2190: }
2191: return(0);
2192: }
2194: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2195: {
2196: PetscErrorCode ierr;
2197: cusparseHandle_t handle;
2198: cusparseStatus_t stat;
2201: if (*trifactors) {
2202: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
2203:     if ((handle = (*trifactors)->handle)) {
2204: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2205: }
2206: PetscFree(*trifactors);
2207: }
2208: return(0);
2209: }