Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #if PETSC_CPP_VERSION >= 14
16: #define PETSC_HAVE_THRUST_ASYNC 1
17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
18: #include <thrust/async/for_each.h>
19: #endif
20: #include <thrust/iterator/constant_iterator.h>
21: #include <thrust/remove.h>
22: #include <thrust/sort.h>
23: #include <thrust/unique.h>
25: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30: typedef enum {
31: CUSPARSE_MV_ALG_DEFAULT = 0,
32: CUSPARSE_COOMV_ALG = 1,
33: CUSPARSE_CSRMV_ALG1 = 2,
34: CUSPARSE_CSRMV_ALG2 = 3
35: } cusparseSpMVAlg_t;
37: typedef enum {
38: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
40: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
41: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
42: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
43: CUSPARSE_SPMM_ALG_DEFAULT = 0,
44: CUSPARSE_SPMM_COO_ALG1 = 1,
45: CUSPARSE_SPMM_COO_ALG2 = 2,
46: CUSPARSE_SPMM_COO_ALG3 = 3,
47: CUSPARSE_SPMM_COO_ALG4 = 5,
48: CUSPARSE_SPMM_CSR_ALG1 = 4,
49: CUSPARSE_SPMM_CSR_ALG2 = 6,
50: } cusparseSpMMAlg_t;
52: typedef enum {
53: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
54: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
55: } cusparseCsr2CscAlg_t;
56: */
57: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
58: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
59: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60: #endif
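/* Editor's note (not part of the original source): each of these string arrays follows the PETSc
   convention for PetscOptionsEnum(): the candidate value names come first, followed by the enum
   type name, a common prefix string, and a terminating 0. The index of the string the user picks
   is cast back to the corresponding cusparse enum value, which is why the names must stay in
   0-based value order as noted above. For example, "-mat_cusparse_spmv_alg CSRMV_ALG1" selects
   index 2 of MatCUSPARSESpMVAlgorithms[], matching CUSPARSE_CSRMV_ALG1 = 2 in the commented enum
   copy above; the PetscCheck() calls in MatSetFromOptions_SeqAIJCUSPARSE() below guard against
   this correspondence drifting between cuSPARSE releases. */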
62: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
68: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
70: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
71: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
73: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
74: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
75: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
76: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
77: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
78: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
79: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
80: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
85: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
87: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
88: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
89: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
91: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
92: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
94: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
95: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
96: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
99: {
100: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
102: PetscFunctionBegin;
103: switch (op) {
104: case MAT_CUSPARSE_MULT:
105: cusparsestruct->format = format;
106: break;
107: case MAT_CUSPARSE_ALL:
108: cusparsestruct->format = format;
109: break;
110: default:
111: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
112: }
113: PetscFunctionReturn(PETSC_SUCCESS);
114: }
116: /*@
117: MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
118: operation. Only the `MatMult()` operation can use different GPU storage formats.
120: Not Collective
122: Input Parameters:
123: + A - Matrix of type `MATSEQAIJCUSPARSE`
124: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
125: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
126: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`).
128: Level: intermediate
130: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
131: @*/
132: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
133: {
134: PetscFunctionBegin;
136: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
137: PetscFunctionReturn(PETSC_SUCCESS);
138: }
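/* Editor's sketch (not part of the original source): a minimal, hypothetical use of
   MatCUSPARSESetFormat(), assuming A has already been created, sized, and assembled elsewhere
   in a PETSc program:

     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL)); // ELL storage for MatMult() only
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, MAT_CUSPARSE_CSR));  // CSR storage for all supported operations

   The same choice can be made at runtime with -mat_cusparse_mult_storage_format or
   -mat_cusparse_storage_format, as parsed in MatSetFromOptions_SeqAIJCUSPARSE() below. */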
140: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
141: {
142: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
144: PetscFunctionBegin;
145: cusparsestruct->use_cpu_solve = use_cpu;
146: PetscFunctionReturn(PETSC_SUCCESS);
147: }
149: /*@
150: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.
152: Input Parameters:
153: + A - Matrix of type `MATSEQAIJCUSPARSE`
154: - use_cpu - set flag for using the built-in CPU `MatSolve()`
156: Level: intermediate
158: Note:
159: The cuSPARSE LU solver currently computes the factors with the built-in CPU method
160: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
161: Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).
163: .seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
164: @*/
165: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
166: {
167: PetscFunctionBegin;
169: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
170: PetscFunctionReturn(PETSC_SUCCESS);
171: }
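/* Editor's sketch (not part of the original source): a hypothetical caller that keeps the
   triangular solves on the CPU while the rest of the computation stays on the GPU, assuming A
   is a MATSEQAIJCUSPARSE factor matrix obtained elsewhere (e.g. through PCLU/PCILU):

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));

   Equivalently, pass -mat_cusparse_use_cpu_solve on the command line; see
   MatSetFromOptions_SeqAIJCUSPARSE() below. */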
173: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
174: {
175: PetscFunctionBegin;
176: switch (op) {
177: case MAT_FORM_EXPLICIT_TRANSPOSE:
178: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
179: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
180: A->form_explicit_transpose = flg;
181: break;
182: default:
183: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
184: break;
185: }
186: PetscFunctionReturn(PETSC_SUCCESS);
187: }
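/* Editor's sketch (not part of the original source): MAT_FORM_EXPLICIT_TRANSPOSE is the only
   option this type intercepts; everything else falls through to MatSetOption_SeqAIJ(). A
   hypothetical caller that requests an explicit GPU transpose (useful when MatMultTranspose()
   is invoked repeatedly) would do:

     PetscCall(MatSetOption(A, MAT_FORM_EXPLICIT_TRANSPOSE, PETSC_TRUE));

   Turning the option back off destroys any transpose already built, as handled above. */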
189: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
191: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
192: {
193: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
194: IS isrow = b->row, iscol = b->col;
195: PetscBool row_identity, col_identity;
196: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;
198: PetscFunctionBegin;
199: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
200: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
201: B->offloadmask = PETSC_OFFLOAD_CPU;
202: /* determine which version of MatSolve needs to be used. */
203: PetscCall(ISIdentity(isrow, &row_identity));
204: PetscCall(ISIdentity(iscol, &col_identity));
206: if (!cusparsestruct->use_cpu_solve) {
207: if (row_identity && col_identity) {
208: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
209: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
210: } else {
211: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
212: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
213: }
214: }
215: B->ops->matsolve = NULL;
216: B->ops->matsolvetranspose = NULL;
218: /* get the triangular factors */
219: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
220: PetscFunctionReturn(PETSC_SUCCESS);
221: }
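/* Editor's note (not part of the original source): the numeric LU factorization itself is done
   on the CPU by MatLUFactorNumeric_SeqAIJ(); this wrapper only selects the solve kernels and,
   unless use_cpu_solve is set, pushes the triangular factors to the GPU. A hypothetical way to
   reach this path from user code is to request the cuSPARSE solver type for the factorization,
   assuming a KSP/PC set up elsewhere:

     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));

   or, at runtime, -pc_type ilu -pc_factor_mat_solver_type cusparse. */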
223: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
224: {
225: MatCUSPARSEStorageFormat format;
226: PetscBool flg;
227: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
229: PetscFunctionBegin;
230: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
231: if (A->factortype == MAT_FACTOR_NONE) {
232: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
233: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
235: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
236: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
237: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
238: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
239: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
240: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
241: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
242: #if CUSPARSE_VERSION > 11301
243: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
244: #else
245: PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
246: #endif
247: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
248: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
250: PetscCall(
251: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
252: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
253: #endif
254: }
255: PetscOptionsHeadEnd();
256: PetscFunctionReturn(PETSC_SUCCESS);
257: }
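/* Editor's note (not part of the original source): the options above are consumed during
   MatSetFromOptions(). A hypothetical command line combining them might look like

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format hyb -mat_cusparse_use_cpu_solve

   where the storage-format names come from MatCUSPARSEStorageFormats[] ("csr", "ell", "hyb")
   and the algorithm options (-mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg,
   -mat_cusparse_csr2csc_alg) are only available when PETSC_PKG_CUDA_VERSION_GE(11, 0, 0). */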
259: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
260: {
261: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
262: PetscInt n = A->rmap->n;
263: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
264: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
265: const PetscInt *ai = a->i, *aj = a->j, *vi;
266: const MatScalar *aa = a->a, *v;
267: PetscInt *AiLo, *AjLo;
268: PetscInt i, nz, nzLower, offset, rowOffset;
270: PetscFunctionBegin;
271: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
272: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
273: try {
274: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
275: nzLower = n + ai[n] - ai[1];
276: if (!loTriFactor) {
277: PetscScalar *AALo;
279: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
281: /* Allocate Space for the lower triangular matrix */
282: PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
283: PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
285: /* Fill the lower triangular matrix */
286: AiLo[0] = (PetscInt)0;
287: AiLo[n] = nzLower;
288: AjLo[0] = (PetscInt)0;
289: AALo[0] = (MatScalar)1.0;
290: v = aa;
291: vi = aj;
292: offset = 1;
293: rowOffset = 1;
294: for (i = 1; i < n; i++) {
295: nz = ai[i + 1] - ai[i];
296: /* additional 1 for the term on the diagonal */
297: AiLo[i] = rowOffset;
298: rowOffset += nz + 1;
300: PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
301: PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
303: offset += nz;
304: AjLo[offset] = (PetscInt)i;
305: AALo[offset] = (MatScalar)1.0;
306: offset += 1;
308: v += nz;
309: vi += nz;
310: }
312: /* allocate space for the triangular factor information */
313: PetscCall(PetscNew(&loTriFactor));
314: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
315: /* Create the matrix description */
316: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
317: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
318: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
319: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
320: #else
321: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
322: #endif
323: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
324: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
326: /* set the operation */
327: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
329: /* set the matrix */
330: loTriFactor->csrMat = new CsrMatrix;
331: loTriFactor->csrMat->num_rows = n;
332: loTriFactor->csrMat->num_cols = n;
333: loTriFactor->csrMat->num_entries = nzLower;
335: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
336: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
338: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
339: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
341: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
342: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
344: /* Create the solve analysis information */
345: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
346: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
347: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
348: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
349: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
350: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
351: #endif
353: /* perform the solve analysis */
354: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
355: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
356: PetscCallCUDA(WaitForCUDA());
357: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
359: /* assign the pointer */
360: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
361: loTriFactor->AA_h = AALo;
362: PetscCallCUDA(cudaFreeHost(AiLo));
363: PetscCallCUDA(cudaFreeHost(AjLo));
364: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
365: } else { /* update values only */
366: if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
367: /* Fill the lower triangular matrix */
368: loTriFactor->AA_h[0] = 1.0;
369: v = aa;
370: vi = aj;
371: offset = 1;
372: for (i = 1; i < n; i++) {
373: nz = ai[i + 1] - ai[i];
374: PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
375: offset += nz;
376: loTriFactor->AA_h[offset] = 1.0;
377: offset += 1;
378: v += nz;
379: }
380: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
381: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
382: }
383: } catch (char *ex) {
384: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
385: }
386: }
387: PetscFunctionReturn(PETSC_SUCCESS);
388: }
390: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
391: {
392: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
393: PetscInt n = A->rmap->n;
394: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
395: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
396: const PetscInt *aj = a->j, *adiag = a->diag, *vi;
397: const MatScalar *aa = a->a, *v;
398: PetscInt *AiUp, *AjUp;
399: PetscInt i, nz, nzUpper, offset;
401: PetscFunctionBegin;
402: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
403: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
404: try {
405: /* next, figure out the number of nonzeros in the upper triangular matrix. */
406: nzUpper = adiag[0] - adiag[n];
407: if (!upTriFactor) {
408: PetscScalar *AAUp;
410: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
412: /* Allocate Space for the upper triangular matrix */
413: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
414: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
416: /* Fill the upper triangular matrix */
417: AiUp[0] = (PetscInt)0;
418: AiUp[n] = nzUpper;
419: offset = nzUpper;
420: for (i = n - 1; i >= 0; i--) {
421: v = aa + adiag[i + 1] + 1;
422: vi = aj + adiag[i + 1] + 1;
424: /* number of elements NOT on the diagonal */
425: nz = adiag[i] - adiag[i + 1] - 1;
427: /* decrement the offset */
428: offset -= (nz + 1);
430: /* first, set the diagonal elements */
431: AjUp[offset] = (PetscInt)i;
432: AAUp[offset] = (MatScalar)1. / v[nz];
433: AiUp[i] = AiUp[i + 1] - (nz + 1);
435: PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
436: PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
437: }
439: /* allocate space for the triangular factor information */
440: PetscCall(PetscNew(&upTriFactor));
441: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
443: /* Create the matrix description */
444: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
445: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
446: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
447: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
448: #else
449: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
450: #endif
451: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
452: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
454: /* set the operation */
455: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
457: /* set the matrix */
458: upTriFactor->csrMat = new CsrMatrix;
459: upTriFactor->csrMat->num_rows = n;
460: upTriFactor->csrMat->num_cols = n;
461: upTriFactor->csrMat->num_entries = nzUpper;
463: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
464: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
466: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
467: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
469: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
470: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
472: /* Create the solve analysis information */
473: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
474: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
475: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
476: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
477: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
478: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
479: #endif
481: /* perform the solve analysis */
482: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
483: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
485: PetscCallCUDA(WaitForCUDA());
486: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
488: /* assign the pointer */
489: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
490: upTriFactor->AA_h = AAUp;
491: PetscCallCUDA(cudaFreeHost(AiUp));
492: PetscCallCUDA(cudaFreeHost(AjUp));
493: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
494: } else {
495: if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
496: /* Fill the upper triangular matrix */
497: offset = nzUpper;
498: for (i = n - 1; i >= 0; i--) {
499: v = aa + adiag[i + 1] + 1;
501: /* number of elements NOT on the diagonal */
502: nz = adiag[i] - adiag[i + 1] - 1;
504: /* decrement the offset */
505: offset -= (nz + 1);
507: /* first, set the diagonal elements */
508: upTriFactor->AA_h[offset] = 1. / v[nz];
509: PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
510: }
511: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
512: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
513: }
514: } catch (char *ex) {
515: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
516: }
517: }
518: PetscFunctionReturn(PETSC_SUCCESS);
519: }
521: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
522: {
523: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
524: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
525: IS isrow = a->row, iscol = a->icol;
526: PetscBool row_identity, col_identity;
527: PetscInt n = A->rmap->n;
529: PetscFunctionBegin;
530: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
531: PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
532: PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
534: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
535: cusparseTriFactors->nnz = a->nz;
537: A->offloadmask = PETSC_OFFLOAD_BOTH;
538: /* lower triangular indices */
539: PetscCall(ISIdentity(isrow, &row_identity));
540: if (!row_identity && !cusparseTriFactors->rpermIndices) {
541: const PetscInt *r;
543: PetscCall(ISGetIndices(isrow, &r));
544: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
545: cusparseTriFactors->rpermIndices->assign(r, r + n);
546: PetscCall(ISRestoreIndices(isrow, &r));
547: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
548: }
550: /* upper triangular indices */
551: PetscCall(ISIdentity(iscol, &col_identity));
552: if (!col_identity && !cusparseTriFactors->cpermIndices) {
553: const PetscInt *c;
555: PetscCall(ISGetIndices(iscol, &c));
556: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
557: cusparseTriFactors->cpermIndices->assign(c, c + n);
558: PetscCall(ISRestoreIndices(iscol, &c));
559: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
560: }
561: PetscFunctionReturn(PETSC_SUCCESS);
562: }
564: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
565: {
566: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
567: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
568: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
569: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
570: PetscInt *AiUp, *AjUp;
571: PetscScalar *AAUp;
572: PetscScalar *AALo;
573: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
574: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
575: const PetscInt *ai = b->i, *aj = b->j, *vj;
576: const MatScalar *aa = b->a, *v;
578: PetscFunctionBegin;
579: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
580: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
581: try {
582: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
583: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
584: if (!upTriFactor && !loTriFactor) {
585: /* Allocate Space for the upper triangular matrix */
586: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
587: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
589: /* Fill the upper triangular matrix */
590: AiUp[0] = (PetscInt)0;
591: AiUp[n] = nzUpper;
592: offset = 0;
593: for (i = 0; i < n; i++) {
594: /* set the pointers */
595: v = aa + ai[i];
596: vj = aj + ai[i];
597: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
599: /* first, set the diagonal elements */
600: AjUp[offset] = (PetscInt)i;
601: AAUp[offset] = (MatScalar)1.0 / v[nz];
602: AiUp[i] = offset;
603: AALo[offset] = (MatScalar)1.0 / v[nz];
605: offset += 1;
606: if (nz > 0) {
607: PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
608: PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
609: for (j = offset; j < offset + nz; j++) {
610: AAUp[j] = -AAUp[j];
611: AALo[j] = AAUp[j] / v[nz];
612: }
613: offset += nz;
614: }
615: }
617: /* allocate space for the triangular factor information */
618: PetscCall(PetscNew(&upTriFactor));
619: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
621: /* Create the matrix description */
622: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
623: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
624: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
625: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
626: #else
627: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
628: #endif
629: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
630: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
632: /* set the matrix */
633: upTriFactor->csrMat = new CsrMatrix;
634: upTriFactor->csrMat->num_rows = A->rmap->n;
635: upTriFactor->csrMat->num_cols = A->cmap->n;
636: upTriFactor->csrMat->num_entries = a->nz;
638: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
639: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
641: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
642: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
644: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
645: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
647: /* set the operation */
648: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
650: /* Create the solve analysis information */
651: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
652: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
653: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
654: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
655: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
656: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
657: #endif
659: /* perform the solve analysis */
660: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
661: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
663: PetscCallCUDA(WaitForCUDA());
664: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
666: /* assign the pointer */
667: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
669: /* allocate space for the triangular factor information */
670: PetscCall(PetscNew(&loTriFactor));
671: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
673: /* Create the matrix description */
674: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
675: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
676: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
677: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
678: #else
679: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
680: #endif
681: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
682: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
684: /* set the operation */
685: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
687: /* set the matrix */
688: loTriFactor->csrMat = new CsrMatrix;
689: loTriFactor->csrMat->num_rows = A->rmap->n;
690: loTriFactor->csrMat->num_cols = A->cmap->n;
691: loTriFactor->csrMat->num_entries = a->nz;
693: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
694: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
696: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
697: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
699: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
700: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
702: /* Create the solve analysis information */
703: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
704: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
705: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
706: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
707: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
708: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
709: #endif
711: /* perform the solve analysis */
712: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
713: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
715: PetscCallCUDA(WaitForCUDA());
716: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
718: /* assign the pointer */
719: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
721: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
722: PetscCallCUDA(cudaFreeHost(AiUp));
723: PetscCallCUDA(cudaFreeHost(AjUp));
724: } else {
725: /* Fill the upper triangular matrix */
726: offset = 0;
727: for (i = 0; i < n; i++) {
728: /* set the pointers */
729: v = aa + ai[i];
730: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
732: /* first, set the diagonal elements */
733: AAUp[offset] = 1.0 / v[nz];
734: AALo[offset] = 1.0 / v[nz];
736: offset += 1;
737: if (nz > 0) {
738: PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
739: for (j = offset; j < offset + nz; j++) {
740: AAUp[j] = -AAUp[j];
741: AALo[j] = AAUp[j] / v[nz];
742: }
743: offset += nz;
744: }
745: }
746: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
747: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
748: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
749: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
750: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
751: }
752: PetscCallCUDA(cudaFreeHost(AAUp));
753: PetscCallCUDA(cudaFreeHost(AALo));
754: } catch (char *ex) {
755: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
756: }
757: }
758: PetscFunctionReturn(PETSC_SUCCESS);
759: }
761: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
762: {
763: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
764: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
765: IS ip = a->row;
766: PetscBool perm_identity;
767: PetscInt n = A->rmap->n;
769: PetscFunctionBegin;
770: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
771: PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
772: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
773: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
775: A->offloadmask = PETSC_OFFLOAD_BOTH;
777: /* lower triangular indices */
778: PetscCall(ISIdentity(ip, &perm_identity));
779: if (!perm_identity) {
780: IS iip;
781: const PetscInt *irip, *rip;
783: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
784: PetscCall(ISGetIndices(iip, &irip));
785: PetscCall(ISGetIndices(ip, &rip));
786: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
787: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
788: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
789: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
790: PetscCall(ISRestoreIndices(iip, &irip));
791: PetscCall(ISDestroy(&iip));
792: PetscCall(ISRestoreIndices(ip, &rip));
793: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
794: }
795: PetscFunctionReturn(PETSC_SUCCESS);
796: }
798: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
799: {
800: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
801: IS ip = b->row;
802: PetscBool perm_identity;
804: PetscFunctionBegin;
805: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
806: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
807: B->offloadmask = PETSC_OFFLOAD_CPU;
808: /* determine which version of MatSolve needs to be used. */
809: PetscCall(ISIdentity(ip, &perm_identity));
810: if (perm_identity) {
811: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
812: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
813: B->ops->matsolve = NULL;
814: B->ops->matsolvetranspose = NULL;
815: } else {
816: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
817: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
818: B->ops->matsolve = NULL;
819: B->ops->matsolvetranspose = NULL;
820: }
822: /* get the triangular factors */
823: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
824: PetscFunctionReturn(PETSC_SUCCESS);
825: }
827: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
828: {
829: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
830: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
831: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
832: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
833: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
834: cusparseIndexBase_t indexBase;
835: cusparseMatrixType_t matrixType;
836: cusparseFillMode_t fillMode;
837: cusparseDiagType_t diagType;
839: PetscFunctionBegin;
840: /* allocate space for the transpose of the lower triangular factor */
841: PetscCall(PetscNew(&loTriFactorT));
842: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
844: /* set the matrix descriptors of the lower triangular factor */
845: matrixType = cusparseGetMatType(loTriFactor->descr);
846: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
847: fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
848: diagType = cusparseGetMatDiagType(loTriFactor->descr);
850: /* Create the matrix description */
851: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
852: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
853: PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
854: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
855: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
857: /* set the operation */
858: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
860: /* allocate GPU space for the CSC of the lower triangular factor*/
861: loTriFactorT->csrMat = new CsrMatrix;
862: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
863: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
864: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
865: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
866: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
867: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
869: /* compute the transpose of the lower triangular factor, i.e. the CSC */
870: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
871: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
872: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
873: loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
874: PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
875: #endif
877: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
878: {
879: // there is no clean way to have PetscCallCUSPARSE wrapping this function...
880: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
881: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
882: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
883: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
884: #else
885: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
886: #endif
887: PetscCallCUSPARSE(stat);
888: }
890: PetscCallCUDA(WaitForCUDA());
891: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
893: /* Create the solve analysis information */
894: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
895: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
896: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
897: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
898: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
899: PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
900: #endif
902: /* perform the solve analysis */
903: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
904: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
906: PetscCallCUDA(WaitForCUDA());
907: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
909: /* assign the pointer */
910: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
912: /*********************************************/
913: /* Now the Transpose of the Upper Tri Factor */
914: /*********************************************/
916: /* allocate space for the transpose of the upper triangular factor */
917: PetscCall(PetscNew(&upTriFactorT));
918: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
920: /* set the matrix descriptors of the upper triangular factor */
921: matrixType = cusparseGetMatType(upTriFactor->descr);
922: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
923: fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
924: diagType = cusparseGetMatDiagType(upTriFactor->descr);
926: /* Create the matrix description */
927: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
928: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
929: PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
930: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
931: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
933: /* set the operation */
934: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
936: /* allocate GPU space for the CSC of the upper triangular factor*/
937: upTriFactorT->csrMat = new CsrMatrix;
938: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
939: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
940: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
941: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
942: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
943: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
945: /* compute the transpose of the upper triangular factor, i.e. the CSC */
946: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
947: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
948: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
949: upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
950: PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
951: #endif
953: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
954: {
955: // there is no clean way to have PetscCallCUSPARSE wrapping this function...
956: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
957: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
958: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
959: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
960: #else
961: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
962: #endif
963: PetscCallCUSPARSE(stat);
964: }
966: PetscCallCUDA(WaitForCUDA());
967: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
969: /* Create the solve analysis information */
970: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
971: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
972: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
973: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
974: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
975: PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
976: #endif
978: /* perform the solve analysis */
979: /* christ, would it have killed you to put this stuff in a function????????? */
980: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
981: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
983: PetscCallCUDA(WaitForCUDA());
984: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
986: /* assign the pointer */
987: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
988: PetscFunctionReturn(PETSC_SUCCESS);
989: }
991: struct PetscScalarToPetscInt {
992: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
993: };
995: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
996: {
997: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
998: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
999: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1000: cusparseStatus_t stat;
1001: cusparseIndexBase_t indexBase;
1003: PetscFunctionBegin;
1004: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1005: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1006: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1007: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1008: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1009: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1010: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1011: PetscCall(PetscLogGpuTimeBegin());
1012: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1013: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1014: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1015: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1016: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1017: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1018: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1020: /* set alpha and beta */
1021: PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1022: PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1023: PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1024: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1025: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1026: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1028: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1029: CsrMatrix *matrixT = new CsrMatrix;
1030: matstructT->mat = matrixT;
1031: matrixT->num_rows = A->cmap->n;
1032: matrixT->num_cols = A->rmap->n;
1033: matrixT->num_entries = a->nz;
1034: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1035: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1036: matrixT->values = new THRUSTARRAY(a->nz);
1038: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1039: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1041: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1042: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1043: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1044: indexBase, cusparse_scalartype);
1045: PetscCallCUSPARSE(stat);
1046: #else
1047: /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1048: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1050: I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1051: it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1052: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1053: */
1054: if (matrixT->num_entries) {
1055: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1056: PetscCallCUSPARSE(stat);
1058: } else {
1059: matstructT->matDescr = NULL;
1060: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1061: }
1062: #endif
1063: #endif
1064: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1065: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1066: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1067: #else
1068: CsrMatrix *temp = new CsrMatrix;
1069: CsrMatrix *tempT = new CsrMatrix;
1070: /* First convert HYB to CSR */
1071: temp->num_rows = A->rmap->n;
1072: temp->num_cols = A->cmap->n;
1073: temp->num_entries = a->nz;
1074: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1075: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1076: temp->values = new THRUSTARRAY(a->nz);
1078: stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1079: PetscCallCUSPARSE(stat);
1081: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1082: tempT->num_rows = A->rmap->n;
1083: tempT->num_cols = A->cmap->n;
1084: tempT->num_entries = a->nz;
1085: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1086: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1087: tempT->values = new THRUSTARRAY(a->nz);
1089: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1090: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1091: PetscCallCUSPARSE(stat);
1093: /* Last, convert CSC to HYB */
1094: cusparseHybMat_t hybMat;
1095: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1096: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1097: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1098: PetscCallCUSPARSE(stat);
1100: /* assign the pointer */
1101: matstructT->mat = hybMat;
1102: A->transupdated = PETSC_TRUE;
1103: /* delete temporaries */
1104: if (tempT) {
1105: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1106: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1107: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1108: delete (CsrMatrix *)tempT;
1109: }
1110: if (temp) {
1111: if (temp->values) delete (THRUSTARRAY *)temp->values;
1112: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1113: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1114: delete (CsrMatrix *)temp;
1115: }
1116: #endif
1117: }
1118: }
1119: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1120: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1121: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1122: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1123: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1124: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1125: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1126: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1127: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1128: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1129: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1130: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1131: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1132: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1133: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1134: }
1135: if (!cusparsestruct->csr2csc_i) {
1136: THRUSTARRAY csr2csc_a(matrix->num_entries);
1137: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1139: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1140: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1141: void *csr2cscBuffer;
1142: size_t csr2cscBufferSize;
1143: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1144: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1145: PetscCallCUSPARSE(stat);
1146: PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1147: #endif
1149: if (matrix->num_entries) {
1150: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1151: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1152: I checked every parameter and they were all fine. I have no clue why cusparse complains.
1154: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1155: should be filled with indexBase. So I just take a shortcut here.
1156: */
1157: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1158: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1159: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1160: PetscCallCUSPARSE(stat);
1161: #else
1162: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1163: PetscCallCUSPARSE(stat);
1164: #endif
1165: } else {
1166: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1167: }
1169: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1170: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1171: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1172: PetscCallCUDA(cudaFree(csr2cscBuffer));
1173: #endif
1174: }
1175: PetscCallThrust(
1176: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
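/* The gather above is effectively matrixT->values[k] = matrix->values[csr2csc_i[k]] for each k: once the csr2csc_i
   permutation has been cached, updating the transpose values needs only this shuffle, with no further csr2csc calls. */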
1177: }
1178: PetscCall(PetscLogGpuTimeEnd());
1179: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1180: /* the compressed row indices are not used for matTranspose */
1181: matstructT->cprowIndices = NULL;
1182: /* assign the pointer */
1183: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1184: A->transupdated = PETSC_TRUE;
1185: PetscFunctionReturn(PETSC_SUCCESS);
1186: }
1188: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1189: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1190: {
1191: PetscInt n = xx->map->n;
1192: const PetscScalar *barray;
1193: PetscScalar *xarray;
1194: thrust::device_ptr<const PetscScalar> bGPU;
1195: thrust::device_ptr<PetscScalar> xGPU;
1196: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1197: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1198: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1199: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1201: PetscFunctionBegin;
1202: /* Analyze the matrix and create the transpose ... on the fly */
1203: if (!loTriFactorT && !upTriFactorT) {
1204: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1205: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1206: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1207: }
1209: /* Get the GPU pointers */
1210: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1211: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1212: xGPU = thrust::device_pointer_cast(xarray);
1213: bGPU = thrust::device_pointer_cast(barray);
1215: PetscCall(PetscLogGpuTimeBegin());
1216: /* First, reorder with the row permutation */
1217: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1219: /* Next, solve U */
1220: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1221: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1223: /* Then, solve L */
1224: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1225: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1227: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1228: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1230: /* Copy the temporary to the full solution. */
1231: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1233: /* restore */
1234: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1235: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1236: PetscCall(PetscLogGpuTimeEnd());
1237: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1238: PetscFunctionReturn(PETSC_SUCCESS);
1239: }
1241: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1242: {
1243: const PetscScalar *barray;
1244: PetscScalar *xarray;
1245: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1246: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1247: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1248: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1250: PetscFunctionBegin;
1251: /* Analyze the matrix and create the transpose ... on the fly */
1252: if (!loTriFactorT && !upTriFactorT) {
1253: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1254: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1255: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1256: }
1258: /* Get the GPU pointers */
1259: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1260: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1262: PetscCall(PetscLogGpuTimeBegin());
1263: /* First, solve U */
1264: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1265: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1267: /* Then, solve L */
1268: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1269: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1271: /* restore */
1272: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1273: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1274: PetscCall(PetscLogGpuTimeEnd());
1275: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1276: PetscFunctionReturn(PETSC_SUCCESS);
1277: }
1279: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1280: {
1281: const PetscScalar *barray;
1282: PetscScalar *xarray;
1283: thrust::device_ptr<const PetscScalar> bGPU;
1284: thrust::device_ptr<PetscScalar> xGPU;
1285: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1286: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1287: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1288: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1290: PetscFunctionBegin;
1291: /* Get the GPU pointers */
1292: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1293: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1294: xGPU = thrust::device_pointer_cast(xarray);
1295: bGPU = thrust::device_pointer_cast(barray);
1297: PetscCall(PetscLogGpuTimeBegin());
1298: /* First, reorder with the row permutation */
1299: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1301: /* Next, solve L */
1302: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1303: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1305: /* Then, solve U */
1306: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1307: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1309: /* Last, reorder with the column permutation */
1310: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
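/* Roughly, the whole solve computes x = C (U^{-1} (L^{-1} (R b))), where R and C denote the gathers through
   rpermIndices and cpermIndices, i.e., the row and column permutations from the ordering used at factorization time. */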
1312: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1313: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1314: PetscCall(PetscLogGpuTimeEnd());
1315: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1316: PetscFunctionReturn(PETSC_SUCCESS);
1317: }
1319: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1320: {
1321: const PetscScalar *barray;
1322: PetscScalar *xarray;
1323: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1324: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1325: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1326: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1328: PetscFunctionBegin;
1329: /* Get the GPU pointers */
1330: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1331: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1333: PetscCall(PetscLogGpuTimeBegin());
1334: /* First, solve L */
1335: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1336: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1338: /* Next, solve U */
1339: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1340: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1342: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1343: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1344: PetscCall(PetscLogGpuTimeEnd());
1345: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1346: PetscFunctionReturn(PETSC_SUCCESS);
1347: }
1349: #if CUSPARSE_VERSION >= 11500
1350: /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
1351: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1352: {
1353: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1354: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1355: const PetscScalar *barray;
1356: PetscScalar *xarray;
1358: PetscFunctionBegin;
1359: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1360: PetscCall(VecCUDAGetArrayRead(b, &barray));
1361: PetscCall(PetscLogGpuTimeBegin());
1363: /* Solve L*y = b */
1364: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1365: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1366: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1367: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
1368: fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1370: /* Solve U*x = y */
1371: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1372: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1373: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1375: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1376: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1378: PetscCall(PetscLogGpuTimeEnd());
1379: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1380: PetscFunctionReturn(PETSC_SUCCESS);
1381: }
1383: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1384: {
1385: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1386: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1387: const PetscScalar *barray;
1388: PetscScalar *xarray;
1390: PetscFunctionBegin;
1391: if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1392: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1393: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1394: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1396: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1397: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1398: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1399: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1400: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1401: }
1403: if (!fs->updatedTransposeSpSVAnalysis) {
1404: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1406: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1407: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1408: }
1410: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1411: PetscCall(VecCUDAGetArrayRead(b, &barray));
1412: PetscCall(PetscLogGpuTimeBegin());
1414: /* Solve Ut*y = b */
1415: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1416: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1417: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1418: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1420: /* Solve Lt*x = y */
1421: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1422: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1423: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1425: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1426: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1427: PetscCall(PetscLogGpuTimeEnd());
1428: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1429: PetscFunctionReturn(PETSC_SUCCESS);
1430: }
1432: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1433: {
1434: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1435: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1436: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1437: CsrMatrix *Acsr;
1438: PetscInt m, nz;
1439: PetscBool flg;
1441: PetscFunctionBegin;
1442: if (PetscDefined(USE_DEBUG)) {
1443: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1444: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1445: }
1447: /* Copy A's value to fact */
1448: m = fact->rmap->n;
1449: nz = aij->nz;
1450: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1451: Acsr = (CsrMatrix *)Acusp->mat->mat;
1452: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1454: /* Factorize fact inplace */
1455: if (m)
1456: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1457: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1458: if (PetscDefined(USE_DEBUG)) {
1459: int numerical_zero;
1460: cusparseStatus_t status;
1461: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1462: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1463: }
1465: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore we do it after cusparseXcsrilu02().
1466: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1467: */
1468: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1470: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1472: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1473: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1475: fact->offloadmask = PETSC_OFFLOAD_GPU;
1476: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0;
1477: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1478: fact->ops->matsolve = NULL;
1479: fact->ops->matsolvetranspose = NULL;
1480: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1481: PetscFunctionReturn(PETSC_SUCCESS);
1482: }
1484: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1485: {
1486: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1487: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1488: PetscInt m, nz;
1490: PetscFunctionBegin;
1491: if (PetscDefined(USE_DEBUG)) {
1492: PetscInt i;
1493: PetscBool flg, missing;
1495: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1496: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1497: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1498: PetscCall(MatMissingDiagonal(A, &missing, &i));
1499: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1500: }
1502: /* Free the old stale stuff */
1503: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1505: /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
1506: but they will not be used; we allocate them just for easier debugging.
1507: */
1508: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1510: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1511: fact->factortype = MAT_FACTOR_ILU;
1512: fact->info.factor_mallocs = 0;
1513: fact->info.fill_ratio_given = info->fill;
1514: fact->info.fill_ratio_needed = 1.0;
1516: aij->row = NULL;
1517: aij->col = NULL;
1519: /* ====================================================================== */
1520: /* Copy A's i, j to fact and also allocate the value array of fact. */
1521: /* We'll do in-place factorization on fact */
1522: /* ====================================================================== */
1523: const int *Ai, *Aj;
1525: m = fact->rmap->n;
1526: nz = aij->nz;
1528: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1529: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1530: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1531: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1532: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1533: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1535: /* ====================================================================== */
1536: /* Create descriptors for M, L, U */
1537: /* ====================================================================== */
1538: cusparseFillMode_t fillMode;
1539: cusparseDiagType_t diagType;
1541: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1542: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1543: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1545: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1546: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1547: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1548: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1549: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1550: */
1551: fillMode = CUSPARSE_FILL_MODE_LOWER;
1552: diagType = CUSPARSE_DIAG_TYPE_UNIT;
1553: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1554: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1555: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1557: fillMode = CUSPARSE_FILL_MODE_UPPER;
1558: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1559: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1560: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1561: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
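/* Note that spMatDescr_L and spMatDescr_U alias the same csrRowPtr/csrColIdx/csrVal arrays that will hold the in-place ILU(0)
   factor; the FILL_MODE and DIAG_TYPE attributes tell SpSV which triangle, and which diagonal convention, to read from them. */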
1563: /* ========================================================================= */
1564: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1565: /* ========================================================================= */
1566: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1567: if (m)
1568: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1569: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1571: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1572: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1574: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1575: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1577: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1578: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1580: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1581: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1583: /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1584: and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1585: spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of spsvBuffer_L/U.
1586: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1587: */
1588: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1589: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1590: fs->spsvBuffer_L = fs->factBuffer_M;
1591: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1592: } else {
1593: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1594: fs->spsvBuffer_U = fs->factBuffer_M;
1595: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1596: }
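/* A hypothetical example of the sharing: if spsvBufferSize_L = 10 MB, spsvBufferSize_U = 6 MB and factBufferSize_M = 8 MB,
   we make one max(10,8) = 10 MB allocation serving as both factBuffer_M and spsvBuffer_L, plus a separate 6 MB spsvBuffer_U,
   instead of three separate allocations. */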
1598: /* ========================================================================== */
1599: /* Perform analysis of ilu0 on M, SpSv on L and U */
1600: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1601: /* ========================================================================== */
1602: int structural_zero;
1603: cusparseStatus_t status;
1605: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1606: if (m)
1607: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1608: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1609: if (PetscDefined(USE_DEBUG)) {
1610: /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1611: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1612: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1613: }
1615: /* Estimate FLOPs of the numeric factorization */
1616: {
1617: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1618: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1619: PetscLogDouble flops = 0.0;
1621: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1622: Ai = Aseq->i;
1623: Adiag = Aseq->diag;
1624: for (PetscInt i = 0; i < m; i++) {
1625: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1626: nzRow = Ai[i + 1] - Ai[i];
1627: nzLeft = Adiag[i] - Ai[i];
1628: /* We want to eliminate the nonzeros left of the diagonal one by one. Assume that each elimination updates the
1629: nonzeros to the right of, and including, the eliminated one, which incurs a multiplication and an addition per entry.
1630: */
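/* For example, a row with nzRow = 5 contributes nzLeft = (5 - 1) / 2 = 2 and 2 * (2*5 - 2 + 1) = 18 flops to the estimate. */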
1631: nzLeft = (nzRow - 1) / 2;
1632: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1633: }
1634: }
1635: fs->numericFactFlops = flops;
1636: }
1637: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1638: PetscFunctionReturn(PETSC_SUCCESS);
1639: }
1641: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1642: {
1643: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1644: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1645: const PetscScalar *barray;
1646: PetscScalar *xarray;
1648: PetscFunctionBegin;
1649: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1650: PetscCall(VecCUDAGetArrayRead(b, &barray));
1651: PetscCall(PetscLogGpuTimeBegin());
1653: /* Solve L*y = b */
1654: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1655: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1656: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1657: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1659: /* Solve Lt*x = y */
1660: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1661: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1662: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1664: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1665: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1667: PetscCall(PetscLogGpuTimeEnd());
1668: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1669: PetscFunctionReturn(PETSC_SUCCESS);
1670: }
1672: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1673: {
1674: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1675: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1676: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1677: CsrMatrix *Acsr;
1678: PetscInt m, nz;
1679: PetscBool flg;
1681: PetscFunctionBegin;
1682: if (PetscDefined(USE_DEBUG)) {
1683: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1684: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1685: }
1687: /* Copy A's value to fact */
1688: m = fact->rmap->n;
1689: nz = aij->nz;
1690: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1691: Acsr = (CsrMatrix *)Acusp->mat->mat;
1692: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1694: /* Factorize fact inplace */
1695: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1696: Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1697: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1698: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1699: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1700: */
1701: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1702: if (PetscDefined(USE_DEBUG)) {
1703: int numerical_zero;
1704: cusparseStatus_t status;
1705: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1706: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1707: }
1709: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1711: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1712: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1713: */
1714: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1716: fact->offloadmask = PETSC_OFFLOAD_GPU;
1717: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
1718: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
1719: fact->ops->matsolve = NULL;
1720: fact->ops->matsolvetranspose = NULL;
1721: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1722: PetscFunctionReturn(PETSC_SUCCESS);
1723: }
1725: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1726: {
1727: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1728: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1729: PetscInt m, nz;
1731: PetscFunctionBegin;
1732: if (PetscDefined(USE_DEBUG)) {
1733: PetscInt i;
1734: PetscBool flg, missing;
1736: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1737: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1738: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1739: PetscCall(MatMissingDiagonal(A, &missing, &i));
1740: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1741: }
1743: /* Free the old stale stuff */
1744: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1746: /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
1747: but they will not be used; we allocate them just for easier debugging.
1748: */
1749: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1751: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1752: fact->factortype = MAT_FACTOR_ICC;
1753: fact->info.factor_mallocs = 0;
1754: fact->info.fill_ratio_given = info->fill;
1755: fact->info.fill_ratio_needed = 1.0;
1757: aij->row = NULL;
1758: aij->col = NULL;
1760: /* ====================================================================== */
1761: /* Copy A's i, j to fact and also allocate the value array of fact. */
1762: /* We'll do in-place factorization on fact */
1763: /* ====================================================================== */
1764: const int *Ai, *Aj;
1766: m = fact->rmap->n;
1767: nz = aij->nz;
1769: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1770: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1771: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1772: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1773: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1774: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1776: /* ====================================================================== */
1777: /* Create mat descriptors for M, L */
1778: /* ====================================================================== */
1779: cusparseFillMode_t fillMode;
1780: cusparseDiagType_t diagType;
1782: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1783: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1784: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1786: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1787: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1788: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1789: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1790: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1791: */
1792: fillMode = CUSPARSE_FILL_MODE_LOWER;
1793: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1794: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1795: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1796: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1798: /* ========================================================================= */
1799: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1800: /* ========================================================================= */
1801: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
1802: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1804: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1805: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1807: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1808: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1810: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1811: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1813: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1814: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1816: /* To save device memory, we make the factorization buffer share with one of the solve buffers.
1817: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1818: */
1819: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1820: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1821: fs->spsvBuffer_L = fs->factBuffer_M;
1822: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1823: } else {
1824: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1825: fs->spsvBuffer_Lt = fs->factBuffer_M;
1826: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1827: }
1829: /* ========================================================================== */
1830: /* Perform analysis of ic0 on M */
1831: /* The lower triangular part of M has the same sparsity pattern as L */
1832: /* ========================================================================== */
1833: int structural_zero;
1834: cusparseStatus_t status;
1836: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1837: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1838: if (PetscDefined(USE_DEBUG)) {
1839: /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1840: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1841: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1842: }
1844: /* Estimate FLOPs of the numeric factorization */
1845: {
1846: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1847: PetscInt *Ai, nzRow, nzLeft;
1848: PetscLogDouble flops = 0.0;
1850: Ai = Aseq->i;
1851: for (PetscInt i = 0; i < m; i++) {
1852: nzRow = Ai[i + 1] - Ai[i];
1853: if (nzRow > 1) {
1854: /* We want to eliminate the nonzeros left of the diagonal one by one. Assume that each elimination updates the
1855: nonzeros to the right of, and including, the eliminated one, which incurs a multiplication and an addition per entry.
1856: */
1857: nzLeft = (nzRow - 1) / 2;
1858: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1859: }
1860: }
1861: fs->numericFactFlops = flops;
1862: }
1863: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1864: PetscFunctionReturn(PETSC_SUCCESS);
1865: }
1866: #endif
1868: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1869: {
1870: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1872: PetscFunctionBegin;
1873: #if CUSPARSE_VERSION >= 11500
1874: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1875: if (cusparseTriFactors->factorizeOnDevice) {
1876: PetscCall(ISIdentity(isrow, &row_identity));
1877: PetscCall(ISIdentity(iscol, &col_identity));
1878: }
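/* Use the device ILU(0) path (csrilu02 + SpSV) only when ILU(0) (levels = 0) is requested with identity row and column
   permutations; all other cases fall through to the host symbolic factorization below. */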
1879: if (!info->levels && row_identity && col_identity) {
1880: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1881: } else
1882: #endif
1883: {
1884: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1885: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1886: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1887: }
1888: PetscFunctionReturn(PETSC_SUCCESS);
1889: }
1891: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1892: {
1893: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1895: PetscFunctionBegin;
1896: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1897: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1898: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1899: PetscFunctionReturn(PETSC_SUCCESS);
1900: }
1902: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1903: {
1904: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1906: PetscFunctionBegin;
1907: #if CUSPARSE_VERSION >= 11500
1908: PetscBool perm_identity = PETSC_FALSE;
1909: if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1910: if (!info->levels && perm_identity) {
1911: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1912: } else
1913: #endif
1914: {
1915: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1916: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1917: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1918: }
1919: PetscFunctionReturn(PETSC_SUCCESS);
1920: }
1922: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1923: {
1924: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1926: PetscFunctionBegin;
1927: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1928: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1929: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1930: PetscFunctionReturn(PETSC_SUCCESS);
1931: }
1933: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
1934: {
1935: PetscFunctionBegin;
1936: *type = MATSOLVERCUSPARSE;
1937: PetscFunctionReturn(PETSC_SUCCESS);
1938: }
1940: /*MC
1941: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
1942: of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
1943: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1944: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1945: cuSPARSE triangular solve algorithm, but the performance can be quite poor, so these
1946: algorithms are not recommended. This class does NOT support direct solver operations.
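   For example, with a matrix of type `MATSEQAIJCUSPARSE`, an ILU(0) preconditioner backed by this solver can typically be
   selected with the options -pc_type ilu -pc_factor_mat_solver_type cusparse.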
1948: Level: beginner
1950: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
1951: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1952: M*/
1954: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1955: {
1956: PetscInt n = A->rmap->n;
1957: PetscBool factOnDevice, factOnHost;
1958: char *prefix;
1959: char factPlace[32] = "device"; /* the default */
1961: PetscFunctionBegin;
1962: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1963: PetscCall(MatSetSizes(*B, n, n, n, n));
1964: (*B)->factortype = ftype;
1965: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
1967: prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1968: PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
1969: PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
1970: PetscOptionsEnd();
1971: PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
1972: PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
1973: PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
1974: ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
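/* For example, running with -mat_factor_bind_factorization host requests that the factorization be done on the CPU when
   possible, while the triangular solves may still run on the GPU. */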
1976: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1977: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1978: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1979: if (!A->boundtocpu) {
1980: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1981: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1982: } else {
1983: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1984: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1985: }
1986: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1987: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1988: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1989: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1990: if (!A->boundtocpu) {
1991: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1992: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1993: } else {
1994: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1995: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1996: }
1997: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1998: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1999: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2001: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2002: (*B)->canuseordering = PETSC_TRUE;
2003: PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2004: PetscFunctionReturn(PETSC_SUCCESS);
2005: }
2007: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2008: {
2009: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2010: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2011: #if CUSPARSE_VERSION >= 13500
2012: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2013: #endif
2015: PetscFunctionBegin;
2016: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2017: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2018: if (A->factortype == MAT_FACTOR_NONE) {
2019: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2020: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2021: }
2022: #if CUSPARSE_VERSION >= 13500
2023: else if (fs->csrVal) {
2024: /* We have a factorized matrix on device and are able to copy it to host */
2025: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2026: }
2027: #endif
2028: else
2029: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2030: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2031: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2032: A->offloadmask = PETSC_OFFLOAD_BOTH;
2033: }
2034: PetscFunctionReturn(PETSC_SUCCESS);
2035: }
2037: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2038: {
2039: PetscFunctionBegin;
2040: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2041: *array = ((Mat_SeqAIJ *)A->data)->a;
2042: PetscFunctionReturn(PETSC_SUCCESS);
2043: }
2045: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2046: {
2047: PetscFunctionBegin;
2048: A->offloadmask = PETSC_OFFLOAD_CPU;
2049: *array = NULL;
2050: PetscFunctionReturn(PETSC_SUCCESS);
2051: }
2053: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2054: {
2055: PetscFunctionBegin;
2056: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2057: *array = ((Mat_SeqAIJ *)A->data)->a;
2058: PetscFunctionReturn(PETSC_SUCCESS);
2059: }
2061: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2062: {
2063: PetscFunctionBegin;
2064: *array = NULL;
2065: PetscFunctionReturn(PETSC_SUCCESS);
2066: }
2068: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2069: {
2070: PetscFunctionBegin;
2071: *array = ((Mat_SeqAIJ *)A->data)->a;
2072: PetscFunctionReturn(PETSC_SUCCESS);
2073: }
2075: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2076: {
2077: PetscFunctionBegin;
2078: A->offloadmask = PETSC_OFFLOAD_CPU;
2079: *array = NULL;
2080: PetscFunctionReturn(PETSC_SUCCESS);
2081: }
2083: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2084: {
2085: Mat_SeqAIJCUSPARSE *cusp;
2086: CsrMatrix *matrix;
2088: PetscFunctionBegin;
2089: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2090: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2091: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2092: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2093: matrix = (CsrMatrix *)cusp->mat->mat;
2095: if (i) {
2096: #if !defined(PETSC_USE_64BIT_INDICES)
2097: *i = matrix->row_offsets->data().get();
2098: #else
2099: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
2100: #endif
2101: }
2102: if (j) {
2103: #if !defined(PETSC_USE_64BIT_INDICES)
2104: *j = matrix->column_indices->data().get();
2105: #else
2106: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
2107: #endif
2108: }
2109: if (a) *a = matrix->values->data().get();
2110: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2111: PetscFunctionReturn(PETSC_SUCCESS);
2112: }
2114: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2115: {
2116: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2117: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2118: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2119: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2120: cusparseStatus_t stat;
2121: PetscBool both = PETSC_TRUE;
2123: PetscFunctionBegin;
2124: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2125: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2126: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2127: CsrMatrix *matrix;
2128: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2130: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2131: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2132: matrix->values->assign(a->a, a->a + a->nz);
2133: PetscCallCUDA(WaitForCUDA());
2134: PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2135: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2136: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2137: } else {
2138: PetscInt nnz;
2139: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2140: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2141: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2142: delete cusparsestruct->workVector;
2143: delete cusparsestruct->rowoffsets_gpu;
2144: cusparsestruct->workVector = NULL;
2145: cusparsestruct->rowoffsets_gpu = NULL;
2146: try {
2147: if (a->compressedrow.use) {
2148: m = a->compressedrow.nrows;
2149: ii = a->compressedrow.i;
2150: ridx = a->compressedrow.rindex;
2151: } else {
2152: m = A->rmap->n;
2153: ii = a->i;
2154: ridx = NULL;
2155: }
2156: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2157: if (!a->a) {
2158: nnz = ii[m];
2159: both = PETSC_FALSE;
2160: } else nnz = a->nz;
2161: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2163: /* create cusparse matrix */
2164: cusparsestruct->nrows = m;
2165: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2166: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2167: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2168: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2170: PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2171: PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2172: PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2173: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2174: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2175: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2176: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2178: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2179: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2180: /* set the matrix */
2181: CsrMatrix *mat = new CsrMatrix;
2182: mat->num_rows = m;
2183: mat->num_cols = A->cmap->n;
2184: mat->num_entries = nnz;
2185: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2186: mat->row_offsets->assign(ii, ii + m + 1);
2188: mat->column_indices = new THRUSTINTARRAY32(nnz);
2189: mat->column_indices->assign(a->j, a->j + nnz);
2191: mat->values = new THRUSTARRAY(nnz);
2192: if (a->a) mat->values->assign(a->a, a->a + nnz);
2194: /* assign the pointer */
2195: matstruct->mat = mat;
2196: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2197: if (mat->num_rows) { /* cusparse errors on empty matrices! */
2198: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2199: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2200: PetscCallCUSPARSE(stat);
2201: }
2202: #endif
2203: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2204: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2205: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2206: #else
2207: CsrMatrix *mat = new CsrMatrix;
2208: mat->num_rows = m;
2209: mat->num_cols = A->cmap->n;
2210: mat->num_entries = nnz;
2211: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2212: mat->row_offsets->assign(ii, ii + m + 1);
2214: mat->column_indices = new THRUSTINTARRAY32(nnz);
2215: mat->column_indices->assign(a->j, a->j + nnz);
2217: mat->values = new THRUSTARRAY(nnz);
2218: if (a->a) mat->values->assign(a->a, a->a + nnz);
2220: cusparseHybMat_t hybMat;
2221: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2222: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2223: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2224: PetscCallCUSPARSE(stat);
2225: /* assign the pointer */
2226: matstruct->mat = hybMat;
2228: if (mat) {
2229: if (mat->values) delete (THRUSTARRAY *)mat->values;
2230: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2231: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2232: delete (CsrMatrix *)mat;
2233: }
2234: #endif
2235: }
2237: /* assign the compressed row indices */
2238: if (a->compressedrow.use) {
2239: cusparsestruct->workVector = new THRUSTARRAY(m);
2240: matstruct->cprowIndices = new THRUSTINTARRAY(m);
2241: matstruct->cprowIndices->assign(ridx, ridx + m);
2242: tmp = m;
2243: } else {
2244: cusparsestruct->workVector = NULL;
2245: matstruct->cprowIndices = NULL;
2246: tmp = 0;
2247: }
2248: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2250: /* assign the pointer */
2251: cusparsestruct->mat = matstruct;
2252: } catch (char *ex) {
2253: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2254: }
2255: PetscCallCUDA(WaitForCUDA());
2256: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2257: cusparsestruct->nonzerostate = A->nonzerostate;
2258: }
2259: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2260: }
2261: PetscFunctionReturn(PETSC_SUCCESS);
2262: }
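/* Thrust functors applied through zip iterators over pairs of values: VecCUDAPlusEquals accumulates
   (second element += first), VecCUDAEquals assigns (second = first), and VecCUDAEqualsReverse copies in the
   reverse direction (first = second). They are used, e.g., by the compressed-row mat-vec code below to move
   values between packed work vectors and full-length vectors. */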
2264: struct VecCUDAPlusEquals {
2265: template <typename Tuple>
2266: __host__ __device__ void operator()(Tuple t)
2267: {
2268: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2269: }
2270: };
2272: struct VecCUDAEquals {
2273: template <typename Tuple>
2274: __host__ __device__ void operator()(Tuple t)
2275: {
2276: thrust::get<1>(t) = thrust::get<0>(t);
2277: }
2278: };
2280: struct VecCUDAEqualsReverse {
2281: template <typename Tuple>
2282: __host__ __device__ void operator()(Tuple t)
2283: {
2284: thrust::get<0>(t) = thrust::get<1>(t);
2285: }
2286: };
2288: struct MatMatCusparse {
2289: PetscBool cisdense;
2290: PetscScalar *Bt;
2291: Mat X;
2292: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2293: PetscLogDouble flops;
2294: CsrMatrix *Bcsr;
2296: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2297: cusparseSpMatDescr_t matSpBDescr;
2298: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2299: cusparseDnMatDescr_t matBDescr;
2300: cusparseDnMatDescr_t matCDescr;
2301: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2302: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2303: void *dBuffer4;
2304: void *dBuffer5;
2305: #endif
2306: size_t mmBufferSize;
2307: void *mmBuffer;
2308: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2309: cusparseSpGEMMDescr_t spgemmDesc;
2310: #endif
2311: };
2313: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2314: {
2315: MatMatCusparse *mmdata = (MatMatCusparse *)data;
2317: PetscFunctionBegin;
2318: PetscCallCUDA(cudaFree(mmdata->Bt));
2319: delete mmdata->Bcsr;
2320: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2321: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2322: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2323: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2324: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2325: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2326: if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2327: if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2328: #endif
2329: if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2330: if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2331: #endif
2332: PetscCall(MatDestroy(&mmdata->X));
2333: PetscCall(PetscFree(data));
2334: PetscFunctionReturn(PETSC_SUCCESS);
2335: }
2337: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2339: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2340: {
2341: Mat_Product *product = C->product;
2342: Mat A, B;
2343: PetscInt m, n, blda, clda;
2344: PetscBool flg, biscuda;
2345: Mat_SeqAIJCUSPARSE *cusp;
2346: cusparseStatus_t stat;
2347: cusparseOperation_t opA;
2348: const PetscScalar *barray;
2349: PetscScalar *carray;
2350: MatMatCusparse *mmdata;
2351: Mat_SeqAIJCUSPARSEMultStruct *mat;
2352: CsrMatrix *csrmat;
2354: PetscFunctionBegin;
2355: MatCheckProduct(C, 1);
2356: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2357: mmdata = (MatMatCusparse *)product->data;
2358: A = product->A;
2359: B = product->B;
2360: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2361: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2362: /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
2363: Instead of silently accepting a wrong answer, we prefer to raise an error */
2364: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2365: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2366: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2367: switch (product->type) {
2368: case MATPRODUCT_AB:
2369: case MATPRODUCT_PtAP:
2370: mat = cusp->mat;
2371: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2372: m = A->rmap->n;
2373: n = B->cmap->n;
2374: break;
2375: case MATPRODUCT_AtB:
2376: if (!A->form_explicit_transpose) {
2377: mat = cusp->mat;
2378: opA = CUSPARSE_OPERATION_TRANSPOSE;
2379: } else {
2380: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2381: mat = cusp->matTranspose;
2382: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2383: }
2384: m = A->cmap->n;
2385: n = B->cmap->n;
2386: break;
2387: case MATPRODUCT_ABt:
2388: case MATPRODUCT_RARt:
2389: mat = cusp->mat;
2390: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2391: m = A->rmap->n;
2392: n = B->rmap->n;
2393: break;
2394: default:
2395: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2396: }
2397: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2398: csrmat = (CsrMatrix *)mat->mat;
2399: /* if the user passed a CPU matrix, copy the data to the GPU */
2400: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2401: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2402: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2404: PetscCall(MatDenseGetLDA(B, &blda));
2405: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2406: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2407: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2408: } else {
2409: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2410: PetscCall(MatDenseGetLDA(C, &clda));
2411: }
2413: PetscCall(PetscLogGpuTimeBegin());
2414: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2415: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2416: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2417: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2418: size_t mmBufferSize;
2419: if (mmdata->initialized && mmdata->Blda != blda) {
2420: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2421: mmdata->matBDescr = NULL;
2422: }
2423: if (!mmdata->matBDescr) {
2424: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2425: mmdata->Blda = blda;
2426: }
2428: if (mmdata->initialized && mmdata->Clda != clda) {
2429: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2430: mmdata->matCDescr = NULL;
2431: }
2432: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2433: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2434: mmdata->Clda = clda;
2435: }
2437: if (!mat->matDescr) {
2438: stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2439: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2440: PetscCallCUSPARSE(stat);
2441: }
2442: stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2443: PetscCallCUSPARSE(stat);
2444: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2445: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2446: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2447: mmdata->mmBufferSize = mmBufferSize;
2448: }
2449: mmdata->initialized = PETSC_TRUE;
2450: } else {
2451: /* to be safe, always update pointers of the mats */
2452: PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2453: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2454: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2455: }
2457: /* do cusparseSpMM, which supports transpose on B */
2458: stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2459: PetscCallCUSPARSE(stat);
2460: #else
2461: PetscInt k;
2462: /* cusparseXcsrmm does not support transpose on B */
2463: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2464: cublasHandle_t cublasv2handle;
2465: cublasStatus_t cerr;
2467: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
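/* Out-of-place dense transpose via GEAM: Bt = 1.0*B^T + 0.0*B^T, stored with leading dimension B->cmap->n,
so the non-transpose csrmm call below can consume it */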
2468: cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2469: PetscCallCUBLAS(cerr);
2470: blda = B->cmap->n;
2471: k = B->cmap->n;
2472: } else {
2473: k = B->rmap->n;
2474: }
2476: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2477: stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2478: PetscCallCUSPARSE(stat);
2479: #endif
2480: PetscCall(PetscLogGpuTimeEnd());
2481: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2482: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2483: if (product->type == MATPRODUCT_RARt) {
2484: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2485: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2486: } else if (product->type == MATPRODUCT_PtAP) {
2487: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2488: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2489: } else {
2490: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2491: }
2492: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2493: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2494: PetscFunctionReturn(PETSC_SUCCESS);
2495: }
2497: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2498: {
2499: Mat_Product *product = C->product;
2500: Mat A, B;
2501: PetscInt m, n;
2502: PetscBool cisdense, flg;
2503: MatMatCusparse *mmdata;
2504: Mat_SeqAIJCUSPARSE *cusp;
2506: PetscFunctionBegin;
2507: MatCheckProduct(C, 1);
2508: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2509: A = product->A;
2510: B = product->B;
2511: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2512: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2513: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2514: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2515: switch (product->type) {
2516: case MATPRODUCT_AB:
2517: m = A->rmap->n;
2518: n = B->cmap->n;
2519: break;
2520: case MATPRODUCT_AtB:
2521: m = A->cmap->n;
2522: n = B->cmap->n;
2523: break;
2524: case MATPRODUCT_ABt:
2525: m = A->rmap->n;
2526: n = B->rmap->n;
2527: break;
2528: case MATPRODUCT_PtAP:
2529: m = B->cmap->n;
2530: n = B->cmap->n;
2531: break;
2532: case MATPRODUCT_RARt:
2533: m = B->rmap->n;
2534: n = B->rmap->n;
2535: break;
2536: default:
2537: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2538: }
2539: PetscCall(MatSetSizes(C, m, n, m, n));
2540: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2541: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2542: PetscCall(MatSetType(C, MATSEQDENSECUDA));
2544: /* product data */
2545: PetscCall(PetscNew(&mmdata));
2546: mmdata->cisdense = cisdense;
2547: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2548: /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2549: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2550: #endif
2551: /* for these products we need intermediate storage */
2552: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2553: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2554: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2555: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2556: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2557: } else {
2558: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2559: }
2560: }
2561: C->product->data = mmdata;
2562: C->product->destroy = MatDestroy_MatMatCusparse;
2564: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2565: PetscFunctionReturn(PETSC_SUCCESS);
2566: }
2568: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2569: {
2570: Mat_Product *product = C->product;
2571: Mat A, B;
2572: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2573: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2574: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2575: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2576: PetscBool flg;
2577: cusparseStatus_t stat;
2578: MatProductType ptype;
2579: MatMatCusparse *mmdata;
2580: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2581: cusparseSpMatDescr_t BmatSpDescr;
2582: #endif
2583: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2585: PetscFunctionBegin;
2586: MatCheckProduct(C, 1);
2587: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2588: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2589: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2590: mmdata = (MatMatCusparse *)C->product->data;
2591: A = product->A;
2592: B = product->B;
2593: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2594: mmdata->reusesym = PETSC_FALSE;
2595: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2596: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2597: Cmat = Ccusp->mat;
2598: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2599: Ccsr = (CsrMatrix *)Cmat->mat;
2600: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2601: goto finalize;
2602: }
2603: if (!c->nz) goto finalize;
2604: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2605: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2606: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2607: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2608: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2609: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2610: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2611: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2612: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2613: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2614: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2615: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2616: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2617: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2619: ptype = product->type;
2620: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2621: ptype = MATPRODUCT_AB;
2622: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2623: }
2624: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2625: ptype = MATPRODUCT_AB;
2626: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2627: }
2628: switch (ptype) {
2629: case MATPRODUCT_AB:
2630: Amat = Acusp->mat;
2631: Bmat = Bcusp->mat;
2632: break;
2633: case MATPRODUCT_AtB:
2634: Amat = Acusp->matTranspose;
2635: Bmat = Bcusp->mat;
2636: break;
2637: case MATPRODUCT_ABt:
2638: Amat = Acusp->mat;
2639: Bmat = Bcusp->matTranspose;
2640: break;
2641: default:
2642: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2643: }
2644: Cmat = Ccusp->mat;
2645: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2646: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2647: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2648: Acsr = (CsrMatrix *)Amat->mat;
2649: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2650: Ccsr = (CsrMatrix *)Cmat->mat;
2651: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2652: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2653: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2654: PetscCall(PetscLogGpuTimeBegin());
2655: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2656: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2657: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2658: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2659: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2660: PetscCallCUSPARSE(stat);
2661: #else
2662: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2663: PetscCallCUSPARSE(stat);
2664: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2665: PetscCallCUSPARSE(stat);
2666: #endif
2667: #else
2668: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2669: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2670: PetscCallCUSPARSE(stat);
2671: #endif
2672: PetscCall(PetscLogGpuFlops(mmdata->flops));
2673: PetscCallCUDA(WaitForCUDA());
2674: PetscCall(PetscLogGpuTimeEnd());
2675: C->offloadmask = PETSC_OFFLOAD_GPU;
2676: finalize:
2677: /* shorter version of MatAssemblyEnd_SeqAIJ */
2678: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2679: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2680: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2681: c->reallocs = 0;
2682: C->info.mallocs += 0;
2683: C->info.nz_unneeded = 0;
2684: C->assembled = C->was_assembled = PETSC_TRUE;
2685: C->num_ass++;
2686: PetscFunctionReturn(PETSC_SUCCESS);
2687: }
2689: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2690: {
2691: Mat_Product *product = C->product;
2692: Mat A, B;
2693: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2694: Mat_SeqAIJ *a, *b, *c;
2695: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2696: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2697: PetscInt i, j, m, n, k;
2698: PetscBool flg;
2699: cusparseStatus_t stat;
2700: MatProductType ptype;
2701: MatMatCusparse *mmdata;
2702: PetscLogDouble flops;
2703: PetscBool biscompressed, ciscompressed;
2704: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2705: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2706: cusparseSpMatDescr_t BmatSpDescr;
2707: #else
2708: int cnz;
2709: #endif
2710: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2712: PetscFunctionBegin;
2713: MatCheckProduct(C, 1);
2714: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2715: A = product->A;
2716: B = product->B;
2717: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2718: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2719: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2720: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2721: a = (Mat_SeqAIJ *)A->data;
2722: b = (Mat_SeqAIJ *)B->data;
2723: /* product data */
2724: PetscCall(PetscNew(&mmdata));
2725: C->product->data = mmdata;
2726: C->product->destroy = MatDestroy_MatMatCusparse;
2728: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2729: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2730: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2731: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2732: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2733: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2735: ptype = product->type;
2736: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2737: ptype = MATPRODUCT_AB;
2738: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2739: }
2740: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2741: ptype = MATPRODUCT_AB;
2742: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2743: }
2744: biscompressed = PETSC_FALSE;
2745: ciscompressed = PETSC_FALSE;
2746: switch (ptype) {
2747: case MATPRODUCT_AB:
2748: m = A->rmap->n;
2749: n = B->cmap->n;
2750: k = A->cmap->n;
2751: Amat = Acusp->mat;
2752: Bmat = Bcusp->mat;
2753: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2754: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2755: break;
2756: case MATPRODUCT_AtB:
2757: m = A->cmap->n;
2758: n = B->cmap->n;
2759: k = A->rmap->n;
2760: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2761: Amat = Acusp->matTranspose;
2762: Bmat = Bcusp->mat;
2763: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2764: break;
2765: case MATPRODUCT_ABt:
2766: m = A->rmap->n;
2767: n = B->rmap->n;
2768: k = A->cmap->n;
2769: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2770: Amat = Acusp->mat;
2771: Bmat = Bcusp->matTranspose;
2772: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2773: break;
2774: default:
2775: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2776: }
2778: /* create cusparse matrix */
2779: PetscCall(MatSetSizes(C, m, n, m, n));
2780: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2781: c = (Mat_SeqAIJ *)C->data;
2782: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2783: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2784: Ccsr = new CsrMatrix;
2786: c->compressedrow.use = ciscompressed;
2787: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
2788: c->compressedrow.nrows = a->compressedrow.nrows;
2789: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2790: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2791: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2792: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2793: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2794: } else {
2795: c->compressedrow.nrows = 0;
2796: c->compressedrow.i = NULL;
2797: c->compressedrow.rindex = NULL;
2798: Ccusp->workVector = NULL;
2799: Cmat->cprowIndices = NULL;
2800: }
2801: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2802: Ccusp->mat = Cmat;
2803: Ccusp->mat->mat = Ccsr;
2804: Ccsr->num_rows = Ccusp->nrows;
2805: Ccsr->num_cols = n;
2806: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2807: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2808: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2809: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2810: PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
2811: PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
2812: PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
2813: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2814: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2815: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2816: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2817: thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2818: c->nz = 0;
2819: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2820: Ccsr->values = new THRUSTARRAY(c->nz);
2821: goto finalizesym;
2822: }
2824: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2825: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2826: Acsr = (CsrMatrix *)Amat->mat;
2827: if (!biscompressed) {
2828: Bcsr = (CsrMatrix *)Bmat->mat;
2829: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2830: BmatSpDescr = Bmat->matDescr;
2831: #endif
2832: } else { /* we need to use row offsets for the full matrix */
2833: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2834: Bcsr = new CsrMatrix;
2835: Bcsr->num_rows = B->rmap->n;
2836: Bcsr->num_cols = cBcsr->num_cols;
2837: Bcsr->num_entries = cBcsr->num_entries;
2838: Bcsr->column_indices = cBcsr->column_indices;
2839: Bcsr->values = cBcsr->values;
2840: if (!Bcusp->rowoffsets_gpu) {
2841: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2842: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2843: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2844: }
2845: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2846: mmdata->Bcsr = Bcsr;
2847: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2848: if (Bcsr->num_rows && Bcsr->num_cols) {
2849: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2850: PetscCallCUSPARSE(stat);
2851: }
2852: BmatSpDescr = mmdata->matSpBDescr;
2853: #endif
2854: }
2855: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2856: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2857: /* precompute flops count */
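/* For C = A*B, each stored entry a(i,k) multiplies the whole k-th row of B and contributes 2*nnz(B(k,:)) flops;
for C = A^T*B the estimate pairs the i-th rows of A and B (outer products), giving 2*nnz(A(i,:))*nnz(B(i,:)) per row */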
2858: if (ptype == MATPRODUCT_AB) {
2859: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2860: const PetscInt st = a->i[i];
2861: const PetscInt en = a->i[i + 1];
2862: for (j = st; j < en; j++) {
2863: const PetscInt brow = a->j[j];
2864: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2865: }
2866: }
2867: } else if (ptype == MATPRODUCT_AtB) {
2868: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2869: const PetscInt anzi = a->i[i + 1] - a->i[i];
2870: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2871: flops += (2. * anzi) * bnzi;
2872: }
2873: } else { /* TODO */
2874: flops = 0.;
2875: }
2877: mmdata->flops = flops;
2878: PetscCall(PetscLogGpuTimeBegin());
2880: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2881: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2882: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2883: PetscCallCUSPARSE(stat);
2884: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2885: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2886: {
2887: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2888: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2889: */
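/* Reuse workflow, mirroring the calls below: workEstimation (once to size, once to fill dBuffer1) ->
nnz (sizes C and fills dBuffer2/3/4) -> allocate C column indices/values and set the pointers ->
copy (sizes then fills dBuffer5) -> compute. dBuffer4 and dBuffer5 are needed again by later
cusparseSpGEMMreuse_compute calls, hence they are kept in mmdata while dBuffer1/2/3 are freed here */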
2890: void *dBuffer1 = NULL;
2891: void *dBuffer2 = NULL;
2892: void *dBuffer3 = NULL;
2893: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2894: size_t bufferSize1 = 0;
2895: size_t bufferSize2 = 0;
2896: size_t bufferSize3 = 0;
2897: size_t bufferSize4 = 0;
2898: size_t bufferSize5 = 0;
2900: /* ask bufferSize1 bytes for external memory */
2901: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2902: PetscCallCUSPARSE(stat);
2903: PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2904: /* inspect the matrices A and B to understand the memory requirement for the next step */
2905: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2906: PetscCallCUSPARSE(stat);
2908: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2909: PetscCallCUSPARSE(stat);
2910: PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
2911: PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
2912: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2913: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2914: PetscCallCUSPARSE(stat);
2915: PetscCallCUDA(cudaFree(dBuffer1));
2916: PetscCallCUDA(cudaFree(dBuffer2));
2918: /* get matrix C non-zero entries C_nnz1 */
2919: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2920: c->nz = (PetscInt)C_nnz1;
2921: /* allocate matrix C */
2922: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2923: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2924: Ccsr->values = new THRUSTARRAY(c->nz);
2925: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2926: /* update matC with the new pointers */
2927: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2928: PetscCallCUSPARSE(stat);
2930: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2931: PetscCallCUSPARSE(stat);
2932: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2933: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2934: PetscCallCUSPARSE(stat);
2935: PetscCallCUDA(cudaFree(dBuffer3));
2936: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2937: PetscCallCUSPARSE(stat);
2938: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2939: }
2940: #else
2941: size_t bufSize2;
2942: /* ask bufferSize bytes for external memory */
2943: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2944: PetscCallCUSPARSE(stat);
2945: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2946: /* inspect the matrices A and B to understand the memory requirement for the next step */
2947: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2948: PetscCallCUSPARSE(stat);
2949: /* ask again for bufferSize bytes of external memory */
2950: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2951: PetscCallCUSPARSE(stat);
2952: /* Neither the CUSPARSE documentation nor the API is clear here:
2953: we need both buffers to perform the operations properly!
2954: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2955: it only appears in the workEstimation step, yet it seems to be needed in compute as well, so the address
2956: is probably stored in the descriptor! What a messy API... */
2957: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2958: /* compute the intermediate product of A * B */
2959: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2960: PetscCallCUSPARSE(stat);
2961: /* get matrix C non-zero entries C_nnz1 */
2962: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2963: c->nz = (PetscInt)C_nnz1;
2964: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2965: mmdata->mmBufferSize / 1024));
2966: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2967: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2968: Ccsr->values = new THRUSTARRAY(c->nz);
2969: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2970: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2971: PetscCallCUSPARSE(stat);
2972: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2973: PetscCallCUSPARSE(stat);
2974: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2975: #else
2976: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2977: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2978: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
2979: PetscCallCUSPARSE(stat);
2980: c->nz = cnz;
2981: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2982: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2983: Ccsr->values = new THRUSTARRAY(c->nz);
2984: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2986: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2987: /* with the old gemm interface (removed in CUDA 11.0) we cannot compute the symbolic factorization alone.
2988: I have tried the gemm2 interface (alpha * A * B + beta * D), which allows a symbolic-only pass by passing NULL for values, but it seems quite buggy when
2989: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2990: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2991: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2992: PetscCallCUSPARSE(stat);
2993: #endif
2994: PetscCall(PetscLogGpuFlops(mmdata->flops));
2995: PetscCall(PetscLogGpuTimeEnd());
2996: finalizesym:
2997: c->singlemalloc = PETSC_FALSE;
2998: c->free_a = PETSC_TRUE;
2999: c->free_ij = PETSC_TRUE;
3000: PetscCall(PetscMalloc1(m + 1, &c->i));
3001: PetscCall(PetscMalloc1(c->nz, &c->j));
3002: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3003: PetscInt *d_i = c->i;
3004: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3005: THRUSTINTARRAY jj(Ccsr->column_indices->size());
3006: ii = *Ccsr->row_offsets;
3007: jj = *Ccsr->column_indices;
3008: if (ciscompressed) d_i = c->compressedrow.i;
3009: PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3010: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3011: } else {
3012: PetscInt *d_i = c->i;
3013: if (ciscompressed) d_i = c->compressedrow.i;
3014: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3015: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3016: }
3017: if (ciscompressed) { /* need to expand host row offsets */
3018: PetscInt r = 0;
3019: c->i[0] = 0;
3020: for (k = 0; k < c->compressedrow.nrows; k++) {
3021: const PetscInt next = c->compressedrow.rindex[k];
3022: const PetscInt old = c->compressedrow.i[k];
3023: for (; r < next; r++) c->i[r + 1] = old;
3024: }
3025: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3026: }
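/* Example of the expansion above: with m = 5, compressedrow.nrows = 2, rindex = {1,3} and
compressedrow.i = {0,2,5}, the full row offsets become c->i = {0,0,2,2,5,5}; empty rows simply
repeat the previous offset */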
3027: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3028: PetscCall(PetscMalloc1(m, &c->ilen));
3029: PetscCall(PetscMalloc1(m, &c->imax));
3030: c->maxnz = c->nz;
3031: c->nonzerorowcnt = 0;
3032: c->rmax = 0;
3033: for (k = 0; k < m; k++) {
3034: const PetscInt nn = c->i[k + 1] - c->i[k];
3035: c->ilen[k] = c->imax[k] = nn;
3036: c->nonzerorowcnt += (PetscInt) !!nn;
3037: c->rmax = PetscMax(c->rmax, nn);
3038: }
3039: PetscCall(MatMarkDiagonal_SeqAIJ(C));
3040: PetscCall(PetscMalloc1(c->nz, &c->a));
3041: Ccsr->num_entries = c->nz;
3043: C->nonzerostate++;
3044: PetscCall(PetscLayoutSetUp(C->rmap));
3045: PetscCall(PetscLayoutSetUp(C->cmap));
3046: Ccusp->nonzerostate = C->nonzerostate;
3047: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3048: C->preallocated = PETSC_TRUE;
3049: C->assembled = PETSC_FALSE;
3050: C->was_assembled = PETSC_FALSE;
3051: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3052: mmdata->reusesym = PETSC_TRUE;
3053: C->offloadmask = PETSC_OFFLOAD_GPU;
3054: }
3055: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3056: PetscFunctionReturn(PETSC_SUCCESS);
3057: }
3059: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
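/* Illustrative sketch (not part of this file) of how this dispatch is typically reached from user code,
assuming A and B were created as MATSEQAIJCUSPARSE and the standard MatProduct API is used:

Mat D;
PetscCall(MatProductCreate(A, B, NULL, &D));   // D = op(A)*op(B); operands already live on the GPU
PetscCall(MatProductSetType(D, MATPRODUCT_AB));
PetscCall(MatProductSetFromOptions(D));        // ends up in MatProductSetFromOptions_SeqAIJCUSPARSE below
PetscCall(MatProductSymbolic(D));
PetscCall(MatProductNumeric(D));

Passing -mat_product_algorithm_backend_cpu (or -matmatmult_backend_cpu when going through MatMatMult())
forces the CPU AIJ kernels even when both operands are CUSPARSE matrices, as handled below. */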
3061: /* handles sparse or dense B */
3062: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3063: {
3064: Mat_Product *product = mat->product;
3065: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3067: PetscFunctionBegin;
3068: MatCheckProduct(mat, 1);
3069: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3070: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3071: if (product->type == MATPRODUCT_ABC) {
3072: Ciscusp = PETSC_FALSE;
3073: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3074: }
3075: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3076: PetscBool usecpu = PETSC_FALSE;
3077: switch (product->type) {
3078: case MATPRODUCT_AB:
3079: if (product->api_user) {
3080: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3081: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3082: PetscOptionsEnd();
3083: } else {
3084: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3085: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3086: PetscOptionsEnd();
3087: }
3088: break;
3089: case MATPRODUCT_AtB:
3090: if (product->api_user) {
3091: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3092: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3093: PetscOptionsEnd();
3094: } else {
3095: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3096: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3097: PetscOptionsEnd();
3098: }
3099: break;
3100: case MATPRODUCT_PtAP:
3101: if (product->api_user) {
3102: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3103: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3104: PetscOptionsEnd();
3105: } else {
3106: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3107: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3108: PetscOptionsEnd();
3109: }
3110: break;
3111: case MATPRODUCT_RARt:
3112: if (product->api_user) {
3113: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3114: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3115: PetscOptionsEnd();
3116: } else {
3117: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3118: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3119: PetscOptionsEnd();
3120: }
3121: break;
3122: case MATPRODUCT_ABC:
3123: if (product->api_user) {
3124: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3125: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3126: PetscOptionsEnd();
3127: } else {
3128: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3129: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3130: PetscOptionsEnd();
3131: }
3132: break;
3133: default:
3134: break;
3135: }
3136: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3137: }
3138: /* dispatch */
3139: if (isdense) {
3140: switch (product->type) {
3141: case MATPRODUCT_AB:
3142: case MATPRODUCT_AtB:
3143: case MATPRODUCT_ABt:
3144: case MATPRODUCT_PtAP:
3145: case MATPRODUCT_RARt:
3146: if (product->A->boundtocpu) {
3147: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3148: } else {
3149: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3150: }
3151: break;
3152: case MATPRODUCT_ABC:
3153: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3154: break;
3155: default:
3156: break;
3157: }
3158: } else if (Biscusp && Ciscusp) {
3159: switch (product->type) {
3160: case MATPRODUCT_AB:
3161: case MATPRODUCT_AtB:
3162: case MATPRODUCT_ABt:
3163: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3164: break;
3165: case MATPRODUCT_PtAP:
3166: case MATPRODUCT_RARt:
3167: case MATPRODUCT_ABC:
3168: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3169: break;
3170: default:
3171: break;
3172: }
3173: } else { /* fallback for AIJ */
3174: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3175: }
3176: PetscFunctionReturn(PETSC_SUCCESS);
3177: }
3179: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3180: {
3181: PetscFunctionBegin;
3182: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3183: PetscFunctionReturn(PETSC_SUCCESS);
3184: }
3186: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3187: {
3188: PetscFunctionBegin;
3189: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3190: PetscFunctionReturn(PETSC_SUCCESS);
3191: }
3193: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3194: {
3195: PetscFunctionBegin;
3196: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3197: PetscFunctionReturn(PETSC_SUCCESS);
3198: }
3200: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3201: {
3202: PetscFunctionBegin;
3203: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3204: PetscFunctionReturn(PETSC_SUCCESS);
3205: }
3207: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3208: {
3209: PetscFunctionBegin;
3210: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3211: PetscFunctionReturn(PETSC_SUCCESS);
3212: }
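/* Device helper: one thread per entry scatter-adds the packed values x into y at positions idx,
i.e. y[idx[i]] += x[i] */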
3214: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3215: {
3216: int i = blockIdx.x * blockDim.x + threadIdx.x;
3217: if (i < n) y[idx[i]] += x[i];
3218: }
3220: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3221: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3222: {
3223: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3224: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3225: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3226: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3227: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3228: PetscBool compressed;
3229: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3230: PetscInt nx, ny;
3231: #endif
3233: PetscFunctionBegin;
3234: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported");
3235: if (!a->nz) {
3236: if (yy) PetscCall(VecSeq_CUDA::copy(yy, zz));
3237: else PetscCall(VecSeq_CUDA::set(zz, 0));
3238: PetscFunctionReturn(PETSC_SUCCESS);
3239: }
3240: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3241: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3242: if (!trans) {
3243: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3244: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3245: } else {
3246: if (herm || !A->form_explicit_transpose) {
3247: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3248: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3249: } else {
3250: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3251: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3252: }
3253: }
3254: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3255: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3257: try {
3258: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3259: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3260: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3262: PetscCall(PetscLogGpuTimeBegin());
3263: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3264: /* z = A x + beta y.
3265: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3266: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3267: */
3268: xptr = xarray;
3269: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3270: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3271: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3272: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3273: allocated to accommodate different uses. So we get the length info directly from mat.
3274: */
3275: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3276: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3277: nx = mat->num_cols;
3278: ny = mat->num_rows;
3279: }
3280: #endif
3281: } else {
3282: /* z = A^T x + beta y
3283: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3284: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3285: */
3286: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3287: dptr = zarray;
3288: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3289: if (compressed) { /* Scatter x to work vector */
3290: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3292: thrust::for_each(
3293: #if PetscDefined(HAVE_THRUST_ASYNC)
3294: thrust::cuda::par.on(PetscDefaultCudaStream),
3295: #endif
3296: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3297: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3298: }
3299: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3300: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3301: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3302: nx = mat->num_rows;
3303: ny = mat->num_cols;
3304: }
3305: #endif
3306: }
3308: /* csr_spmv does y = alpha op(A) x + beta y */
3309: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3310: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3311: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3312: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3313: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3314: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3315: PetscCallCUSPARSE(
3316: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3317: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3319: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3320: } else {
3321: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3322: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3323: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3324: }
3326: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3327: matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3328: #else
3329: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3330: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3331: #endif
3332: } else {
3333: if (cusparsestruct->nrows) {
3334: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3335: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3336: #else
3337: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3338: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3339: #endif
3340: }
3341: }
3342: PetscCall(PetscLogGpuTimeEnd());
3344: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3345: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3346: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3347: PetscCall(VecSeq_CUDA::copy(yy, zz)); /* zz = yy */
3348: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3349: PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */
3350: }
3351: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3352: PetscCall(VecSeq_CUDA::set(zz, 0));
3353: }
3355: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3356: if (compressed) {
3357: PetscCall(PetscLogGpuTimeBegin());
3358: /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
3359: and when that event goes out of scope its destructor calls cudaStreamSynchronize() on the stream. One would have to store all events to
3360: prevent that. So I just use a ScatterAdd kernel instead.
3361: */
3362: #if 0
3363: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3364: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3365: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3366: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3367: VecCUDAPlusEquals());
3368: #else
3369: PetscInt n = matstruct->cprowIndices->size();
3370: ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3371: #endif
3372: PetscCall(PetscLogGpuTimeEnd());
3373: }
3374: } else {
3375: if (yy && yy != zz) PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */
3376: }
3377: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3378: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3379: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3380: } catch (char *ex) {
3381: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3382: }
3383: if (yy) {
3384: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3385: } else {
3386: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3387: }
3388: PetscFunctionReturn(PETSC_SUCCESS);
3389: }
3391: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3392: {
3393: PetscFunctionBegin;
3394: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3395: PetscFunctionReturn(PETSC_SUCCESS);
3396: }
3398: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3399: {
3400: PetscObjectState onnz = A->nonzerostate;
3401: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
3403: PetscFunctionBegin;
3404: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3405: if (onnz != A->nonzerostate && cusp->deviceMat) {
3406: PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
3407: PetscCallCUDA(cudaFree(cusp->deviceMat));
3408: cusp->deviceMat = NULL;
3409: }
3410: PetscFunctionReturn(PETSC_SUCCESS);
3411: }
3413: /*@
3414: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATSEQAIJCUSPARSE` (compressed row) format
3415: (the default sequential PETSc format). This matrix will ultimately be pushed down
3416: to NVIDIA GPUs and will use the CuSPARSE library for calculations. For good matrix
3417: assembly performance the user should preallocate the matrix storage by setting
3418: the parameter nz (or the array nnz). By setting these parameters accurately,
3419: performance during matrix assembly can be increased by more than a factor of 50.
3421: Collective
3423: Input Parameters:
3424: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3425: . m - number of rows
3426: . n - number of columns
3427: . nz - number of nonzeros per row (same for all rows)
3428: - nnz - array containing the number of nonzeros in the various rows
3429: (possibly different for each row) or `NULL`
3431: Output Parameter:
3432: . A - the matrix
3434: Level: intermediate
3436: Notes:
3437: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3438: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3439: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3441: If `nnz` is given then `nz` is ignored
3443: The AIJ format, also called
3444: compressed row storage, is fully compatible with standard Fortran
3445: storage. That is, the stored row and column indices can begin at
3446: either one (as in Fortran) or zero. See the users' manual for details.
3448: Specify the preallocated storage with either nz or nnz (not both).
3449: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3450: allocation. For large problems you MUST preallocate memory or you
3451: will get TERRIBLE performance, see the users' manual chapter on matrices.
3453: By default, this format uses inodes (identical nodes) when possible, to
3454: improve numerical efficiency of matrix-vector products and solves. We
3455: search for consecutive rows with the same nonzero structure, thereby
3456: reusing matrix information to achieve increased efficiency.
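   Example Usage:
   A minimal sketch (the size m, the per-row estimate of 5 nonzeros, and the inserted value are placeholders):
.vb
   Mat A;
   PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, m, m, 5, NULL, &A));
   PetscCall(MatSetValue(A, 0, 0, 2.0, INSERT_VALUES));
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
.ve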
3458: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3459: @*/
3460: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3461: {
3462: PetscFunctionBegin;
3463: PetscCall(MatCreate(comm, A));
3464: PetscCall(MatSetSizes(*A, m, n, m, n));
3465: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3466: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3467: PetscFunctionReturn(PETSC_SUCCESS);
3468: }
3470: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3471: {
3472: PetscFunctionBegin;
3473: if (A->factortype == MAT_FACTOR_NONE) {
3474: PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
3475: } else {
3476: PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3477: }
3478: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3479: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3480: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3481: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3482: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3483: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3484: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3485: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3486: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3487: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3488: PetscCall(MatDestroy_SeqAIJ(A));
3489: PetscFunctionReturn(PETSC_SUCCESS);
3490: }
3492: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3493: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3494: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3495: {
3496: PetscFunctionBegin;
3497: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3498: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3499: PetscFunctionReturn(PETSC_SUCCESS);
3500: }
3502: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3503: {
3504: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3505: Mat_SeqAIJCUSPARSE *cy;
3506: Mat_SeqAIJCUSPARSE *cx;
3507: PetscScalar *ay;
3508: const PetscScalar *ax;
3509: CsrMatrix *csry, *csrx;
3511: PetscFunctionBegin;
3512: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3513: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3514: if (X->ops->axpy != Y->ops->axpy) {
3515: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3516: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3517: PetscFunctionReturn(PETSC_SUCCESS);
3518: }
3519: /* if we are here, it means both matrices are bound to GPU */
3520: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3521: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3522: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3523: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3524: csry = (CsrMatrix *)cy->mat->mat;
3525: csrx = (CsrMatrix *)cx->mat->mat;
3526: /* see if we can turn this into a cublas axpy */
3527: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3528: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3529: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3530: if (eq) str = SAME_NONZERO_PATTERN;
3531: }
3532: /* spgeam is buggy with one column */
3533: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3535: if (str == SUBSET_NONZERO_PATTERN) {
3536: PetscScalar b = 1.0;
3537: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3538: size_t bufferSize;
3539: void *buffer;
3540: #endif
3542: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3543: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3544: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3545: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3546: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3547: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3548: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3549: PetscCall(PetscLogGpuTimeBegin());
3550: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3551: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3552: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3553: PetscCall(PetscLogGpuTimeEnd());
3554: PetscCallCUDA(cudaFree(buffer));
3555: #else
3556: PetscCall(PetscLogGpuTimeBegin());
3557: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3558: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3559: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3560: PetscCall(PetscLogGpuTimeEnd());
3561: #endif
3562: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3563: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3564: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3565: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3566: } else if (str == SAME_NONZERO_PATTERN) {
3567: cublasHandle_t cublasv2handle;
3568: PetscBLASInt one = 1, bnz = 1;
3570: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3571: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3572: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3573: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3574: PetscCall(PetscLogGpuTimeBegin());
3575: PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3576: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3577: PetscCall(PetscLogGpuTimeEnd());
3578: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3579: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3580: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3581: } else {
3582: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3583: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3584: }
3585: PetscFunctionReturn(PETSC_SUCCESS);
3586: }
3588: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3589: {
3590: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3591: PetscScalar *ay;
3592: cublasHandle_t cublasv2handle;
3593: PetscBLASInt one = 1, bnz = 1;
3595: PetscFunctionBegin;
3596: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3597: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3598: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3599: PetscCall(PetscLogGpuTimeBegin());
3600: PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3601: PetscCall(PetscLogGpuFlops(bnz));
3602: PetscCall(PetscLogGpuTimeEnd());
3603: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3604: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3605: PetscFunctionReturn(PETSC_SUCCESS);
3606: }
3608: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3609: {
3610: PetscBool both = PETSC_FALSE;
3611: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3613: PetscFunctionBegin;
3614: if (A->factortype == MAT_FACTOR_NONE) {
3615: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3616: if (spptr->mat) {
3617: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3618: if (matrix->values) {
3619: both = PETSC_TRUE;
3620: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3621: }
3622: }
3623: if (spptr->matTranspose) {
3624: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3625: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3626: }
3627: }
3628: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3629: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3630: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3631: else A->offloadmask = PETSC_OFFLOAD_CPU;
3632: PetscFunctionReturn(PETSC_SUCCESS);
3633: }
3635: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3636: {
3637: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3639: PetscFunctionBegin;
3640: if (A->factortype != MAT_FACTOR_NONE) {
3641: A->boundtocpu = flg;
3642: PetscFunctionReturn(PETSC_SUCCESS);
3643: }
3644: if (flg) {
3645: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3647: A->ops->scale = MatScale_SeqAIJ;
3648: A->ops->axpy = MatAXPY_SeqAIJ;
3649: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3650: A->ops->mult = MatMult_SeqAIJ;
3651: A->ops->multadd = MatMultAdd_SeqAIJ;
3652: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3653: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3654: A->ops->multhermitiantranspose = NULL;
3655: A->ops->multhermitiantransposeadd = NULL;
3656: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3657: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3658: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3659: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3660: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3661: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3662: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3663: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3664: } else {
3665: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3666: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3667: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3668: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3669: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3670: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3671: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3672: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3673: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3674: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3675: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3676: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3677: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3678: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3679: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3680: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3681: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
3683: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
3684: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3685: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3686: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
3687: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
3688: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3689: }
3690: A->boundtocpu = flg;
3691: if (flg && a->inode.size) {
3692: a->inode.use = PETSC_TRUE;
3693: } else {
3694: a->inode.use = PETSC_FALSE;
3695: }
3696: PetscFunctionReturn(PETSC_SUCCESS);
3697: }
3699: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
3700: {
3701: Mat B;
3703: PetscFunctionBegin;
3704: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
3705: if (reuse == MAT_INITIAL_MATRIX) {
3706: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3707: } else if (reuse == MAT_REUSE_MATRIX) {
3708: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3709: }
3710: B = *newmat;
3712: PetscCall(PetscFree(B->defaultvectype));
3713: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
3715: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3716: if (B->factortype == MAT_FACTOR_NONE) {
3717: Mat_SeqAIJCUSPARSE *spptr;
3718: PetscCall(PetscNew(&spptr));
3719: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
3720: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
3721: spptr->format = MAT_CUSPARSE_CSR;
3722: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3723: #if CUSPARSE_VERSION > 11301
3724: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3725: #else
3726: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3727: #endif
3728: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3729: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3730: #endif
3731: B->spptr = spptr;
3732: } else {
3733: Mat_SeqAIJCUSPARSETriFactors *spptr;
3735: PetscCall(PetscNew(&spptr));
3736: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
3737: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
3738: B->spptr = spptr;
3739: }
3740: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3741: }
3742: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
3743: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
3744: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
3745: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3746: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
3747: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
3749: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
3750: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
3751: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
3752: #if defined(PETSC_HAVE_HYPRE)
3753: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
3754: #endif
3755: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
3756: PetscFunctionReturn(PETSC_SUCCESS);
3757: }
3759: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3760: {
3761: PetscFunctionBegin;
3762: PetscCall(MatCreate_SeqAIJ(B));
3763: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
3764: PetscFunctionReturn(PETSC_SUCCESS);
3765: }
3767: /*MC
3768: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3770: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in either
3771: CSR, ELL, or Hybrid format.
3772: All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3774: Options Database Keys:
3775: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3776: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3777: Other options include ell (ellpack) or hyb (hybrid).
3778: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3779: - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3781: Level: beginner
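   Example Usage:
   A minimal sketch of selecting this type explicitly (the sizes m and n are placeholders); alternatively, call
   `MatSetFromOptions()` and pass -mat_type aijcusparse on the command line:
.vb
   Mat A;
   PetscCall(MatCreate(PETSC_COMM_SELF, &A));
   PetscCall(MatSetSizes(A, m, n, m, n));
   PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
   PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));
.ve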
3783: .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3784: M*/
3786: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3788: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3789: {
3790: PetscFunctionBegin;
3791: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
3792: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
3793: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
3794: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
3795: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
3797: PetscFunctionReturn(PETSC_SUCCESS);
3798: }
3800: static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3801: {
3802: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;
3804: PetscFunctionBegin;
3805: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3806: delete cusp->cooPerm;
3807: delete cusp->cooPerm_a;
3808: cusp->cooPerm = NULL;
3809: cusp->cooPerm_a = NULL;
3810: if (cusp->use_extended_coo) {
3811: PetscCallCUDA(cudaFree(cusp->jmap_d));
3812: PetscCallCUDA(cudaFree(cusp->perm_d));
3813: }
3814: cusp->use_extended_coo = PETSC_FALSE;
3815: PetscFunctionReturn(PETSC_SUCCESS);
3816: }
3818: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3819: {
3820: PetscFunctionBegin;
3821: if (*cusparsestruct) {
3822: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
3823: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
3824: delete (*cusparsestruct)->workVector;
3825: delete (*cusparsestruct)->rowoffsets_gpu;
3826: delete (*cusparsestruct)->cooPerm;
3827: delete (*cusparsestruct)->cooPerm_a;
3828: delete (*cusparsestruct)->csr2csc_i;
3829: if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
3830: if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
3831: if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
3832: PetscCall(PetscFree(*cusparsestruct));
3833: }
3834: PetscFunctionReturn(PETSC_SUCCESS);
3835: }
3837: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3838: {
3839: PetscFunctionBegin;
3840: if (*mat) {
3841: delete (*mat)->values;
3842: delete (*mat)->column_indices;
3843: delete (*mat)->row_offsets;
3844: delete *mat;
3845: *mat = 0;
3846: }
3847: PetscFunctionReturn(PETSC_SUCCESS);
3848: }
3850: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3851: {
3852: PetscFunctionBegin;
3853: if (*trifactor) {
3854: if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
3855: if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3856: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3857: if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
3858: if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
3859: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3860: if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
3861: #endif
3862: PetscCall(PetscFree(*trifactor));
3863: }
3864: PetscFunctionReturn(PETSC_SUCCESS);
3865: }
3867: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
3868: {
3869: CsrMatrix *mat;
3871: PetscFunctionBegin;
3872: if (*matstruct) {
3873: if ((*matstruct)->mat) {
3874: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3875: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3876: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3877: #else
3878: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3879: PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
3880: #endif
3881: } else {
3882: mat = (CsrMatrix *)(*matstruct)->mat;
3883: PetscCall(CsrMatrix_Destroy(&mat));
3884: }
3885: }
3886: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
3887: delete (*matstruct)->cprowIndices;
3888: if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
3889: if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
3890: if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
3892: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3893: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3894: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
3895: for (int i = 0; i < 3; i++) {
3896: if (mdata->cuSpMV[i].initialized) {
3897: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
3898: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
3899: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
3900: }
3901: }
3902: #endif
3903: delete *matstruct;
3904: *matstruct = NULL;
3905: }
3906: PetscFunctionReturn(PETSC_SUCCESS);
3907: }
3909: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
3910: {
3911: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
3913: PetscFunctionBegin;
3914: if (fs) {
3915: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3916: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3917: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3918: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3919: delete fs->rpermIndices;
3920: delete fs->cpermIndices;
3921: delete fs->workVector;
3922: fs->rpermIndices = NULL;
3923: fs->cpermIndices = NULL;
3924: fs->workVector = NULL;
3925: if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
3926: if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
3927: fs->init_dev_prop = PETSC_FALSE;
3928: #if CUSPARSE_VERSION >= 11500
3929: PetscCallCUDA(cudaFree(fs->csrRowPtr));
3930: PetscCallCUDA(cudaFree(fs->csrColIdx));
3931: PetscCallCUDA(cudaFree(fs->csrVal));
3932: PetscCallCUDA(cudaFree(fs->X));
3933: PetscCallCUDA(cudaFree(fs->Y));
3934: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares memory with one of spsvBuffer_L/U */
3935: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3936: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
3937: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3938: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3939: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3940: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3941: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3942: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3943: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3944: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3945: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3946: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3947: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3948: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3949: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
3951: fs->createdTransposeSpSVDescr = PETSC_FALSE;
3952: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3953: #endif
3954: }
3955: PetscFunctionReturn(PETSC_SUCCESS);
3956: }
3958: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3959: {
3960: PetscFunctionBegin;
3961: if (*trifactors) {
3962: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3963: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
3964: PetscCall(PetscFree(*trifactors));
3965: }
3966: PetscFunctionReturn(PETSC_SUCCESS);
3967: }
3969: struct IJCompare {
3970: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3971: {
3972: if (t1.get<0>() < t2.get<0>()) return true;
3973: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3974: return false;
3975: }
3976: };
3978: struct IJEqual {
3979: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3980: {
3981: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3982: return true;
3983: }
3984: };
3986: struct IJDiff {
3987: __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3988: };
3990: struct IJSum {
3991: __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3992: };
3994: #include <thrust/iterator/discard_iterator.h>
3995: /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
3996: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3997: {
3998: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
3999: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4000: THRUSTARRAY *cooPerm_v = NULL;
4001: thrust::device_ptr<const PetscScalar> d_v;
4002: CsrMatrix *matrix;
4003: PetscInt n;
4005: PetscFunctionBegin;
4006: PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
4007: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
4008: if (!cusp->cooPerm) {
4009: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
4010: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
4011: PetscFunctionReturn(PETSC_SUCCESS);
4012: }
4013: matrix = (CsrMatrix *)cusp->mat->mat;
4014: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4015: if (!v) {
4016: if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4017: goto finalize;
4018: }
4019: n = cusp->cooPerm->size();
4020: if (isCudaMem(v)) {
4021: d_v = thrust::device_pointer_cast(v);
4022: } else {
4023: cooPerm_v = new THRUSTARRAY(n);
4024: cooPerm_v->assign(v, v + n);
4025: d_v = cooPerm_v->data();
4026: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4027: }
4028: PetscCall(PetscLogGpuTimeBegin());
4029: if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4030: if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them first */
4031: THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4032: auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4033: /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4034: cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
4035: cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4036: */
4037: thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4038: thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
4039: delete cooPerm_w;
4040: } else {
4041: /* all nonzeros in d_v[] are unique entries */
4042: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4043: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4044: thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
4045: }
4046: } else {
4047: if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4048: auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4049: thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4050: } else {
4051: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4052: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4053: thrust::for_each(zibit, zieit, VecCUDAEquals());
4054: }
4055: }
4056: PetscCall(PetscLogGpuTimeEnd());
4057: finalize:
4058: delete cooPerm_v;
4059: A->offloadmask = PETSC_OFFLOAD_GPU;
4060: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4061: /* shorter version of MatAssemblyEnd_SeqAIJ */
4062: PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
4063: PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
4064: PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
4065: a->reallocs = 0;
4066: A->info.mallocs += 0;
4067: A->info.nz_unneeded = 0;
4068: A->assembled = A->was_assembled = PETSC_TRUE;
4069: A->num_ass++;
4070: PetscFunctionReturn(PETSC_SUCCESS);
4071: }
4073: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4074: {
4075: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4077: PetscFunctionBegin;
4078: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4079: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4080: if (destroy) {
4081: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4082: delete cusp->csr2csc_i;
4083: cusp->csr2csc_i = NULL;
4084: }
4085: A->transupdated = PETSC_FALSE;
4086: PetscFunctionReturn(PETSC_SUCCESS);
4087: }
4089: #include <thrust/binary_search.h>
4090: /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4091: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4092: {
4093: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4094: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4095: PetscInt cooPerm_n, nzr = 0;
4097: PetscFunctionBegin;
4098: PetscCall(PetscLayoutSetUp(A->rmap));
4099: PetscCall(PetscLayoutSetUp(A->cmap));
4100: cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4101: if (n != cooPerm_n) {
4102: delete cusp->cooPerm;
4103: delete cusp->cooPerm_a;
4104: cusp->cooPerm = NULL;
4105: cusp->cooPerm_a = NULL;
4106: }
4107: if (n) {
4108: thrust::device_ptr<PetscInt> d_i, d_j;
4109: PetscInt *d_raw_i, *d_raw_j;
4110: PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4111: PetscMemType imtype, jmtype;
4113: PetscCall(PetscGetMemType(coo_i, &imtype));
4114: if (PetscMemTypeHost(imtype)) {
4115: PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4116: PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4117: d_i = thrust::device_pointer_cast(d_raw_i);
4118: free_raw_i = PETSC_TRUE;
4119: PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4120: } else {
4121: d_i = thrust::device_pointer_cast(coo_i);
4122: }
4124: PetscCall(PetscGetMemType(coo_j, &jmtype));
4125: if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4126: PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4127: PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4128: d_j = thrust::device_pointer_cast(d_raw_j);
4129: free_raw_j = PETSC_TRUE;
4130: PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4131: } else {
4132: d_j = thrust::device_pointer_cast(coo_j);
4133: }
4135: THRUSTINTARRAY ii(A->rmap->n);
4137: if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4138: if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
4140: /* Ex.
4141: n = 6
4142: coo_i = [3,3,1,4,1,4]
4143: coo_j = [3,2,2,5,2,6]
4144: */
4145: auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4146: auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));
4148: PetscCall(PetscLogGpuTimeBegin());
4149: thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4150: thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4151: (*cusp->cooPerm_a).assign(d_i, d_i + n); /* copy the sorted array */
4152: THRUSTINTARRAY w(d_j, d_j + n);
4154: /*
4155: d_i = [1,1,3,3,4,4]
4156: d_j = [2,2,2,3,5,6]
4157: cooPerm = [2,4,1,0,3,5]
4158: */
4159: auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4161: /*
4162: d_i = [1,3,3,4,4,x]
4163: ^ekey
4164: d_j = [2,2,3,5,6,x]
4165: ^nekey
4166: */
4167: if (nekey == ekey) { /* all entries are unique */
4168: delete cusp->cooPerm_a;
4169: cusp->cooPerm_a = NULL;
4170: } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4171: /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4172: adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4173: adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4174: (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4175: w[0] = 0;
4176: thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
4177: thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4178: }
4179: thrust::counting_iterator<PetscInt> search_begin(0);
4180: thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search each of 0,1,...,A->rmap->n-1 in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4181: search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4182: ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4183: PetscCall(PetscLogGpuTimeEnd());
4185: PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
4186: a->singlemalloc = PETSC_FALSE;
4187: a->free_a = PETSC_TRUE;
4188: a->free_ij = PETSC_TRUE;
4189: PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4190: a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4191: PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4192: a->nz = a->maxnz = a->i[A->rmap->n];
4193: a->rmax = 0;
4194: PetscCall(PetscMalloc1(a->nz, &a->a));
4195: PetscCall(PetscMalloc1(a->nz, &a->j));
4196: PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4197: if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
4198: if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
4199: for (PetscInt i = 0; i < A->rmap->n; i++) {
4200: const PetscInt nnzr = a->i[i + 1] - a->i[i];
4201: nzr += (PetscInt) !!(nnzr);
4202: a->ilen[i] = a->imax[i] = nnzr;
4203: a->rmax = PetscMax(a->rmax, nnzr);
4204: }
4205: a->nonzerorowcnt = nzr;
4206: A->preallocated = PETSC_TRUE;
4207: PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
4208: PetscCall(MatMarkDiagonal_SeqAIJ(A));
4209: if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4210: if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
4211: } else {
4212: PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
4213: }
4214: PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
4216: /* We want to allocate the CUSPARSE struct for matvec now.
4217: The code is so convoluted now that I prefer to copy zeros */
4218: PetscCall(PetscArrayzero(a->a, a->nz));
4219: PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
4220: A->offloadmask = PETSC_OFFLOAD_CPU;
4221: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4222: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
4223: PetscFunctionReturn(PETSC_SUCCESS);
4224: }
4226: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4227: {
4228: Mat_SeqAIJ *seq;
4229: Mat_SeqAIJCUSPARSE *dev;
4230: PetscBool coo_basic = PETSC_TRUE;
4231: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
4233: PetscFunctionBegin;
4234: PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
4235: PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
4236: if (coo_i) {
4237: PetscCall(PetscGetMemType(coo_i, &mtype));
4238: if (PetscMemTypeHost(mtype)) {
4239: for (PetscCount k = 0; k < coo_n; k++) {
4240: if (coo_i[k] < 0 || coo_j[k] < 0) {
4241: coo_basic = PETSC_FALSE;
4242: break;
4243: }
4244: }
4245: }
4246: }
4248: if (coo_basic) { /* i,j are on device or do not contain negative indices */
4249: PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
4250: } else {
4251: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
4252: mat->offloadmask = PETSC_OFFLOAD_CPU;
4253: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4254: seq = static_cast<Mat_SeqAIJ *>(mat->data);
4255: dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4256: PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
4257: PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4258: PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
4259: PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4260: dev->use_extended_coo = PETSC_TRUE;
4261: }
4262: PetscFunctionReturn(PETSC_SUCCESS);
4263: }
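/* Grid-stride loop with one iteration per nonzero i of A: perm[jmap[i]..jmap[i+1]) lists the positions in the
   COO input array kv[] that map to nonzero i; their sum is inserted into (INSERT_VALUES) or added to a[i]. */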
4265: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4266: {
4267: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4268: const PetscCount grid_size = gridDim.x * blockDim.x;
4269: for (; i < nnz; i += grid_size) {
4270: PetscScalar sum = 0.0;
4271: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4272: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4273: }
4274: }
4276: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4277: {
4278: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
4279: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
4280: PetscCount Annz = seq->nz;
4281: PetscMemType memtype;
4282: const PetscScalar *v1 = v;
4283: PetscScalar *Aa;
4285: PetscFunctionBegin;
4286: if (dev->use_extended_coo) {
4287: PetscCall(PetscGetMemType(v, &memtype));
4288: if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
4289: PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
4290: PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4291: }
4293: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4294: else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4296: if (Annz) {
4297: MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
4298: PetscCallCUDA(cudaPeekAtLastError());
4299: }
4301: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4302: else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4304: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4305: } else {
4306: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
4307: }
4308: PetscFunctionReturn(PETSC_SUCCESS);
4309: }
4311: /*@C
4312: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4314: Not Collective
4316: Input Parameters:
4317: + A - the matrix
4318: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4320: Output Parameters:
4321: + ia - the CSR row pointers
4322: - ja - the CSR column indices
4324: Level: developer
4326: Note:
4327: When compressed is true, the CSR structure does not contain empty rows
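   Example Usage:
   A minimal sketch (A is assumed to be an assembled `MATSEQAIJCUSPARSE` matrix):
.vb
   const int *ia, *ja;
   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_TRUE, &ia, &ja));
   /* ia and ja are device pointers; use them in a kernel or with cuSPARSE calls, do not dereference them on the host */
   PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_TRUE, &ia, &ja));
.ve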
4329: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4330: @*/
4331: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4332: {
4333: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4334: CsrMatrix *csr;
4335: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4337: PetscFunctionBegin;
4339: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4340: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4341: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4342: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4343: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4344: csr = (CsrMatrix *)cusp->mat->mat;
4345: if (i) {
4346: if (!compressed && a->compressedrow.use) { /* need full row offset */
4347: if (!cusp->rowoffsets_gpu) {
4348: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4349: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4350: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4351: }
4352: *i = cusp->rowoffsets_gpu->data().get();
4353: } else *i = csr->row_offsets->data().get();
4354: }
4355: if (j) *j = csr->column_indices->data().get();
4356: PetscFunctionReturn(PETSC_SUCCESS);
4357: }
4359: /*@C
4360: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4362: Not Collective
4364: Input Parameters:
4365: + A - the matrix
4366: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4367: . ia - the CSR row pointers
4368: - ja - the CSR column indices
4370: Level: developer
4372: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4373: @*/
4374: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool, const int **i, const int **j)
4375: {
4376: PetscFunctionBegin;
4378: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4379: if (i) *i = NULL;
4380: if (j) *j = NULL;
4381: PetscFunctionReturn(PETSC_SUCCESS);
4382: }
4384: /*@C
4385: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4387: Not Collective
4389: Input Parameter:
4390: . A - a `MATSEQAIJCUSPARSE` matrix
4392: Output Parameter:
4393: . a - pointer to the device data
4395: Level: developer
4397: Note:
4398: May trigger host-device copies if up-to-date matrix data is on host
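   Example Usage:
   A minimal sketch (A is assumed to be an assembled `MATSEQAIJCUSPARSE` matrix):
.vb
   const PetscScalar *val;
   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &val));
   /* val points to the CSR values on the device; pass it to a kernel, do not dereference it on the host */
   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &val));
.ve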
4400: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4401: @*/
4402: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4403: {
4404: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4405: CsrMatrix *csr;
4407: PetscFunctionBegin;
4410: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4411: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4412: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4413: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4414: csr = (CsrMatrix *)cusp->mat->mat;
4415: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4416: *a = csr->values->data().get();
4417: PetscFunctionReturn(PETSC_SUCCESS);
4418: }
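/* A minimal usage sketch (assumes A is an assembled MATSEQAIJCUSPARSE matrix and nnz
   holds its number of stored entries): read the device value array, e.g. to feed it to
   a user reduction kernel, without marking the matrix as modified.

     const PetscScalar *aa;
     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &aa));
     // aa points to nnz scalars in device memory; pass it to device-side read-only code
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &aa));
*/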
4420: /*@C
4421: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4423: Not Collective
4425: Input Parameters:
4426: + A - a `MATSEQAIJCUSPARSE` matrix
4427: - a - pointer to the device data
4429: Level: developer
4431: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4432: @*/
4433: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4434: {
4435: PetscFunctionBegin;
4438: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4439: *a = NULL;
4440: PetscFunctionReturn(PETSC_SUCCESS);
4441: }
4443: /*@C
4444: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4446: Not Collective
4448: Input Parameter:
4449: . A - a `MATSEQAIJCUSPARSE` matrix
4451: Output Parameter:
4452: . a - pointer to the device data
4454: Level: developer
4456: Note:
4457: May trigger a host-to-device copy if the up-to-date matrix data is on the host
4459: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4460: @*/
4461: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4462: {
4463: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4464: CsrMatrix *csr;
4466: PetscFunctionBegin;
4469: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4470: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4471: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4472: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4473: csr = (CsrMatrix *)cusp->mat->mat;
4474: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4475: *a = csr->values->data().get();
4476: A->offloadmask = PETSC_OFFLOAD_GPU;
4477: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4478: PetscFunctionReturn(PETSC_SUCCESS);
4479: }
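/* A minimal usage sketch (ScaleValues is a hypothetical user kernel, nnz the number of
   stored entries): modify the device values in place; the Restore call increases the
   object state and invalidates the cached diagonal so PETSc sees the change.

     PetscScalar *aa;
     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &aa));
     // ScaleValues<<<(nnz + 255) / 256, 256>>>(nnz, alpha, aa);
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &aa));
*/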
4480: /*@C
4481: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4483: Not Collective
4485: Input Parameters:
4486: + A - a `MATSEQAIJCUSPARSE` matrix
4487: - a - pointer to the device data
4489: Level: developer
4491: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4492: @*/
4493: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4494: {
4495: PetscFunctionBegin;
4498: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4499: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4500: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4501: *a = NULL;
4502: PetscFunctionReturn(PETSC_SUCCESS);
4503: }
4505: /*@C
4506: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4508: Not Collective
4510: Input Parameter:
4511: . A - a `MATSEQAIJCUSPARSE` matrix
4513: Output Parameter:
4514: . a - pointer to the device data
4516: Level: developer
4518: Note:
4519: Does not trigger host-to-device copies and marks the GPU data as the valid copy
4521: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4522: @*/
4523: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4524: {
4525: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4526: CsrMatrix *csr;
4528: PetscFunctionBegin;
4531: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4532: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4533: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4534: csr = (CsrMatrix *)cusp->mat->mat;
4535: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4536: *a = csr->values->data().get();
4537: A->offloadmask = PETSC_OFFLOAD_GPU;
4538: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4539: PetscFunctionReturn(PETSC_SUCCESS);
4540: }
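/* A minimal usage sketch (nnz, the number of stored entries, assumed known): overwrite
   every stored value without first copying possibly stale host data to the GPU, e.g.
   zero the value array.

     PetscScalar *aa;
     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &aa));
     PetscCallCUDA(cudaMemset(aa, 0, nnz * sizeof(PetscScalar)));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &aa));
*/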
4542: /*@C
4543: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4545: Not Collective
4547: Input Parameters:
4548: + A - a `MATSEQAIJCUSPARSE` matrix
4549: - a - pointer to the device data
4551: Level: developer
4553: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4554: @*/
4555: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4556: {
4557: PetscFunctionBegin;
4560: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4561: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4562: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4563: *a = NULL;
4564: PetscFunctionReturn(PETSC_SUCCESS);
4565: }
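/* Comparator used by the thrust::merge in MatSeqAIJCUSPARSEMergeMats() below: orders
   (row, col, value, flag) tuples lexicographically by (row, col); the value and the
   A/B origin flag are carried along unchanged */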
4567: struct IJCompare4 {
4568: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4569: {
4570: if (t1.get<0>() < t2.get<0>()) return true;
4571: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4572: return false;
4573: }
4574: };
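/* Unary functor that adds a fixed offset to an index; used below to shift B's column
   indices (and transposed row offsets) when concatenating with A */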
4576: struct Shift {
4577: int _shift;
4579: Shift(int shift) : _shift(shift) { }
4580: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4581: };
4583: /* Merges two SeqAIJCUSPARSE matrices A and B by concatenating their rows side by side, producing C = [A, B] (the [A';B']' operation in MATLAB notation) */
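/* A minimal usage sketch (A and B are assembled MATSEQAIJCUSPARSE matrices with the same
   number of rows): the first call builds C = [A, B]; a later call with MAT_REUSE_MATRIX
   keeps C's sparsity pattern and only refreshes its values from A and B.

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C));
     // ... update the values of A and/or B ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));
     PetscCall(MatDestroy(&C));
*/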
4584: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4585: {
4586: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4587: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4588: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4589: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4590: PetscInt Annz, Bnnz;
4591: cusparseStatus_t stat;
4592: PetscInt i, m, n, zero = 0;
4594: PetscFunctionBegin;
4598: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4599: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4600: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4601: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4602: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4603: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4604: if (reuse == MAT_INITIAL_MATRIX) {
4605: m = A->rmap->n;
4606: n = A->cmap->n + B->cmap->n;
4607: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4608: PetscCall(MatSetSizes(*C, m, n, m, n));
4609: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4610: c = (Mat_SeqAIJ *)(*C)->data;
4611: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4612: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4613: Ccsr = new CsrMatrix;
4614: Cmat->cprowIndices = NULL;
4615: c->compressedrow.use = PETSC_FALSE;
4616: c->compressedrow.nrows = 0;
4617: c->compressedrow.i = NULL;
4618: c->compressedrow.rindex = NULL;
4619: Ccusp->workVector = NULL;
4620: Ccusp->nrows = m;
4621: Ccusp->mat = Cmat;
4622: Ccusp->mat->mat = Ccsr;
4623: Ccsr->num_rows = m;
4624: Ccsr->num_cols = n;
4625: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4626: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4627: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4628: PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4629: PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4630: PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4631: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4632: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4633: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4634: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4635: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4636: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4637: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4639: Acsr = (CsrMatrix *)Acusp->mat->mat;
4640: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4641: Annz = (PetscInt)Acsr->column_indices->size();
4642: Bnnz = (PetscInt)Bcsr->column_indices->size();
4643: c->nz = Annz + Bnnz;
4644: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4645: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4646: Ccsr->values = new THRUSTARRAY(c->nz);
4647: Ccsr->num_entries = c->nz;
4648: Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4649: if (c->nz) {
4650: auto Acoo = new THRUSTINTARRAY32(Annz);
4651: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4652: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4653: THRUSTINTARRAY32 *Aroff, *Broff;
4655: if (a->compressedrow.use) { /* need full row offset */
4656: if (!Acusp->rowoffsets_gpu) {
4657: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4658: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4659: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4660: }
4661: Aroff = Acusp->rowoffsets_gpu;
4662: } else Aroff = Acsr->row_offsets;
4663: if (b->compressedrow.use) { /* need full row offset */
4664: if (!Bcusp->rowoffsets_gpu) {
4665: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4666: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4667: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4668: }
4669: Broff = Bcusp->rowoffsets_gpu;
4670: } else Broff = Bcsr->row_offsets;
4671: PetscCall(PetscLogGpuTimeBegin());
4672: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4673: PetscCallCUSPARSE(stat);
4674: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4675: PetscCallCUSPARSE(stat);
4676: /* Use 1/0 int markers instead of bool: there are issues using bool with large matrices on SUMMIT with CUDA 10.2.89 */
4677: auto Aperm = thrust::make_constant_iterator(1);
4678: auto Bperm = thrust::make_constant_iterator(0);
4679: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4680: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4681: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4682: #else
4683: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4684: auto Bcib = Bcsr->column_indices->begin();
4685: auto Bcie = Bcsr->column_indices->end();
4686: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4687: #endif
4688: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4689: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4690: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4691: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4692: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4693: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4694: auto p1 = Ccusp->cooPerm->begin();
4695: auto p2 = Ccusp->cooPerm->begin();
4696: thrust::advance(p2, Annz);
4697: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4698: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4699: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4700: #endif
4701: auto cci = thrust::make_counting_iterator(zero);
4702: auto cce = thrust::make_counting_iterator(c->nz);
4703: #if 0 // Errors on SUMMIT with CUDA 11.1.0
4704: PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4705: #else
4706: auto pred = thrust::identity<int>();
4707: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4708: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4709: #endif
4710: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4711: PetscCallCUSPARSE(stat);
4712: PetscCall(PetscLogGpuTimeEnd());
4713: delete wPerm;
4714: delete Acoo;
4715: delete Bcoo;
4716: delete Ccoo;
4717: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4718: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4719: PetscCallCUSPARSE(stat);
4720: #endif
4721: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have explicit transposes, generate C's transpose too */
4722: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4723: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4724: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4725: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4726: CsrMatrix *CcsrT = new CsrMatrix;
4727: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4728: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4730: (*C)->form_explicit_transpose = PETSC_TRUE;
4731: (*C)->transupdated = PETSC_TRUE;
4732: Ccusp->rowoffsets_gpu = NULL;
4733: CmatT->cprowIndices = NULL;
4734: CmatT->mat = CcsrT;
4735: CcsrT->num_rows = n;
4736: CcsrT->num_cols = m;
4737: CcsrT->num_entries = c->nz;
4739: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4740: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4741: CcsrT->values = new THRUSTARRAY(c->nz);
4743: PetscCall(PetscLogGpuTimeBegin());
4744: auto rT = CcsrT->row_offsets->begin();
4745: if (AT) {
4746: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4747: thrust::advance(rT, -1);
4748: }
4749: if (BT) {
4750: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4751: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4752: thrust::copy(titb, tite, rT);
4753: }
4754: auto cT = CcsrT->column_indices->begin();
4755: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4756: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4757: auto vT = CcsrT->values->begin();
4758: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4759: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4760: PetscCall(PetscLogGpuTimeEnd());
4762: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4763: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4764: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4765: PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
4766: PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
4767: PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
4768: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4769: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4770: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4771: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4772: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4773: PetscCallCUSPARSE(stat);
4774: #endif
4775: Ccusp->matTranspose = CmatT;
4776: }
4777: }
4779: c->singlemalloc = PETSC_FALSE;
4780: c->free_a = PETSC_TRUE;
4781: c->free_ij = PETSC_TRUE;
4782: PetscCall(PetscMalloc1(m + 1, &c->i));
4783: PetscCall(PetscMalloc1(c->nz, &c->j));
4784: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4785: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4786: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4787: ii = *Ccsr->row_offsets;
4788: jj = *Ccsr->column_indices;
4789: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4790: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4791: } else {
4792: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4793: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4794: }
4795: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4796: PetscCall(PetscMalloc1(m, &c->ilen));
4797: PetscCall(PetscMalloc1(m, &c->imax));
4798: c->maxnz = c->nz;
4799: c->nonzerorowcnt = 0;
4800: c->rmax = 0;
4801: for (i = 0; i < m; i++) {
4802: const PetscInt nn = c->i[i + 1] - c->i[i];
4803: c->ilen[i] = c->imax[i] = nn;
4804: c->nonzerorowcnt += (PetscInt) !!nn;
4805: c->rmax = PetscMax(c->rmax, nn);
4806: }
4807: PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4808: PetscCall(PetscMalloc1(c->nz, &c->a));
4809: (*C)->nonzerostate++;
4810: PetscCall(PetscLayoutSetUp((*C)->rmap));
4811: PetscCall(PetscLayoutSetUp((*C)->cmap));
4812: Ccusp->nonzerostate = (*C)->nonzerostate;
4813: (*C)->preallocated = PETSC_TRUE;
4814: } else {
4815: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4816: c = (Mat_SeqAIJ *)(*C)->data;
4817: if (c->nz) {
4818: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4819: PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4820: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4821: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4822: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4823: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4824: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4825: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4826: Acsr = (CsrMatrix *)Acusp->mat->mat;
4827: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4828: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4829: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4830: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4831: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4832: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4833: PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4834: auto pmid = Ccusp->cooPerm->begin();
4835: thrust::advance(pmid, Acsr->num_entries);
4836: PetscCall(PetscLogGpuTimeBegin());
4837: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
4838: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4839: thrust::for_each(zibait, zieait, VecCUDAEquals());
4840: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4841: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4842: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4843: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4844: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4845: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4846: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4847: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4848: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4849: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4850: auto vT = CcsrT->values->begin();
4851: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4852: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4853: (*C)->transupdated = PETSC_TRUE;
4854: }
4855: PetscCall(PetscLogGpuTimeEnd());
4856: }
4857: }
4858: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4859: (*C)->assembled = PETSC_TRUE;
4860: (*C)->was_assembled = PETSC_FALSE;
4861: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4862: PetscFunctionReturn(PETSC_SUCCESS);
4863: }
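/* Gathers the values stored at the CSR positions listed in idx[] from A's device value
   array into v[]; v may live in host or device memory (detected with isCudaMem()). When
   idx is NULL, the first n stored values are copied directly. */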
4865: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4866: {
4867: bool dmem;
4868: const PetscScalar *av;
4870: PetscFunctionBegin;
4871: dmem = isCudaMem(v);
4872: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4873: if (n && idx) {
4874: THRUSTINTARRAY widx(n);
4875: widx.assign(idx, idx + n);
4876: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4878: THRUSTARRAY *w = NULL;
4879: thrust::device_ptr<PetscScalar> dv;
4880: if (dmem) {
4881: dv = thrust::device_pointer_cast(v);
4882: } else {
4883: w = new THRUSTARRAY(n);
4884: dv = w->data();
4885: }
4886: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4888: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4889: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4890: thrust::for_each(zibit, zieit, VecCUDAEquals());
4891: if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4892: delete w;
4893: } else {
4894: PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4895: }
4896: if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
4897: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4898: PetscFunctionReturn(PETSC_SUCCESS);
4899: }