Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 14: #include <thrust/adjacent_difference.h>
 15: #if PETSC_CPP_VERSION >= 14
 16:   #define PETSC_HAVE_THRUST_ASYNC 1
 17:   // thrust::for_each(thrust::cuda::par.on()) requires C++14
 18:   #include <thrust/async/for_each.h>
 19: #endif
 20: #include <thrust/iterator/constant_iterator.h>
 21: #include <thrust/remove.h>
 22: #include <thrust/sort.h>
 23: #include <thrust/unique.h>

 25: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 26: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
 27: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
 28:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

 30:   typedef enum {
 31:       CUSPARSE_MV_ALG_DEFAULT = 0,
 32:       CUSPARSE_COOMV_ALG      = 1,
 33:       CUSPARSE_CSRMV_ALG1     = 2,
 34:       CUSPARSE_CSRMV_ALG2     = 3
 35:   } cusparseSpMVAlg_t;

 37:   typedef enum {
 38:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 39:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 40:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 41:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 42:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 43:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 44:       CUSPARSE_SPMM_COO_ALG1    = 1,
 45:       CUSPARSE_SPMM_COO_ALG2    = 2,
 46:       CUSPARSE_SPMM_COO_ALG3    = 3,
 47:       CUSPARSE_SPMM_COO_ALG4    = 5,
 48:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 49:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 50:   } cusparseSpMMAlg_t;

 52:   typedef enum {
 53:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 54:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 55:   } cusparseCsr2CscAlg_t;
 56:   */
 57: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 58: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 59: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
 60: #endif

 62: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 63: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 64: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

 66: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 67: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 68: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

 70: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
 71: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 72: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 73: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 74: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
 75: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 76: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 77: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 78: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 79: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 80: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 81: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 82: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 83: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 85: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 86: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
 87: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 88: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 89: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

 91: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 92: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 94: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 95: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 96: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

 98: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 99: {
100:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

102:   PetscFunctionBegin;
103:   switch (op) {
104:   case MAT_CUSPARSE_MULT:
105:     cusparsestruct->format = format;
106:     break;
107:   case MAT_CUSPARSE_ALL:
108:     cusparsestruct->format = format;
109:     break;
110:   default:
111:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
112:   }
113:   PetscFunctionReturn(PETSC_SUCCESS);
114: }

116: /*@
117:    MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
118:    operation. Only the `MatMult()` operation can use different GPU storage formats.

120:    Not Collective

122:    Input Parameters:
123: +  A - Matrix of type `MATSEQAIJCUSPARSE`
124: .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
125:         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
126: -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`).

128:    Level: intermediate

130: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
131: @*/
132: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
133: {
134:   PetscFunctionBegin;
136:   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
137:   PetscFunctionReturn(PETSC_SUCCESS);
138: }
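
/*
   A minimal usage sketch for MatCUSPARSESetFormat() (the diagonal test matrix and the choice of the
   ELL format below are illustrative assumptions; CSR is the default, and ELL/HYB support depends on
   the CUDA version):

     Mat      A;
     PetscInt i, n = 3;

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSeqAIJSetPreallocation(A, 1, NULL));
     for (i = 0; i < n; i++) PetscCall(MatSetValue(A, i, i, 2.0, INSERT_VALUES));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL)); // affects SpMV only
     PetscCall(MatDestroy(&A));

   The same effect is available from the command line via -mat_cusparse_mult_storage_format ell.
*/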

140: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
141: {
142:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

144:   PetscFunctionBegin;
145:   cusparsestruct->use_cpu_solve = use_cpu;
146:   PetscFunctionReturn(PETSC_SUCCESS);
147: }

149: /*@
150:    MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

152:    Input Parameters:
153: +  A - Matrix of type `MATSEQAIJCUSPARSE`
154: -  use_cpu - flag indicating whether to use the built-in CPU `MatSolve()`

156:    Level: intermediate

158:    Note:
159:    The cuSPARSE LU solver currently computes the factors with the built-in CPU method
160:    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
161:    This routine specifies whether the solve is done on the CPU or the GPU (the GPU is the default).

163: .seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
164: @*/
165: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
166: {
167:   PetscFunctionBegin;
169:   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
170:   PetscFunctionReturn(PETSC_SUCCESS);
171: }
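
/*
   A minimal usage sketch for MatCUSPARSESetUseCPUSolve() (assumes A is a MATSEQAIJCUSPARSE matrix
   that will subsequently be factored, e.g. through an ILU preconditioner):

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE)); // keep the triangular solves on the CPU

   or, from the command line:

     -mat_cusparse_use_cpu_solve
*/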

173: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
174: {
175:   PetscFunctionBegin;
176:   switch (op) {
177:   case MAT_FORM_EXPLICIT_TRANSPOSE:
178:     /* destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
179:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
180:     A->form_explicit_transpose = flg;
181:     break;
182:   default:
183:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
184:     break;
185:   }
186:   PetscFunctionReturn(PETSC_SUCCESS);
187: }
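
/*
   A usage sketch for the option handled above (assumes A is an assembled MATSEQAIJCUSPARSE matrix
   and x, y are compatible vectors; purely illustrative):

     PetscCall(MatSetOption(A, MAT_FORM_EXPLICIT_TRANSPOSE, PETSC_TRUE)); // cache an explicit transpose on the GPU
     PetscCall(MatMultTranspose(A, x, y));                                // repeated transpose products reuse it
*/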

189: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

191: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
192: {
193:   Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
194:   IS                  isrow = b->row, iscol = b->col;
195:   PetscBool           row_identity, col_identity;
196:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

198:   PetscFunctionBegin;
199:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
200:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
201:   B->offloadmask = PETSC_OFFLOAD_CPU;
202:   /* determine which version of MatSolve needs to be used. */
203:   PetscCall(ISIdentity(isrow, &row_identity));
204:   PetscCall(ISIdentity(iscol, &col_identity));

206:   if (!cusparsestruct->use_cpu_solve) {
207:     if (row_identity && col_identity) {
208:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
209:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
210:     } else {
211:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
212:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
213:     }
214:   }
215:   B->ops->matsolve          = NULL;
216:   B->ops->matsolvetranspose = NULL;

218:   /* get the triangular factors */
219:   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
220:   PetscFunctionReturn(PETSC_SUCCESS);
221: }
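
/*
   How this numeric factorization is typically reached (an assumed command line; the executable name
   is a placeholder): with a MATSEQAIJCUSPARSE operator and the cusparse solver type, an ILU
   preconditioner performs the CPU factorization above and, unless -mat_cusparse_use_cpu_solve is
   given, runs the triangular solves on the GPU.

     ./app -mat_type seqaijcusparse -ksp_type gmres -pc_type ilu -pc_factor_mat_solver_type cusparse
*/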

223: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
224: {
225:   MatCUSPARSEStorageFormat format;
226:   PetscBool                flg;
227:   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

229:   PetscFunctionBegin;
230:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
231:   if (A->factortype == MAT_FACTOR_NONE) {
232:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
233:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

235:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
236:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
237:     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
238:     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
239: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
240:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
241:     /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
242:   #if CUSPARSE_VERSION > 11301
243:     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
244:   #else
245:     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
246:   #endif
247:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
248:     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

250:     PetscCall(
251:       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
252:     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
253: #endif
254:   }
255:   PetscOptionsHeadEnd();
256:   PetscFunctionReturn(PETSC_SUCCESS);
257: }
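
/*
   An example of the runtime options parsed above (an assumed command line; the executable name and
   the particular algorithm choices are illustrative, and the last three options require CUDA 11 or
   later, per the guards above):

     ./app -mat_type seqaijcusparse \
           -mat_cusparse_storage_format csr \
           -mat_cusparse_use_cpu_solve \
           -mat_cusparse_spmv_alg csrmv_alg1 \
           -mat_cusparse_spmm_alg csr_alg1 \
           -mat_cusparse_csr2csc_alg alg1
*/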

259: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
260: {
261:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
262:   PetscInt                           n                  = A->rmap->n;
263:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
264:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
265:   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
266:   const MatScalar                   *aa = a->a, *v;
267:   PetscInt                          *AiLo, *AjLo;
268:   PetscInt                           i, nz, nzLower, offset, rowOffset;

270:   PetscFunctionBegin;
271:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
272:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
273:     try {
274:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
275:       nzLower = n + ai[n] - ai[1];
276:       if (!loTriFactor) {
277:         PetscScalar *AALo;

279:         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

281:         /* Allocate Space for the lower triangular matrix */
282:         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
283:         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

285:         /* Fill the lower triangular matrix */
286:         AiLo[0]   = (PetscInt)0;
287:         AiLo[n]   = nzLower;
288:         AjLo[0]   = (PetscInt)0;
289:         AALo[0]   = (MatScalar)1.0;
290:         v         = aa;
291:         vi        = aj;
292:         offset    = 1;
293:         rowOffset = 1;
294:         for (i = 1; i < n; i++) {
295:           nz = ai[i + 1] - ai[i];
296:           /* additional 1 for the term on the diagonal */
297:           AiLo[i] = rowOffset;
298:           rowOffset += nz + 1;

300:           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
301:           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

303:           offset += nz;
304:           AjLo[offset] = (PetscInt)i;
305:           AALo[offset] = (MatScalar)1.0;
306:           offset += 1;

308:           v += nz;
309:           vi += nz;
310:         }

312:         /* allocate space for the triangular factor information */
313:         PetscCall(PetscNew(&loTriFactor));
314:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
315:         /* Create the matrix description */
316:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
317:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
318: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
319:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
320: #else
321:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
322: #endif
323:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
324:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

326:         /* set the operation */
327:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

329:         /* set the matrix */
330:         loTriFactor->csrMat              = new CsrMatrix;
331:         loTriFactor->csrMat->num_rows    = n;
332:         loTriFactor->csrMat->num_cols    = n;
333:         loTriFactor->csrMat->num_entries = nzLower;

335:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
336:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

338:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
339:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

341:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
342:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

344:         /* Create the solve analysis information */
345:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
346:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
347: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
348:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
349:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
350:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
351: #endif

353:         /* perform the solve analysis */
354:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
355:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
356:         PetscCallCUDA(WaitForCUDA());
357:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

359:         /* assign the pointer */
360:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
361:         loTriFactor->AA_h                                          = AALo;
362:         PetscCallCUDA(cudaFreeHost(AiLo));
363:         PetscCallCUDA(cudaFreeHost(AjLo));
364:         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
365:       } else { /* update values only */
366:         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
367:         /* Fill the lower triangular matrix */
368:         loTriFactor->AA_h[0] = 1.0;
369:         v                    = aa;
370:         vi                   = aj;
371:         offset               = 1;
372:         for (i = 1; i < n; i++) {
373:           nz = ai[i + 1] - ai[i];
374:           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
375:           offset += nz;
376:           loTriFactor->AA_h[offset] = 1.0;
377:           offset += 1;
378:           v += nz;
379:         }
380:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
381:         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
382:       }
383:     } catch (char *ex) {
384:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
385:     }
386:   }
387:   PetscFunctionReturn(PETSC_SUCCESS);
388: }
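
/*
   A small worked example of the layout built above (assumed values, not taken from this file):
   for n = 3 with strictly lower factor entries l10, l20 and l21, the unit-diagonal lower factor

         [ 1          ]
     L = [ l10  1     ]
         [ l20  l21 1 ]

   yields nzLower = 6 and the host arrays

     AiLo = [0, 1, 3, 6]
     AjLo = [0,   0, 1,   0,   1,   2]
     AALo = [1, l10, 1, l20, l21,   1]
*/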

390: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
391: {
392:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
393:   PetscInt                           n                  = A->rmap->n;
394:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
395:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
396:   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
397:   const MatScalar                   *aa = a->a, *v;
398:   PetscInt                          *AiUp, *AjUp;
399:   PetscInt                           i, nz, nzUpper, offset;

401:   PetscFunctionBegin;
402:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
403:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
404:     try {
405:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
406:       nzUpper = adiag[0] - adiag[n];
407:       if (!upTriFactor) {
408:         PetscScalar *AAUp;

410:         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

412:         /* Allocate Space for the upper triangular matrix */
413:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
414:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

416:         /* Fill the upper triangular matrix */
417:         AiUp[0] = (PetscInt)0;
418:         AiUp[n] = nzUpper;
419:         offset  = nzUpper;
420:         for (i = n - 1; i >= 0; i--) {
421:           v  = aa + adiag[i + 1] + 1;
422:           vi = aj + adiag[i + 1] + 1;

424:           /* number of elements NOT on the diagonal */
425:           nz = adiag[i] - adiag[i + 1] - 1;

427:           /* decrement the offset */
428:           offset -= (nz + 1);

430:           /* first, set the diagonal elements */
431:           AjUp[offset] = (PetscInt)i;
432:           AAUp[offset] = (MatScalar)1. / v[nz];
433:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

435:           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
436:           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
437:         }

439:         /* allocate space for the triangular factor information */
440:         PetscCall(PetscNew(&upTriFactor));
441:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

443:         /* Create the matrix description */
444:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
445:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
446: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
447:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
448: #else
449:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
450: #endif
451:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
452:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

454:         /* set the operation */
455:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

457:         /* set the matrix */
458:         upTriFactor->csrMat              = new CsrMatrix;
459:         upTriFactor->csrMat->num_rows    = n;
460:         upTriFactor->csrMat->num_cols    = n;
461:         upTriFactor->csrMat->num_entries = nzUpper;

463:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
464:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

466:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
467:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

469:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
470:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

472:         /* Create the solve analysis information */
473:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
474:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
475: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
476:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
477:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
478:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
479: #endif

481:         /* perform the solve analysis */
482:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
483:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

485:         PetscCallCUDA(WaitForCUDA());
486:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

488:         /* assign the pointer */
489:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
490:         upTriFactor->AA_h                                          = AAUp;
491:         PetscCallCUDA(cudaFreeHost(AiUp));
492:         PetscCallCUDA(cudaFreeHost(AjUp));
493:         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
494:       } else {
495:         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
496:         /* Fill the upper triangular matrix */
497:         offset = nzUpper;
498:         for (i = n - 1; i >= 0; i--) {
499:           v = aa + adiag[i + 1] + 1;

501:           /* number of elements NOT on the diagonal */
502:           nz = adiag[i] - adiag[i + 1] - 1;

504:           /* decrement the offset */
505:           offset -= (nz + 1);

507:           /* first, set the diagonal elements */
508:           upTriFactor->AA_h[offset] = 1. / v[nz];
509:           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
510:         }
511:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
512:         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
513:       }
514:     } catch (char *ex) {
515:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
516:     }
517:   }
518:   PetscFunctionReturn(PETSC_SUCCESS);
519: }
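
/*
   A small worked example of the layout built above (assumed values, not taken from this file):
   for n = 2 with the upper factor

     U = [ u00 u01 ]
         [  0  u11 ]

   the rows are visited in reverse order and, because the SeqAIJ factor stores each pivot as its
   reciprocal, the true diagonal is recovered with 1./v[nz], giving nzUpper = 3 and

     AiUp = [0, 2, 3]
     AjUp = [0,   1,   1]
     AAUp = [u00, u01, u11]
*/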

521: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
522: {
523:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
524:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
525:   IS                            isrow = a->row, iscol = a->icol;
526:   PetscBool                     row_identity, col_identity;
527:   PetscInt                      n = A->rmap->n;

529:   PetscFunctionBegin;
530:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
531:   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
532:   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

534:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
535:   cusparseTriFactors->nnz = a->nz;

537:   A->offloadmask = PETSC_OFFLOAD_BOTH;
538:   /* lower triangular indices */
539:   PetscCall(ISIdentity(isrow, &row_identity));
540:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
541:     const PetscInt *r;

543:     PetscCall(ISGetIndices(isrow, &r));
544:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
545:     cusparseTriFactors->rpermIndices->assign(r, r + n);
546:     PetscCall(ISRestoreIndices(isrow, &r));
547:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
548:   }

550:   /* upper triangular indices */
551:   PetscCall(ISIdentity(iscol, &col_identity));
552:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
553:     const PetscInt *c;

555:     PetscCall(ISGetIndices(iscol, &c));
556:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
557:     cusparseTriFactors->cpermIndices->assign(c, c + n);
558:     PetscCall(ISRestoreIndices(iscol, &c));
559:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
560:   }
561:   PetscFunctionReturn(PETSC_SUCCESS);
562: }

564: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
565: {
566:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
567:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
568:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
569:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
570:   PetscInt                          *AiUp, *AjUp;
571:   PetscScalar                       *AAUp;
572:   PetscScalar                       *AALo;
573:   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
574:   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
575:   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
576:   const MatScalar                   *aa = b->a, *v;

578:   PetscFunctionBegin;
579:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
580:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
581:     try {
582:       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
583:       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
584:       if (!upTriFactor && !loTriFactor) {
585:         /* Allocate Space for the upper triangular matrix */
586:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
587:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

589:         /* Fill the upper triangular matrix */
590:         AiUp[0] = (PetscInt)0;
591:         AiUp[n] = nzUpper;
592:         offset  = 0;
593:         for (i = 0; i < n; i++) {
594:           /* set the pointers */
595:           v  = aa + ai[i];
596:           vj = aj + ai[i];
597:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

599:           /* first, set the diagonal elements */
600:           AjUp[offset] = (PetscInt)i;
601:           AAUp[offset] = (MatScalar)1.0 / v[nz];
602:           AiUp[i]      = offset;
603:           AALo[offset] = (MatScalar)1.0 / v[nz];

605:           offset += 1;
606:           if (nz > 0) {
607:             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
608:             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
609:             for (j = offset; j < offset + nz; j++) {
610:               AAUp[j] = -AAUp[j];
611:               AALo[j] = AAUp[j] / v[nz];
612:             }
613:             offset += nz;
614:           }
615:         }

617:         /* allocate space for the triangular factor information */
618:         PetscCall(PetscNew(&upTriFactor));
619:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

621:         /* Create the matrix description */
622:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
623:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
624: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
625:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
626: #else
627:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
628: #endif
629:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
630:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

632:         /* set the matrix */
633:         upTriFactor->csrMat              = new CsrMatrix;
634:         upTriFactor->csrMat->num_rows    = A->rmap->n;
635:         upTriFactor->csrMat->num_cols    = A->cmap->n;
636:         upTriFactor->csrMat->num_entries = a->nz;

638:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
639:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

641:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
642:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

644:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
645:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

647:         /* set the operation */
648:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

650:         /* Create the solve analysis information */
651:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
652:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
653: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
654:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
655:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
656:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
657: #endif

659:         /* perform the solve analysis */
660:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
661:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

663:         PetscCallCUDA(WaitForCUDA());
664:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

666:         /* assign the pointer */
667:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

669:         /* allocate space for the triangular factor information */
670:         PetscCall(PetscNew(&loTriFactor));
671:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

673:         /* Create the matrix description */
674:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
675:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
676: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
677:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
678: #else
679:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
680: #endif
681:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
682:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

684:         /* set the operation */
685:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

687:         /* set the matrix */
688:         loTriFactor->csrMat              = new CsrMatrix;
689:         loTriFactor->csrMat->num_rows    = A->rmap->n;
690:         loTriFactor->csrMat->num_cols    = A->cmap->n;
691:         loTriFactor->csrMat->num_entries = a->nz;

693:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
694:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

696:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
697:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

699:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
700:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

702:         /* Create the solve analysis information */
703:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
704:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
705: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
706:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
707:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
708:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
709: #endif

711:         /* perform the solve analysis */
712:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
713:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

715:         PetscCallCUDA(WaitForCUDA());
716:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

718:         /* assign the pointer */
719:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

721:         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
722:         PetscCallCUDA(cudaFreeHost(AiUp));
723:         PetscCallCUDA(cudaFreeHost(AjUp));
724:       } else {
725:         /* Fill the upper triangular matrix */
726:         offset = 0;
727:         for (i = 0; i < n; i++) {
728:           /* set the pointers */
729:           v  = aa + ai[i];
730:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

732:           /* first, set the diagonal elements */
733:           AAUp[offset] = 1.0 / v[nz];
734:           AALo[offset] = 1.0 / v[nz];

736:           offset += 1;
737:           if (nz > 0) {
738:             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
739:             for (j = offset; j < offset + nz; j++) {
740:               AAUp[j] = -AAUp[j];
741:               AALo[j] = AAUp[j] / v[nz];
742:             }
743:             offset += nz;
744:           }
745:         }
746:         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
747:         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
748:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
749:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
750:         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
751:       }
752:       PetscCallCUDA(cudaFreeHost(AAUp));
753:       PetscCallCUDA(cudaFreeHost(AALo));
754:     } catch (char *ex) {
755:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
756:     }
757:   }
758:   PetscFunctionReturn(PETSC_SUCCESS);
759: }

761: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
762: {
763:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
764:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
765:   IS                            ip                 = a->row;
766:   PetscBool                     perm_identity;
767:   PetscInt                      n = A->rmap->n;

769:   PetscFunctionBegin;
770:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
771:   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
772:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
773:   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

775:   A->offloadmask = PETSC_OFFLOAD_BOTH;

777:   /* lower triangular indices */
778:   PetscCall(ISIdentity(ip, &perm_identity));
779:   if (!perm_identity) {
780:     IS              iip;
781:     const PetscInt *irip, *rip;

783:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
784:     PetscCall(ISGetIndices(iip, &irip));
785:     PetscCall(ISGetIndices(ip, &rip));
786:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
787:     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
788:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
789:     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
790:     PetscCall(ISRestoreIndices(iip, &irip));
791:     PetscCall(ISDestroy(&iip));
792:     PetscCall(ISRestoreIndices(ip, &rip));
793:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
794:   }
795:   PetscFunctionReturn(PETSC_SUCCESS);
796: }

798: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
799: {
800:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
801:   IS          ip = b->row;
802:   PetscBool   perm_identity;

804:   PetscFunctionBegin;
805:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
806:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
807:   B->offloadmask = PETSC_OFFLOAD_CPU;
808:   /* determine which version of MatSolve needs to be used. */
809:   PetscCall(ISIdentity(ip, &perm_identity));
810:   if (perm_identity) {
811:     B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
812:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
813:     B->ops->matsolve          = NULL;
814:     B->ops->matsolvetranspose = NULL;
815:   } else {
816:     B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
817:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
818:     B->ops->matsolve          = NULL;
819:     B->ops->matsolvetranspose = NULL;
820:   }

822:   /* get the triangular factors */
823:   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
824:   PetscFunctionReturn(PETSC_SUCCESS);
825: }

827: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
828: {
829:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
830:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
831:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
832:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
833:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
834:   cusparseIndexBase_t                indexBase;
835:   cusparseMatrixType_t               matrixType;
836:   cusparseFillMode_t                 fillMode;
837:   cusparseDiagType_t                 diagType;

839:   PetscFunctionBegin;
840:   /* allocate space for the transpose of the lower triangular factor */
841:   PetscCall(PetscNew(&loTriFactorT));
842:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

844:   /* set the matrix descriptors of the lower triangular factor */
845:   matrixType = cusparseGetMatType(loTriFactor->descr);
846:   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
847:   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
848:   diagType   = cusparseGetMatDiagType(loTriFactor->descr);

850:   /* Create the matrix description */
851:   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
852:   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
853:   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
854:   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
855:   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

857:   /* set the operation */
858:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

860:   /* allocate GPU space for the CSC of the lower triangular factor*/
861:   loTriFactorT->csrMat                 = new CsrMatrix;
862:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
863:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
864:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
865:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
866:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
867:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

869:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
870: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
871:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
872:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
873:                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
874:   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
875: #endif

877:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
878:   {
879:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
880:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
881:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
882: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
883:                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
884: #else
885:                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
886: #endif
887:     PetscCallCUSPARSE(stat);
888:   }

890:   PetscCallCUDA(WaitForCUDA());
891:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

893:   /* Create the solve analysis information */
894:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
895:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
896: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
897:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
898:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
899:   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
900: #endif

902:   /* perform the solve analysis */
903:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
904:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

906:   PetscCallCUDA(WaitForCUDA());
907:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

909:   /* assign the pointer */
910:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

912:   /*********************************************/
913:   /* Now the Transpose of the Upper Tri Factor */
914:   /*********************************************/

916:   /* allocate space for the transpose of the upper triangular factor */
917:   PetscCall(PetscNew(&upTriFactorT));
918:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

920:   /* set the matrix descriptors of the upper triangular factor */
921:   matrixType = cusparseGetMatType(upTriFactor->descr);
922:   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
923:   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
924:   diagType   = cusparseGetMatDiagType(upTriFactor->descr);

926:   /* Create the matrix description */
927:   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
928:   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
929:   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
930:   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
931:   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

933:   /* set the operation */
934:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

936:   /* allocate GPU space for the CSC of the upper triangular factor*/
937:   upTriFactorT->csrMat                 = new CsrMatrix;
938:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
939:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
940:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
941:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
942:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
943:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

945:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
946: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
947:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
948:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
949:                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
950:   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
951: #endif

953:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
954:   {
955:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
956:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
957:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
958: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
959:                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
960: #else
961:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
962: #endif
963:     PetscCallCUSPARSE(stat);
964:   }

966:   PetscCallCUDA(WaitForCUDA());
967:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

969:   /* Create the solve analysis information */
970:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
971:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
972: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
973:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
974:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
975:   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
976: #endif

978:   /* perform the solve analysis */
979:   /* TODO: this repeated csr2csc + solve-analysis setup should be factored into a helper function */
980:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
981:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

983:   PetscCallCUDA(WaitForCUDA());
984:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

986:   /* assign the pointer */
987:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
988:   PetscFunctionReturn(PETSC_SUCCESS);
989: }
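
/*
   What the csr2csc conversions above compute, shown on a tiny assumed example (not taken from this
   file): the CSC form of a factor is exactly the CSR form of its transpose, which is what the
   transposed triangular solves consume.

         [ 1 0 2 ]                      row_offsets    = [0, 2, 3]
     A = [ 0 3 0 ]    CSR of A:         column_indices = [0, 2, 1]
                                        values         = [1, 2, 3]

                      CSR of A^T (CSC): row_offsets    = [0, 1, 2, 3]
                                        column_indices = [0, 1, 0]
                                        values         = [1, 3, 2]
*/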

991: struct PetscScalarToPetscInt {
992:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
993: };
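
/*
   A minimal sketch of how a functor like PetscScalarToPetscInt is typically applied with Thrust
   (the device vectors below are illustrative assumptions; requires <thrust/device_vector.h> and
   <thrust/transform.h>):

     thrust::device_vector<PetscScalar> s(3, 2.0); // scalars known to hold integral values
     thrust::device_vector<PetscInt>    k(3);
     thrust::transform(s.begin(), s.end(), k.begin(), PetscScalarToPetscInt());
*/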

995: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
996: {
997:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
998:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
999:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1000:   cusparseStatus_t              stat;
1001:   cusparseIndexBase_t           indexBase;

1003:   PetscFunctionBegin;
1004:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1005:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1006:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1007:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1008:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1009:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1010:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1011:   PetscCall(PetscLogGpuTimeBegin());
1012:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1013:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1014:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1015:     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1016:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1017:     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1018:     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

1020:     /* set alpha and beta */
1021:     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
1022:     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
1023:     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1024:     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1025:     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1026:     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

1028:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1029:       CsrMatrix *matrixT      = new CsrMatrix;
1030:       matstructT->mat         = matrixT;
1031:       matrixT->num_rows       = A->cmap->n;
1032:       matrixT->num_cols       = A->rmap->n;
1033:       matrixT->num_entries    = a->nz;
1034:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1035:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1036:       matrixT->values         = new THRUSTARRAY(a->nz);

1038:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1039:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

1041: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1042:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1043:       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1044:                                indexBase, cusparse_scalartype);
1045:       PetscCallCUSPARSE(stat);
1046:   #else
1047:       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1048:            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

1050:            It is unclear what a proper value of matstructT->matDescr would be for an empty matrix, so we set it to
1051:            NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1052:            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so we also set it accordingly.
1053:         */
1054:       if (matrixT->num_entries) {
1055:         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1056:         PetscCallCUSPARSE(stat);

1058:       } else {
1059:         matstructT->matDescr = NULL;
1060:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1061:       }
1062:   #endif
1063: #endif
1064:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1065: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1066:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1067: #else
1068:       CsrMatrix *temp = new CsrMatrix;
1069:       CsrMatrix *tempT = new CsrMatrix;
1070:       /* First convert HYB to CSR */
1071:       temp->num_rows = A->rmap->n;
1072:       temp->num_cols = A->cmap->n;
1073:       temp->num_entries = a->nz;
1074:       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1075:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1076:       temp->values = new THRUSTARRAY(a->nz);

1078:       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1079:       PetscCallCUSPARSE(stat);

1081:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1082:       tempT->num_rows = A->rmap->n;
1083:       tempT->num_cols = A->cmap->n;
1084:       tempT->num_entries = a->nz;
1085:       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1086:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1087:       tempT->values = new THRUSTARRAY(a->nz);

1089:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1090:                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1091:       PetscCallCUSPARSE(stat);

1093:       /* Last, convert CSC to HYB */
1094:       cusparseHybMat_t hybMat;
1095:       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1096:       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1097:       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1098:       PetscCallCUSPARSE(stat);

1100:       /* assign the pointer */
1101:       matstructT->mat = hybMat;
1102:       A->transupdated = PETSC_TRUE;
1103:       /* delete temporaries */
1104:       if (tempT) {
1105:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1106:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1107:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1108:         delete (CsrMatrix *)tempT;
1109:       }
1110:       if (temp) {
1111:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1112:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1113:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1114:         delete (CsrMatrix *)temp;
1115:       }
1116: #endif
1117:     }
1118:   }
1119:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1120:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1121:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1122:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1123:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1124:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1125:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1126:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1127:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1128:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1129:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1130:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1131:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1132:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1133:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1134:     }
1135:     if (!cusparsestruct->csr2csc_i) {
1136:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1137:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
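      /* Sketch of the caching trick: we run csr2csc once on the sequence 0,1,...,nnz-1 (stored as PetscScalar above), so
         that matrixT->values temporarily holds, at each CSC position, the position of the corresponding CSR value. Converting
         those values back to PetscInt (below) yields a gather map csr2csc_i that later value updates can reuse with a plain
         permutation-iterator copy instead of another csr2csc call.
         Small example: for A = [a00 a01; a10 0], the CSR values are [a00 a01 a10] and the CSC (transpose) order is
         [a00 a10 a01], so csr2csc_i = [0 2 1] and matrixT->values[k] = matrix->values[csr2csc_i[k]]. */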

1139:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1140: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1141:       void  *csr2cscBuffer;
1142:       size_t csr2cscBufferSize;
1143:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1144:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1145:       PetscCallCUSPARSE(stat);
1146:       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1147: #endif

1149:       if (matrix->num_entries) {
1150:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1151:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine.
1152:            Every parameter was checked and looked correct; it is unclear why cusparse complains.

1154:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1155:            should be filled with indexBase, so we simply take that shortcut in the else branch below.
1156:         */
1157:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1158: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1159:                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1160:         PetscCallCUSPARSE(stat);
1161: #else
1162:                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1163:         PetscCallCUSPARSE(stat);
1164: #endif
1165:       } else {
1166:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1167:       }

1169:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1170:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1171: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1172:       PetscCallCUDA(cudaFree(csr2cscBuffer));
1173: #endif
1174:     }
1175:     PetscCallThrust(
1176:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1177:   }
1178:   PetscCall(PetscLogGpuTimeEnd());
1179:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1180:   /* the compressed row indices are not used for matTranspose */
1181:   matstructT->cprowIndices = NULL;
1182:   /* assign the pointer */
1183:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1184:   A->transupdated                                = PETSC_TRUE;
1185:   PetscFunctionReturn(PETSC_SUCCESS);
1186: }

1188: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1189: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1190: {
1191:   PetscInt                              n = xx->map->n;
1192:   const PetscScalar                    *barray;
1193:   PetscScalar                          *xarray;
1194:   thrust::device_ptr<const PetscScalar> bGPU;
1195:   thrust::device_ptr<PetscScalar>       xGPU;
1196:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1197:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1198:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1199:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1201:   PetscFunctionBegin;
1202:   /* Analyze the matrix and create the transpose ... on the fly */
1203:   if (!loTriFactorT && !upTriFactorT) {
1204:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1205:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1206:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1207:   }

1209:   /* Get the GPU pointers */
1210:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1211:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1212:   xGPU = thrust::device_pointer_cast(xarray);
1213:   bGPU = thrust::device_pointer_cast(barray);

1215:   PetscCall(PetscLogGpuTimeBegin());
1216:   /* First, reorder with the row permutation */
1217:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

1219:   /* Next, solve U */
1220:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1221:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1223:   /* Then, solve L */
1224:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1225:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1227:   /* Last, copy the solution xGPU into a temporary using the column permutation; this cannot be done in place. */
1228:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

1230:   /* Copy the temporary to the full solution. */
1231:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

1233:   /* restore */
1234:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1235:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1236:   PetscCall(PetscLogGpuTimeEnd());
1237:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1238:   PetscFunctionReturn(PETSC_SUCCESS);
1239: }

1241: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1242: {
1243:   const PetscScalar                 *barray;
1244:   PetscScalar                       *xarray;
1245:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1246:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1247:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1248:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1250:   PetscFunctionBegin;
1251:   /* Analyze the matrix and create the transpose ... on the fly */
1252:   if (!loTriFactorT && !upTriFactorT) {
1253:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1254:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1255:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1256:   }

1258:   /* Get the GPU pointers */
1259:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1260:   PetscCall(VecCUDAGetArrayRead(bb, &barray));

1262:   PetscCall(PetscLogGpuTimeBegin());
1263:   /* First, solve U */
1264:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1265:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1267:   /* Then, solve L */
1268:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1269:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1271:   /* restore */
1272:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1273:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1274:   PetscCall(PetscLogGpuTimeEnd());
1275:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1276:   PetscFunctionReturn(PETSC_SUCCESS);
1277: }

1279: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1280: {
1281:   const PetscScalar                    *barray;
1282:   PetscScalar                          *xarray;
1283:   thrust::device_ptr<const PetscScalar> bGPU;
1284:   thrust::device_ptr<PetscScalar>       xGPU;
1285:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1286:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1287:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1288:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1290:   PetscFunctionBegin;
1291:   /* Get the GPU pointers */
1292:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1293:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1294:   xGPU = thrust::device_pointer_cast(xarray);
1295:   bGPU = thrust::device_pointer_cast(barray);

1297:   PetscCall(PetscLogGpuTimeBegin());
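  /* Sketch of the mapping below: with row/column permutations rperm/cperm from the host factorization, the steps compute
     x = cperm-gather( U^{-1} L^{-1} (rperm-gather(b)) ), i.e., x[k] = (U^{-1} L^{-1} b[rperm])[cperm[k]]. */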
1298:   /* First, reorder with the row permutation */
1299:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

1301:   /* Next, solve L */
1302:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1303:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1305:   /* Then, solve U */
1306:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1307:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1309:   /* Last, reorder with the column permutation */
1310:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

1312:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1313:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1314:   PetscCall(PetscLogGpuTimeEnd());
1315:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1316:   PetscFunctionReturn(PETSC_SUCCESS);
1317: }

1319: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1320: {
1321:   const PetscScalar                 *barray;
1322:   PetscScalar                       *xarray;
1323:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1324:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1325:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1326:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1328:   PetscFunctionBegin;
1329:   /* Get the GPU pointers */
1330:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1331:   PetscCall(VecCUDAGetArrayRead(bb, &barray));

1333:   PetscCall(PetscLogGpuTimeBegin());
1334:   /* First, solve L */
1335:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1336:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1338:   /* Next, solve U */
1339:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1340:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1342:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1343:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1344:   PetscCall(PetscLogGpuTimeEnd());
1345:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1346:   PetscFunctionReturn(PETSC_SUCCESS);
1347: }

1349: #if CUSPARSE_VERSION >= 11500
1350: /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
1351: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1352: {
1353:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1354:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1355:   const PetscScalar            *barray;
1356:   PetscScalar                  *xarray;

1358:   PetscFunctionBegin;
1359:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1360:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1361:   PetscCall(PetscLogGpuTimeBegin());
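  /* The ILU(0) factors are stored in place in fact (unit-diagonal L, non-unit-diagonal U), so applying the preconditioner
     amounts to the two triangular solves below: first L*y = b, then U*x = y. */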

1363:   /* Solve L*y = b */
1364:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1365:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1366:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1367:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
1368:                                        fs->spsvDescr_L)); // cusparseSpSV_solve() secretly reuses the external buffer passed to cusparseSpSV_analysis()!

1370:   /* Solve U*x = y */
1371:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1372:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1373:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

1375:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1376:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

1378:   PetscCall(PetscLogGpuTimeEnd());
1379:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1380:   PetscFunctionReturn(PETSC_SUCCESS);
1381: }

1383: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1384: {
1385:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1386:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1387:   const PetscScalar            *barray;
1388:   PetscScalar                  *xarray;

1390:   PetscFunctionBegin;
1391:   if (!fs->createdTransposeSpSVDescr) { /* First call to MatSolveTranspose(): create the transpose-solve descriptors */
1392:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1393:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1394:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1396:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1397:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1398:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1399:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1400:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1401:   }

1403:   if (!fs->updatedTransposeSpSVAnalysis) {
1404:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1406:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1407:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1408:   }

1410:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1411:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1412:   PetscCall(PetscLogGpuTimeBegin());

1414:   /* Solve Ut*y = b */
1415:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1416:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1417:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1418:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

1420:   /* Solve Lt*x = y */
1421:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1422:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1423:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1425:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1426:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1427:   PetscCall(PetscLogGpuTimeEnd());
1428:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1429:   PetscFunctionReturn(PETSC_SUCCESS);
1430: }

1432: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1433: {
1434:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1435:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1436:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1437:   CsrMatrix                    *Acsr;
1438:   PetscInt                      m, nz;
1439:   PetscBool                     flg;

1441:   PetscFunctionBegin;
1442:   if (PetscDefined(USE_DEBUG)) {
1443:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1444:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1445:   }

1447:   /* Copy A's value to fact */
1448:   m  = fact->rmap->n;
1449:   nz = aij->nz;
1450:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1451:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1452:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1454:   /* Factorize fact inplace */
1455:   if (m)
1456:     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1457:                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1458:   if (PetscDefined(USE_DEBUG)) {
1459:     int              numerical_zero;
1460:     cusparseStatus_t status;
1461:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1462:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1463:   }

1465:   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, so we call it after cusparseXcsrilu02().
1466:      See the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1467:   */
1468:   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1470:   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1472:   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1473:   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

1475:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1476:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1477:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1478:   fact->ops->matsolve          = NULL;
1479:   fact->ops->matsolvetranspose = NULL;
1480:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1481:   PetscFunctionReturn(PETSC_SUCCESS);
1482: }

1484: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1485: {
1486:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1487:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1488:   PetscInt                      m, nz;

1490:   PetscFunctionBegin;
1491:   if (PetscDefined(USE_DEBUG)) {
1492:     PetscInt  i;
1493:     PetscBool flg, missing;

1495:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1496:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1497:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1498:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1499:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1500:   }

1502:   /* Free any stale data left over from a previous factorization */
1503:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

1505:   /* Copy over A's meta data to fact. Note that we also allocate fact's i,j,a on host,
1506:      even though they will not be used; they are kept only to ease debugging.
1507:    */
1508:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1510:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1511:   fact->factortype             = MAT_FACTOR_ILU;
1512:   fact->info.factor_mallocs    = 0;
1513:   fact->info.fill_ratio_given  = info->fill;
1514:   fact->info.fill_ratio_needed = 1.0;

1516:   aij->row = NULL;
1517:   aij->col = NULL;

1519:   /* ====================================================================== */
1520:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1521:   /* We'll do in-place factorization on fact                                */
1522:   /* ====================================================================== */
1523:   const int *Ai, *Aj;

1525:   m  = fact->rmap->n;
1526:   nz = aij->nz;

1528:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1529:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1530:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1531:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1532:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1533:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1535:   /* ====================================================================== */
1536:   /* Create descriptors for M, L, U                                         */
1537:   /* ====================================================================== */
1538:   cusparseFillMode_t fillMode;
1539:   cusparseDiagType_t diagType;

1541:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1542:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1543:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

1545:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1546:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1547:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1548:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1549:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1550:   */
1551:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1552:   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1553:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1554:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1555:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1557:   fillMode = CUSPARSE_FILL_MODE_UPPER;
1558:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1559:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1560:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1561:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1563:   /* ========================================================================= */
1564:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1565:   /* ========================================================================= */
1566:   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1567:   if (m)
1568:     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1569:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

1571:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1572:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1574:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1575:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

1577:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1578:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1580:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1581:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

1583:   /* From experiments with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1584:      and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1585:      spsvBuffer_L and spsvBuffer_U cannot be the same buffer in our case, but factBuffer_M can share storage with either of them.
1586:      To save memory, we let factBuffer_M share with the bigger of spsvBuffer_L/U.
1587:    */
1588:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1589:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1590:     fs->spsvBuffer_L = fs->factBuffer_M;
1591:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1592:   } else {
1593:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1594:     fs->spsvBuffer_U = fs->factBuffer_M;
1595:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1596:   }
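  /* Illustrative sizing example (made-up numbers): with spsvBufferSize_L = 8 MB, spsvBufferSize_U = 5 MB and
     factBufferSize_M = 6 MB, the branch above allocates one 8 MB buffer shared by factBuffer_M and spsvBuffer_L,
     plus a separate 5 MB buffer for spsvBuffer_U. */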

1598:   /* ========================================================================== */
1599:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1600:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1601:   /* ========================================================================== */
1602:   int              structural_zero;
1603:   cusparseStatus_t status;

1605:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1606:   if (m)
1607:     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1608:                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1609:   if (PetscDefined(USE_DEBUG)) {
1610:     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1611:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1612:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1613:   }

1615:   /* Estimate FLOPs of the numeric factorization */
1616:   {
1617:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1618:     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1619:     PetscLogDouble flops = 0.0;

1621:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1622:     Ai    = Aseq->i;
1623:     Adiag = Aseq->diag;
1624:     for (PetscInt i = 0; i < m; i++) {
1625:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1626:         nzRow  = Ai[i + 1] - Ai[i];
1627:         nzLeft = Adiag[i] - Ai[i];
1628:         /* We eliminate the nonzeros left of the diagonal one by one. Assume that each elimination updates the
1629:           nonzeros to the right of (and including) the eliminated entry, each update costing a multiplication and an addition.
1630:         */
1632:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
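        /* Rough derivation of the estimate: eliminating the j-th of the nzLeft left-of-diagonal entries costs one
           division for the multiplier plus a multiply-add for each of roughly (nzRow - j) entries to its right, so
           flops ~= sum_{j=1..nzLeft} (1 + 2*(nzRow - j)) = nzLeft*(2*nzRow - nzLeft), which matches the formula above
           up to a small additive term. */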
1633:       }
1634:     }
1635:     fs->numericFactFlops = flops;
1636:   }
1637:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1638:   PetscFunctionReturn(PETSC_SUCCESS);
1639: }

1641: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1642: {
1643:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1644:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1645:   const PetscScalar            *barray;
1646:   PetscScalar                  *xarray;

1648:   PetscFunctionBegin;
1649:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1650:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1651:   PetscCall(PetscLogGpuTimeBegin());

1653:   /* Solve L*y = b */
1654:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1655:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1656:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1657:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

1659:   /* Solve Lt*x = y */
1660:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1661:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1662:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1664:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1665:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

1667:   PetscCall(PetscLogGpuTimeEnd());
1668:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1669:   PetscFunctionReturn(PETSC_SUCCESS);
1670: }

1672: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1673: {
1674:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1675:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1676:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1677:   CsrMatrix                    *Acsr;
1678:   PetscInt                      m, nz;
1679:   PetscBool                     flg;

1681:   PetscFunctionBegin;
1682:   if (PetscDefined(USE_DEBUG)) {
1683:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1684:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1685:   }

1687:   /* Copy A's value to fact */
1688:   m  = fact->rmap->n;
1689:   nz = aij->nz;
1690:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1691:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1692:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1694:   /* Factorize fact inplace */
1695:   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1696:      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1697:      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1698:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1699:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1700:    */
1701:   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1702:   if (PetscDefined(USE_DEBUG)) {
1703:     int              numerical_zero;
1704:     cusparseStatus_t status;
1705:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1706:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1707:   }

1709:   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1711:   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1712:     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1713:   */
1714:   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1716:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1717:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1718:   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1719:   fact->ops->matsolve          = NULL;
1720:   fact->ops->matsolvetranspose = NULL;
1721:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1722:   PetscFunctionReturn(PETSC_SUCCESS);
1723: }

1725: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1726: {
1727:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1728:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1729:   PetscInt                      m, nz;

1731:   PetscFunctionBegin;
1732:   if (PetscDefined(USE_DEBUG)) {
1733:     PetscInt  i;
1734:     PetscBool flg, missing;

1736:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1737:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1738:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1739:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1740:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1741:   }

1743:   /* Free any stale data left over from a previous factorization */
1744:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

1746:   /* Copy over A's meta data to fact. Note that we also allocate fact's i,j,a on host,
1747:      even though they will not be used; they are kept only to ease debugging.
1748:    */
1749:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1751:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1752:   fact->factortype             = MAT_FACTOR_ICC;
1753:   fact->info.factor_mallocs    = 0;
1754:   fact->info.fill_ratio_given  = info->fill;
1755:   fact->info.fill_ratio_needed = 1.0;

1757:   aij->row = NULL;
1758:   aij->col = NULL;

1760:   /* ====================================================================== */
1761:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1762:   /* We'll do in-place factorization on fact                                */
1763:   /* ====================================================================== */
1764:   const int *Ai, *Aj;

1766:   m  = fact->rmap->n;
1767:   nz = aij->nz;

1769:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1770:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1771:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1772:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1773:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1774:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1776:   /* ====================================================================== */
1777:   /* Create mat descriptors for M, L                                        */
1778:   /* ====================================================================== */
1779:   cusparseFillMode_t fillMode;
1780:   cusparseDiagType_t diagType;

1782:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1783:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1784:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

1786:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1787:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1788:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1789:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1790:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1791:   */
1792:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1793:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1794:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1795:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1796:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1798:   /* ========================================================================= */
1799:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1800:   /* ========================================================================= */
1801:   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
1802:   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

1804:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1805:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1807:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1808:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

1810:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1811:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1813:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1814:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1816:   /* To save device memory, we let the factorization buffer share storage with the bigger of the two solve buffers.
1817:      See also the comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1818:    */
1819:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1820:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1821:     fs->spsvBuffer_L = fs->factBuffer_M;
1822:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1823:   } else {
1824:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1825:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1826:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1827:   }

1829:   /* ========================================================================== */
1830:   /* Perform analysis of ic0 on M                                               */
1831:   /* The lower triangular part of M has the same sparsity pattern as L          */
1832:   /* ========================================================================== */
1833:   int              structural_zero;
1834:   cusparseStatus_t status;

1836:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1837:   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1838:   if (PetscDefined(USE_DEBUG)) {
1839:     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1840:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1841:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1842:   }

1844:   /* Estimate FLOPs of the numeric factorization */
1845:   {
1846:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1847:     PetscInt      *Ai, nzRow, nzLeft;
1848:     PetscLogDouble flops = 0.0;

1850:     Ai = Aseq->i;
1851:     for (PetscInt i = 0; i < m; i++) {
1852:       nzRow = Ai[i + 1] - Ai[i];
1853:       if (nzRow > 1) {
1854:         /* We eliminate the nonzeros left of the diagonal one by one; assume each elimination updates the nonzeros to
1855:           the right of (and including) the eliminated entry, each update costing a multiplication and an addition. Lacking
1856:           the diagonal positions here, we approximate the left-of-diagonal count below by (nzRow - 1) / 2. */
1857:         nzLeft = (nzRow - 1) / 2;
1858:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1859:       }
1860:     }
1861:     fs->numericFactFlops = flops;
1862:   }
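  /* Worked example of the estimate above (illustrative): for a row with nzRow = 5 nonzeros,
     nzLeft = (5 - 1) / 2 = 2, so that row contributes 2 * (2.0 * 5 - 2 + 1) = 18 flops. */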
1863:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1864:   PetscFunctionReturn(PETSC_SUCCESS);
1865: }
1866: #endif

1868: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1869: {
1870:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1872:   PetscFunctionBegin;
1873: #if CUSPARSE_VERSION >= 11500
1874:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1875:   if (cusparseTriFactors->factorizeOnDevice) {
1876:     PetscCall(ISIdentity(isrow, &row_identity));
1877:     PetscCall(ISIdentity(iscol, &col_identity));
1878:   }
1879:   if (!info->levels && row_identity && col_identity) {
1880:     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1881:   } else
1882: #endif
1883:   {
1884:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1885:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1886:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1887:   }
1888:   PetscFunctionReturn(PETSC_SUCCESS);
1889: }

1891: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1892: {
1893:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1895:   PetscFunctionBegin;
1896:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1897:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1898:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1899:   PetscFunctionReturn(PETSC_SUCCESS);
1900: }

1902: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1903: {
1904:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1906:   PetscFunctionBegin;
1907: #if CUSPARSE_VERSION >= 11500
1908:   PetscBool perm_identity = PETSC_FALSE;
1909:   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1910:   if (!info->levels && perm_identity) {
1911:     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1912:   } else
1913: #endif
1914:   {
1915:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1916:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1917:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1918:   }
1919:   PetscFunctionReturn(PETSC_SUCCESS);
1920: }

1922: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1923: {
1924:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1926:   PetscFunctionBegin;
1927:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1928:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1929:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1930:   PetscFunctionReturn(PETSC_SUCCESS);
1931: }

1933: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
1934: {
1935:   PetscFunctionBegin;
1936:   *type = MATSOLVERCUSPARSE;
1937:   PetscFunctionReturn(PETSC_SUCCESS);
1938: }

1940: /*MC
1941:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
1942:   of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
1943:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1944:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1945:   cuSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
1946:   algorithms are not recommended. This class does NOT support direct solver operations.

1948:   Level: beginner

1950: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
1951:           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1952: M*/
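/*
   A minimal usage sketch (illustrative; not part of this file). All calls below are standard PETSc API;
   `ksp` is assumed to be an existing KSP whose operator is a `MATSEQAIJCUSPARSE` matrix:

     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCICC));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));

   or, equivalently, from the command line:

     ./app -mat_type aijcusparse -pc_type icc -pc_factor_mat_solver_type cusparse
*/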

1954: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1955: {
1956:   PetscInt  n = A->rmap->n;
1957:   PetscBool factOnDevice, factOnHost;
1958:   char     *prefix;
1959:   char      factPlace[32] = "device"; /* the default */

1961:   PetscFunctionBegin;
1962:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1963:   PetscCall(MatSetSizes(*B, n, n, n, n));
1964:   (*B)->factortype = ftype;
1965:   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

1967:   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1968:   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
1969:   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
1970:   PetscOptionsEnd();
1971:   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
1972:   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
1973:   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
1974:   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

1976:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1977:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1978:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1979:     if (!A->boundtocpu) {
1980:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1981:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1982:     } else {
1983:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1984:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1985:     }
1986:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1987:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1988:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1989:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1990:     if (!A->boundtocpu) {
1991:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1992:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1993:     } else {
1994:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1995:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1996:     }
1997:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1998:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1999:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

2001:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2002:   (*B)->canuseordering = PETSC_TRUE;
2003:   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2004:   PetscFunctionReturn(PETSC_SUCCESS);
2005: }
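/* Example (illustrative): the -mat_factor_bind_factorization option parsed above selects where the numeric
   factorization is performed; "device" is the default and "host" binds the factorization to the CPU, e.g.

     ./app -pc_type ilu -pc_factor_mat_solver_type cusparse -mat_factor_bind_factorization host
*/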

2007: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2008: {
2009:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2010:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2011: #if CUSPARSE_VERSION >= 13500
2012:   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2013: #endif

2015:   PetscFunctionBegin;
2016:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2017:     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2018:     if (A->factortype == MAT_FACTOR_NONE) {
2019:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2020:       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2021:     }
2022: #if CUSPARSE_VERSION >= 13500
2023:     else if (fs->csrVal) {
2024:       /* We have a factorized matrix on device and are able to copy it to host */
2025:       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2026:     }
2027: #endif
2028:     else
2029:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2030:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2031:     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2032:     A->offloadmask = PETSC_OFFLOAD_BOTH;
2033:   }
2034:   PetscFunctionReturn(PETSC_SUCCESS);
2035: }

2037: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2038: {
2039:   PetscFunctionBegin;
2040:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2041:   *array = ((Mat_SeqAIJ *)A->data)->a;
2042:   PetscFunctionReturn(PETSC_SUCCESS);
2043: }

2045: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2046: {
2047:   PetscFunctionBegin;
2048:   A->offloadmask = PETSC_OFFLOAD_CPU;
2049:   *array         = NULL;
2050:   PetscFunctionReturn(PETSC_SUCCESS);
2051: }

2053: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2054: {
2055:   PetscFunctionBegin;
2056:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2057:   *array = ((Mat_SeqAIJ *)A->data)->a;
2058:   PetscFunctionReturn(PETSC_SUCCESS);
2059: }

2061: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2062: {
2063:   PetscFunctionBegin;
2064:   *array = NULL;
2065:   PetscFunctionReturn(PETSC_SUCCESS);
2066: }

2068: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2069: {
2070:   PetscFunctionBegin;
2071:   *array = ((Mat_SeqAIJ *)A->data)->a;
2072:   PetscFunctionReturn(PETSC_SUCCESS);
2073: }

2075: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2076: {
2077:   PetscFunctionBegin;
2078:   A->offloadmask = PETSC_OFFLOAD_CPU;
2079:   *array         = NULL;
2080:   PetscFunctionReturn(PETSC_SUCCESS);
2081: }

2083: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2084: {
2085:   Mat_SeqAIJCUSPARSE *cusp;
2086:   CsrMatrix          *matrix;

2088:   PetscFunctionBegin;
2089:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2090:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2091:   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2092:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2093:   matrix = (CsrMatrix *)cusp->mat->mat;

2095:   if (i) {
2096: #if !defined(PETSC_USE_64BIT_INDICES)
2097:     *i = matrix->row_offsets->data().get();
2098: #else
2099:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2100: #endif
2101:   }
2102:   if (j) {
2103: #if !defined(PETSC_USE_64BIT_INDICES)
2104:     *j = matrix->column_indices->data().get();
2105: #else
2106:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2107: #endif
2108:   }
2109:   if (a) *a = matrix->values->data().get();
2110:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2111:   PetscFunctionReturn(PETSC_SUCCESS);
2112: }
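/* A usage sketch (illustrative), assuming the public MatSeqAIJGetCSRAndMemType() wrapper dispatches to the
   implementation above. When mtype is PETSC_MEMTYPE_CUDA, the returned CSR arrays are device pointers:

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
     if (mtype == PETSC_MEMTYPE_CUDA) {
       // i, j, a live on the GPU; pass them directly to a CUDA kernel
     }
*/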

2114: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2115: {
2116:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2117:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2118:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2119:   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2120:   cusparseStatus_t              stat;
2121:   PetscBool                     both = PETSC_TRUE;

2123:   PetscFunctionBegin;
2124:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2125:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2126:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2127:       CsrMatrix *matrix;
2128:       matrix = (CsrMatrix *)cusparsestruct->mat->mat;

2130:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2131:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2132:       matrix->values->assign(a->a, a->a + a->nz);
2133:       PetscCallCUDA(WaitForCUDA());
2134:       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2135:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2136:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2137:     } else {
2138:       PetscInt nnz;
2139:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2140:       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2141:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2142:       delete cusparsestruct->workVector;
2143:       delete cusparsestruct->rowoffsets_gpu;
2144:       cusparsestruct->workVector     = NULL;
2145:       cusparsestruct->rowoffsets_gpu = NULL;
2146:       try {
2147:         if (a->compressedrow.use) {
2148:           m    = a->compressedrow.nrows;
2149:           ii   = a->compressedrow.i;
2150:           ridx = a->compressedrow.rindex;
2151:         } else {
2152:           m    = A->rmap->n;
2153:           ii   = a->i;
2154:           ridx = NULL;
2155:         }
2156:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2157:         if (!a->a) {
2158:           nnz  = ii[m];
2159:           both = PETSC_FALSE;
2160:         } else nnz = a->nz;
2161:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

2163:         /* create cusparse matrix */
2164:         cusparsestruct->nrows = m;
2165:         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2166:         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2167:         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2168:         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

2170:         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2171:         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2172:         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2173:         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2174:         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2175:         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2176:         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

2178:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2179:         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2180:           /* set the matrix */
2181:           CsrMatrix *mat   = new CsrMatrix;
2182:           mat->num_rows    = m;
2183:           mat->num_cols    = A->cmap->n;
2184:           mat->num_entries = nnz;
2185:           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2186:           mat->row_offsets->assign(ii, ii + m + 1);

2188:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2189:           mat->column_indices->assign(a->j, a->j + nnz);

2191:           mat->values = new THRUSTARRAY(nnz);
2192:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2194:           /* assign the pointer */
2195:           matstruct->mat = mat;
2196: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2197:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2198:             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2199:                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2200:             PetscCallCUSPARSE(stat);
2201:           }
2202: #endif
2203:         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2204: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2205:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2206: #else
2207:           CsrMatrix *mat = new CsrMatrix;
2208:           mat->num_rows = m;
2209:           mat->num_cols = A->cmap->n;
2210:           mat->num_entries = nnz;
2211:           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2212:           mat->row_offsets->assign(ii, ii + m + 1);

2214:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2215:           mat->column_indices->assign(a->j, a->j + nnz);

2217:           mat->values = new THRUSTARRAY(nnz);
2218:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2220:           cusparseHybMat_t hybMat;
2221:           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2222:           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2223:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2224:           PetscCallCUSPARSE(stat);
2225:           /* assign the pointer */
2226:           matstruct->mat = hybMat;

2228:           if (mat) {
2229:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2230:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2231:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2232:             delete (CsrMatrix *)mat;
2233:           }
2234: #endif
2235:         }

2237:         /* assign the compressed row indices */
2238:         if (a->compressedrow.use) {
2239:           cusparsestruct->workVector = new THRUSTARRAY(m);
2240:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2241:           matstruct->cprowIndices->assign(ridx, ridx + m);
2242:           tmp = m;
2243:         } else {
2244:           cusparsestruct->workVector = NULL;
2245:           matstruct->cprowIndices    = NULL;
2246:           tmp                        = 0;
2247:         }
2248:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

2250:         /* assign the pointer */
2251:         cusparsestruct->mat = matstruct;
2252:       } catch (char *ex) {
2253:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2254:       }
2255:       PetscCallCUDA(WaitForCUDA());
2256:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2257:       cusparsestruct->nonzerostate = A->nonzerostate;
2258:     }
2259:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2260:   }
2261:   PetscFunctionReturn(PETSC_SUCCESS);
2262: }

2264: struct VecCUDAPlusEquals {
2265:   template <typename Tuple>
2266:   __host__ __device__ void operator()(Tuple t)
2267:   {
2268:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2269:   }
2270: };

2272: struct VecCUDAEquals {
2273:   template <typename Tuple>
2274:   __host__ __device__ void operator()(Tuple t)
2275:   {
2276:     thrust::get<1>(t) = thrust::get<0>(t);
2277:   }
2278: };

2280: struct VecCUDAEqualsReverse {
2281:   template <typename Tuple>
2282:   __host__ __device__ void operator()(Tuple t)
2283:   {
2284:     thrust::get<0>(t) = thrust::get<1>(t);
2285:   }
2286: };
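/* Sketch of how these functors are typically applied (illustrative; `xarr` and `yarr` are hypothetical
   THRUSTARRAY objects): a zip iterator pairs the two device ranges and thrust applies the elementwise
   operation, e.g. y += x with VecCUDAPlusEquals:

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(xarr.begin(), yarr.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(xarr.end(), yarr.end())),
                      VecCUDAPlusEquals());
*/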

2288: struct MatMatCusparse {
2289:   PetscBool      cisdense;
2290:   PetscScalar   *Bt;
2291:   Mat            X;
2292:   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2293:   PetscLogDouble flops;
2294:   CsrMatrix     *Bcsr;

2296: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2297:   cusparseSpMatDescr_t matSpBDescr;
2298:   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2299:   cusparseDnMatDescr_t matBDescr;
2300:   cusparseDnMatDescr_t matCDescr;
2301:   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2302:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2303:   void *dBuffer4;
2304:   void *dBuffer5;
2305:   #endif
2306:   size_t                mmBufferSize;
2307:   void                 *mmBuffer;
2308:   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2309:   cusparseSpGEMMDescr_t spgemmDesc;
2310: #endif
2311: };

2313: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2314: {
2315:   MatMatCusparse *mmdata = (MatMatCusparse *)data;

2317:   PetscFunctionBegin;
2318:   PetscCallCUDA(cudaFree(mmdata->Bt));
2319:   delete mmdata->Bcsr;
2320: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2321:   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2322:   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2323:   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2324:   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2325:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2326:   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2327:   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2328:   #endif
2329:   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2330:   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2331: #endif
2332:   PetscCall(MatDestroy(&mmdata->X));
2333:   PetscCall(PetscFree(data));
2334:   PetscFunctionReturn(PETSC_SUCCESS);
2335: }

2337: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);

2339: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2340: {
2341:   Mat_Product                  *product = C->product;
2342:   Mat                           A, B;
2343:   PetscInt                      m, n, blda, clda;
2344:   PetscBool                     flg, biscuda;
2345:   Mat_SeqAIJCUSPARSE           *cusp;
2346:   cusparseStatus_t              stat;
2347:   cusparseOperation_t           opA;
2348:   const PetscScalar            *barray;
2349:   PetscScalar                  *carray;
2350:   MatMatCusparse               *mmdata;
2351:   Mat_SeqAIJCUSPARSEMultStruct *mat;
2352:   CsrMatrix                    *csrmat;

2354:   PetscFunctionBegin;
2355:   MatCheckProduct(C, 1);
2356:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2357:   mmdata = (MatMatCusparse *)product->data;
2358:   A      = product->A;
2359:   B      = product->B;
2360:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2361:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2362:   /* Currently, CopyToGPU does not copy if the matrix is bound to the CPU.
2363:      Instead of silently accepting a wrong answer, we prefer to raise an error */
2364:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2365:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2366:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2367:   switch (product->type) {
2368:   case MATPRODUCT_AB:
2369:   case MATPRODUCT_PtAP:
2370:     mat = cusp->mat;
2371:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2372:     m   = A->rmap->n;
2373:     n   = B->cmap->n;
2374:     break;
2375:   case MATPRODUCT_AtB:
2376:     if (!A->form_explicit_transpose) {
2377:       mat = cusp->mat;
2378:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2379:     } else {
2380:       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2381:       mat = cusp->matTranspose;
2382:       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2383:     }
2384:     m = A->cmap->n;
2385:     n = B->cmap->n;
2386:     break;
2387:   case MATPRODUCT_ABt:
2388:   case MATPRODUCT_RARt:
2389:     mat = cusp->mat;
2390:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2391:     m   = A->rmap->n;
2392:     n   = B->rmap->n;
2393:     break;
2394:   default:
2395:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2396:   }
2397:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2398:   csrmat = (CsrMatrix *)mat->mat;
2399:   /* if the user passed a CPU matrix, copy the data to the GPU */
2400:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2401:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2402:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

2404:   PetscCall(MatDenseGetLDA(B, &blda));
2405:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2406:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2407:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2408:   } else {
2409:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2410:     PetscCall(MatDenseGetLDA(C, &clda));
2411:   }

2413:   PetscCall(PetscLogGpuTimeBegin());
2414: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2415:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2416:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2417:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2418:     size_t mmBufferSize;
2419:     if (mmdata->initialized && mmdata->Blda != blda) {
2420:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2421:       mmdata->matBDescr = NULL;
2422:     }
2423:     if (!mmdata->matBDescr) {
2424:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2425:       mmdata->Blda = blda;
2426:     }

2428:     if (mmdata->initialized && mmdata->Clda != clda) {
2429:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2430:       mmdata->matCDescr = NULL;
2431:     }
2432:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2433:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2434:       mmdata->Clda = clda;
2435:     }

2437:     if (!mat->matDescr) {
2438:       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2439:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2440:       PetscCallCUSPARSE(stat);
2441:     }
2442:     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2443:     PetscCallCUSPARSE(stat);
2444:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2445:       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2446:       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2447:       mmdata->mmBufferSize = mmBufferSize;
2448:     }
2449:     mmdata->initialized = PETSC_TRUE;
2450:   } else {
2451:     /* to be safe, always update pointers of the mats */
2452:     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2453:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2454:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2455:   }

2457:   /* do cusparseSpMM, which supports transpose on B */
2458:   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2459:   PetscCallCUSPARSE(stat);
2460: #else
2461:   PetscInt k;
2462:   /* cusparseXcsrmm does not support transpose on B */
2463:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2464:     cublasHandle_t cublasv2handle;
2465:     cublasStatus_t cerr;

2467:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2468:     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2469:     PetscCallCUBLAS(cerr);
2470:     blda = B->cmap->n;
2471:     k = B->cmap->n;
2472:   } else {
2473:     k = B->rmap->n;
2474:   }

2476:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2477:   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2478:   PetscCallCUSPARSE(stat);
2479: #endif
2480:   PetscCall(PetscLogGpuTimeEnd());
2481:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2482:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2483:   if (product->type == MATPRODUCT_RARt) {
2484:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2485:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2486:   } else if (product->type == MATPRODUCT_PtAP) {
2487:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2488:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2489:   } else {
2490:     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2491:   }
2492:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2493:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2494:   PetscFunctionReturn(PETSC_SUCCESS);
2495: }

2497: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2498: {
2499:   Mat_Product        *product = C->product;
2500:   Mat                 A, B;
2501:   PetscInt            m, n;
2502:   PetscBool           cisdense, flg;
2503:   MatMatCusparse     *mmdata;
2504:   Mat_SeqAIJCUSPARSE *cusp;

2506:   PetscFunctionBegin;
2507:   MatCheckProduct(C, 1);
2508:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2509:   A = product->A;
2510:   B = product->B;
2511:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2512:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2513:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2514:   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2515:   switch (product->type) {
2516:   case MATPRODUCT_AB:
2517:     m = A->rmap->n;
2518:     n = B->cmap->n;
2519:     break;
2520:   case MATPRODUCT_AtB:
2521:     m = A->cmap->n;
2522:     n = B->cmap->n;
2523:     break;
2524:   case MATPRODUCT_ABt:
2525:     m = A->rmap->n;
2526:     n = B->rmap->n;
2527:     break;
2528:   case MATPRODUCT_PtAP:
2529:     m = B->cmap->n;
2530:     n = B->cmap->n;
2531:     break;
2532:   case MATPRODUCT_RARt:
2533:     m = B->rmap->n;
2534:     n = B->rmap->n;
2535:     break;
2536:   default:
2537:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2538:   }
2539:   PetscCall(MatSetSizes(C, m, n, m, n));
2540:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2541:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2542:   PetscCall(MatSetType(C, MATSEQDENSECUDA));

2544:   /* product data */
2545:   PetscCall(PetscNew(&mmdata));
2546:   mmdata->cisdense = cisdense;
2547: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2548:   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2549:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2550: #endif
2551:   /* for these products we need intermediate storage */
2552:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2553:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2554:     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2555:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2556:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2557:     } else {
2558:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2559:     }
2560:   }
2561:   C->product->data    = mmdata;
2562:   C->product->destroy = MatDestroy_MatMatCusparse;

2564:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2565:   PetscFunctionReturn(PETSC_SUCCESS);
2566: }
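/* The symbolic/numeric split above is driven by the generic MatProduct interface. A minimal sketch
   (illustrative; standard PETSc calls) of how these callbacks are reached for C = A*B with a dense B:

     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C)); // dispatches to MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA
     PetscCall(MatProductNumeric(C));  // dispatches to MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA
*/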

2568: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2569: {
2570:   Mat_Product                  *product = C->product;
2571:   Mat                           A, B;
2572:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2573:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2574:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2575:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2576:   PetscBool                     flg;
2577:   cusparseStatus_t              stat;
2578:   MatProductType                ptype;
2579:   MatMatCusparse               *mmdata;
2580: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2581:   cusparseSpMatDescr_t BmatSpDescr;
2582: #endif
2583:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2585:   PetscFunctionBegin;
2586:   MatCheckProduct(C, 1);
2587:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2588:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2589:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2590:   mmdata = (MatMatCusparse *)C->product->data;
2591:   A      = product->A;
2592:   B      = product->B;
2593:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2594:     mmdata->reusesym = PETSC_FALSE;
2595:     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2596:     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2597:     Cmat = Ccusp->mat;
2598:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2599:     Ccsr = (CsrMatrix *)Cmat->mat;
2600:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2601:     goto finalize;
2602:   }
2603:   if (!c->nz) goto finalize;
2604:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2605:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2606:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2607:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2608:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2609:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2610:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2611:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2612:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2613:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2614:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2615:   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2616:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2617:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

2619:   ptype = product->type;
2620:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2621:     ptype = MATPRODUCT_AB;
2622:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2623:   }
2624:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2625:     ptype = MATPRODUCT_AB;
2626:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2627:   }
2628:   switch (ptype) {
2629:   case MATPRODUCT_AB:
2630:     Amat = Acusp->mat;
2631:     Bmat = Bcusp->mat;
2632:     break;
2633:   case MATPRODUCT_AtB:
2634:     Amat = Acusp->matTranspose;
2635:     Bmat = Bcusp->mat;
2636:     break;
2637:   case MATPRODUCT_ABt:
2638:     Amat = Acusp->mat;
2639:     Bmat = Bcusp->matTranspose;
2640:     break;
2641:   default:
2642:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2643:   }
2644:   Cmat = Ccusp->mat;
2645:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2646:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2647:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2648:   Acsr = (CsrMatrix *)Amat->mat;
2649:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2650:   Ccsr = (CsrMatrix *)Cmat->mat;
2651:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2652:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2653:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2654:   PetscCall(PetscLogGpuTimeBegin());
2655: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2656:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2657:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2658:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2659:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2660:   PetscCallCUSPARSE(stat);
2661:   #else
2662:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2663:   PetscCallCUSPARSE(stat);
2664:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2665:   PetscCallCUSPARSE(stat);
2666:   #endif
2667: #else
2668:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2669:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2670:   PetscCallCUSPARSE(stat);
2671: #endif
2672:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2673:   PetscCallCUDA(WaitForCUDA());
2674:   PetscCall(PetscLogGpuTimeEnd());
2675:   C->offloadmask = PETSC_OFFLOAD_GPU;
2676: finalize:
2677:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2678:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2679:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2680:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2681:   c->reallocs = 0;
2682:   C->info.mallocs += 0;
2683:   C->info.nz_unneeded = 0;
2684:   C->assembled = C->was_assembled = PETSC_TRUE;
2685:   C->num_ass++;
2686:   PetscFunctionReturn(PETSC_SUCCESS);
2687: }

2689: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2690: {
2691:   Mat_Product                  *product = C->product;
2692:   Mat                           A, B;
2693:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2694:   Mat_SeqAIJ                   *a, *b, *c;
2695:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2696:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2697:   PetscInt                      i, j, m, n, k;
2698:   PetscBool                     flg;
2699:   cusparseStatus_t              stat;
2700:   MatProductType                ptype;
2701:   MatMatCusparse               *mmdata;
2702:   PetscLogDouble                flops;
2703:   PetscBool                     biscompressed, ciscompressed;
2704: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2705:   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2706:   cusparseSpMatDescr_t BmatSpDescr;
2707: #else
2708:   int cnz;
2709: #endif
2710:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2712:   PetscFunctionBegin;
2713:   MatCheckProduct(C, 1);
2714:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2715:   A = product->A;
2716:   B = product->B;
2717:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2718:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2719:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2720:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2721:   a = (Mat_SeqAIJ *)A->data;
2722:   b = (Mat_SeqAIJ *)B->data;
2723:   /* product data */
2724:   PetscCall(PetscNew(&mmdata));
2725:   C->product->data    = mmdata;
2726:   C->product->destroy = MatDestroy_MatMatCusparse;

2728:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2729:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2730:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2731:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2732:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2733:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

2735:   ptype = product->type;
2736:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2737:     ptype                                          = MATPRODUCT_AB;
2738:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2739:   }
2740:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2741:     ptype                                          = MATPRODUCT_AB;
2742:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2743:   }
2744:   biscompressed = PETSC_FALSE;
2745:   ciscompressed = PETSC_FALSE;
2746:   switch (ptype) {
2747:   case MATPRODUCT_AB:
2748:     m    = A->rmap->n;
2749:     n    = B->cmap->n;
2750:     k    = A->cmap->n;
2751:     Amat = Acusp->mat;
2752:     Bmat = Bcusp->mat;
2753:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2754:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2755:     break;
2756:   case MATPRODUCT_AtB:
2757:     m = A->cmap->n;
2758:     n = B->cmap->n;
2759:     k = A->rmap->n;
2760:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2761:     Amat = Acusp->matTranspose;
2762:     Bmat = Bcusp->mat;
2763:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2764:     break;
2765:   case MATPRODUCT_ABt:
2766:     m = A->rmap->n;
2767:     n = B->rmap->n;
2768:     k = A->cmap->n;
2769:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2770:     Amat = Acusp->mat;
2771:     Bmat = Bcusp->matTranspose;
2772:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2773:     break;
2774:   default:
2775:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2776:   }

2778:   /* create cusparse matrix */
2779:   PetscCall(MatSetSizes(C, m, n, m, n));
2780:   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2781:   c     = (Mat_SeqAIJ *)C->data;
2782:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2783:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2784:   Ccsr  = new CsrMatrix;

2786:   c->compressedrow.use = ciscompressed;
2787:   if (c->compressedrow.use) { /* if A is in compressed row format, then C will be in compressed row format as well */
2788:     c->compressedrow.nrows = a->compressedrow.nrows;
2789:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2790:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2791:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2792:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2793:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2794:   } else {
2795:     c->compressedrow.nrows  = 0;
2796:     c->compressedrow.i      = NULL;
2797:     c->compressedrow.rindex = NULL;
2798:     Ccusp->workVector       = NULL;
2799:     Cmat->cprowIndices      = NULL;
2800:   }
2801:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2802:   Ccusp->mat        = Cmat;
2803:   Ccusp->mat->mat   = Ccsr;
2804:   Ccsr->num_rows    = Ccusp->nrows;
2805:   Ccsr->num_cols    = n;
2806:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2807:   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2808:   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2809:   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2810:   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
2811:   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
2812:   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
2813:   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2814:   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2815:   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2816:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cuSPARSE raises errors in different calls when matrices have zero rows/columns! */
2817:     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2818:     c->nz                = 0;
2819:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2820:     Ccsr->values         = new THRUSTARRAY(c->nz);
2821:     goto finalizesym;
2822:   }

2824:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2825:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2826:   Acsr = (CsrMatrix *)Amat->mat;
2827:   if (!biscompressed) {
2828:     Bcsr = (CsrMatrix *)Bmat->mat;
2829: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2830:     BmatSpDescr = Bmat->matDescr;
2831: #endif
2832:   } else { /* we need to use row offsets for the full matrix */
2833:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2834:     Bcsr                 = new CsrMatrix;
2835:     Bcsr->num_rows       = B->rmap->n;
2836:     Bcsr->num_cols       = cBcsr->num_cols;
2837:     Bcsr->num_entries    = cBcsr->num_entries;
2838:     Bcsr->column_indices = cBcsr->column_indices;
2839:     Bcsr->values         = cBcsr->values;
2840:     if (!Bcusp->rowoffsets_gpu) {
2841:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2842:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2843:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2844:     }
2845:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2846:     mmdata->Bcsr      = Bcsr;
2847: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2848:     if (Bcsr->num_rows && Bcsr->num_cols) {
2849:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2850:       PetscCallCUSPARSE(stat);
2851:     }
2852:     BmatSpDescr = mmdata->matSpBDescr;
2853: #endif
2854:   }
2855:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2856:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2857:   /* precompute flops count */
2858:   if (ptype == MATPRODUCT_AB) {
2859:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2860:       const PetscInt st = a->i[i];
2861:       const PetscInt en = a->i[i + 1];
2862:       for (j = st; j < en; j++) {
2863:         const PetscInt brow = a->j[j];
2864:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2865:       }
2866:     }
2867:   } else if (ptype == MATPRODUCT_AtB) {
2868:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2869:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2870:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2871:       flops += (2. * anzi) * bnzi;
2872:     }
2873:   } else { /* TODO */
2874:     flops = 0.;
2875:   }
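  /* Worked example of the MATPRODUCT_AB count above (illustrative): if row i of A has nonzeros in columns
     {2, 5} and rows 2 and 5 of B hold 3 and 4 nonzeros respectively, row i contributes 2*(3 + 4) = 14 flops. */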

2877:   mmdata->flops = flops;
2878:   PetscCall(PetscLogGpuTimeBegin());

2880: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2881:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2882:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2883:   PetscCallCUSPARSE(stat);
2884:   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2885:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2886:   {
2887:     /* cusparseSpGEMMreuse has a more reasonable API than cusparseSpGEMM, so we prefer to use it.
2888:        We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2889:     */
2890:     void *dBuffer1 = NULL;
2891:     void *dBuffer2 = NULL;
2892:     void *dBuffer3 = NULL;
2893:     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2894:     size_t bufferSize1 = 0;
2895:     size_t bufferSize2 = 0;
2896:     size_t bufferSize3 = 0;
2897:     size_t bufferSize4 = 0;
2898:     size_t bufferSize5 = 0;

2900:     /* ask bufferSize1 bytes for external memory */
2901:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2902:     PetscCallCUSPARSE(stat);
2903:     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2904:     /* inspect the matrices A and B to understand the memory requirement for the next step */
2905:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2906:     PetscCallCUSPARSE(stat);

2908:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2909:     PetscCallCUSPARSE(stat);
2910:     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
2911:     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
2912:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2913:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2914:     PetscCallCUSPARSE(stat);
2915:     PetscCallCUDA(cudaFree(dBuffer1));
2916:     PetscCallCUDA(cudaFree(dBuffer2));

2918:     /* get matrix C non-zero entries C_nnz1 */
2919:     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2920:     c->nz = (PetscInt)C_nnz1;
2921:     /* allocate matrix C */
2922:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2923:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2924:     Ccsr->values = new THRUSTARRAY(c->nz);
2925:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2926:     /* update matC with the new pointers */
2927:     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2928:     PetscCallCUSPARSE(stat);

2930:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2931:     PetscCallCUSPARSE(stat);
2932:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2933:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2934:     PetscCallCUSPARSE(stat);
2935:     PetscCallCUDA(cudaFree(dBuffer3));
2936:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2937:     PetscCallCUSPARSE(stat);
2938:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2939:   }
2940:   #else
2941:   size_t bufSize2;
2942:   /* first call with a NULL buffer queries the required size (bufSize2) of the external workspace */
2943:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2944:   PetscCallCUSPARSE(stat);
2945:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2946:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2947:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2948:   PetscCallCUSPARSE(stat);
2949:   /* query the buffer size needed by the compute step */
2950:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2951:   PetscCallCUSPARSE(stat);
2952:   /* Neither the CUSPARSE documentation nor the API is clear here:
2953:      we need both buffers to perform the operations properly!
2954:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2955:      it only appears in the workEstimation step, yet it seems to be needed by compute, so presumably its address
2956:      is stored in the descriptor! What a messy API... */
2957:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2958:   /* compute the intermediate product of A * B */
2959:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2960:   PetscCallCUSPARSE(stat);
2961:   /* get matrix C non-zero entries C_nnz1 */
2962:   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2963:   c->nz = (PetscInt)C_nnz1;
2964:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2965:                       mmdata->mmBufferSize / 1024));
2966:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2967:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2968:   Ccsr->values = new THRUSTARRAY(c->nz);
2969:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2970:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2971:   PetscCallCUSPARSE(stat);
2972:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2973:   PetscCallCUSPARSE(stat);
2974:   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2975: #else
2976:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2977:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2978:                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
2979:   PetscCallCUSPARSE(stat);
2980:   c->nz = cnz;
2981:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2982:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2983:   Ccsr->values = new THRUSTARRAY(c->nz);
2984:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

2986:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2987:   /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
2988:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
2989:      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2990:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2991:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2992:   PetscCallCUSPARSE(stat);
2993: #endif
2994:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2995:   PetscCall(PetscLogGpuTimeEnd());
2996: finalizesym:
2997:   c->singlemalloc = PETSC_FALSE;
2998:   c->free_a       = PETSC_TRUE;
2999:   c->free_ij      = PETSC_TRUE;
3000:   PetscCall(PetscMalloc1(m + 1, &c->i));
3001:   PetscCall(PetscMalloc1(c->nz, &c->j));
3002:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3003:     PetscInt      *d_i = c->i;
3004:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3005:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3006:     ii = *Ccsr->row_offsets;
3007:     jj = *Ccsr->column_indices;
3008:     if (ciscompressed) d_i = c->compressedrow.i;
3009:     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3010:     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3011:   } else {
3012:     PetscInt *d_i = c->i;
3013:     if (ciscompressed) d_i = c->compressedrow.i;
3014:     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3015:     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3016:   }
3017:   if (ciscompressed) { /* need to expand host row offsets */
3018:     PetscInt r = 0;
3019:     c->i[0]    = 0;
3020:     for (k = 0; k < c->compressedrow.nrows; k++) {
3021:       const PetscInt next = c->compressedrow.rindex[k];
3022:       const PetscInt old  = c->compressedrow.i[k];
3023:       for (; r < next; r++) c->i[r + 1] = old;
3024:     }
3025:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3026:   }
3027:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3028:   PetscCall(PetscMalloc1(m, &c->ilen));
3029:   PetscCall(PetscMalloc1(m, &c->imax));
3030:   c->maxnz         = c->nz;
3031:   c->nonzerorowcnt = 0;
3032:   c->rmax          = 0;
3033:   for (k = 0; k < m; k++) {
3034:     const PetscInt nn = c->i[k + 1] - c->i[k];
3035:     c->ilen[k] = c->imax[k] = nn;
3036:     c->nonzerorowcnt += (PetscInt) !!nn;
3037:     c->rmax = PetscMax(c->rmax, nn);
3038:   }
3039:   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3040:   PetscCall(PetscMalloc1(c->nz, &c->a));
3041:   Ccsr->num_entries = c->nz;

3043:   C->nonzerostate++;
3044:   PetscCall(PetscLayoutSetUp(C->rmap));
3045:   PetscCall(PetscLayoutSetUp(C->cmap));
3046:   Ccusp->nonzerostate = C->nonzerostate;
3047:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3048:   C->preallocated     = PETSC_TRUE;
3049:   C->assembled        = PETSC_FALSE;
3050:   C->was_assembled    = PETSC_FALSE;
3051:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3052:     mmdata->reusesym = PETSC_TRUE;
3053:     C->offloadmask   = PETSC_OFFLOAD_GPU;
3054:   }
3055:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3056:   PetscFunctionReturn(PETSC_SUCCESS);
3057: }

3059: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

3061: /* handles sparse or dense B */
3062: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3063: {
3064:   Mat_Product *product = mat->product;
3065:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

3067:   PetscFunctionBegin;
3068:   MatCheckProduct(mat, 1);
3069:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3070:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3071:   if (product->type == MATPRODUCT_ABC) {
3072:     Ciscusp = PETSC_FALSE;
3073:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3074:   }
3075:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3076:     PetscBool usecpu = PETSC_FALSE;
3077:     switch (product->type) {
3078:     case MATPRODUCT_AB:
3079:       if (product->api_user) {
3080:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3081:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3082:         PetscOptionsEnd();
3083:       } else {
3084:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3085:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3086:         PetscOptionsEnd();
3087:       }
3088:       break;
3089:     case MATPRODUCT_AtB:
3090:       if (product->api_user) {
3091:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3092:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3093:         PetscOptionsEnd();
3094:       } else {
3095:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3096:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3097:         PetscOptionsEnd();
3098:       }
3099:       break;
3100:     case MATPRODUCT_PtAP:
3101:       if (product->api_user) {
3102:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3103:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3104:         PetscOptionsEnd();
3105:       } else {
3106:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3107:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3108:         PetscOptionsEnd();
3109:       }
3110:       break;
3111:     case MATPRODUCT_RARt:
3112:       if (product->api_user) {
3113:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3114:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3115:         PetscOptionsEnd();
3116:       } else {
3117:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3118:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3119:         PetscOptionsEnd();
3120:       }
3121:       break;
3122:     case MATPRODUCT_ABC:
3123:       if (product->api_user) {
3124:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3125:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3126:         PetscOptionsEnd();
3127:       } else {
3128:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3129:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3130:         PetscOptionsEnd();
3131:       }
3132:       break;
3133:     default:
3134:       break;
3135:     }
3136:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3137:   }
3138:   /* dispatch */
3139:   if (isdense) {
3140:     switch (product->type) {
3141:     case MATPRODUCT_AB:
3142:     case MATPRODUCT_AtB:
3143:     case MATPRODUCT_ABt:
3144:     case MATPRODUCT_PtAP:
3145:     case MATPRODUCT_RARt:
3146:       if (product->A->boundtocpu) {
3147:         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3148:       } else {
3149:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3150:       }
3151:       break;
3152:     case MATPRODUCT_ABC:
3153:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3154:       break;
3155:     default:
3156:       break;
3157:     }
3158:   } else if (Biscusp && Ciscusp) {
3159:     switch (product->type) {
3160:     case MATPRODUCT_AB:
3161:     case MATPRODUCT_AtB:
3162:     case MATPRODUCT_ABt:
3163:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3164:       break;
3165:     case MATPRODUCT_PtAP:
3166:     case MATPRODUCT_RARt:
3167:     case MATPRODUCT_ABC:
3168:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3169:       break;
3170:     default:
3171:       break;
3172:     }
3173:   } else { /* fallback for AIJ */
3174:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3175:   }
3176:   PetscFunctionReturn(PETSC_SUCCESS);
3177: }

3179: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3180: {
3181:   PetscFunctionBegin;
3182:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3183:   PetscFunctionReturn(PETSC_SUCCESS);
3184: }

3186: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3187: {
3188:   PetscFunctionBegin;
3189:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3190:   PetscFunctionReturn(PETSC_SUCCESS);
3191: }

3193: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3194: {
3195:   PetscFunctionBegin;
3196:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3197:   PetscFunctionReturn(PETSC_SUCCESS);
3198: }

3200: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3201: {
3202:   PetscFunctionBegin;
3203:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3204:   PetscFunctionReturn(PETSC_SUCCESS);
3205: }

3207: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3208: {
3209:   PetscFunctionBegin;
3210:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3211:   PetscFunctionReturn(PETSC_SUCCESS);
3212: }

/* CUDA kernel performing y[idx[i]] += x[i] for 0 <= i < n; used to scatter-add the compressed work vector back into the full-length result vector */
3214: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3215: {
3216:   int i = blockIdx.x * blockDim.x + threadIdx.x;
3217:   if (i < n) y[idx[i]] += x[i];
3218: }

3220: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3221: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3222: {
3223:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3224:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3225:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3226:   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3227:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3228:   PetscBool                     compressed;
3229: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3230:   PetscInt nx, ny;
3231: #endif

3233:   PetscFunctionBegin;
3234:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3235:   if (!a->nz) {
3236:     if (yy) PetscCall(VecSeq_CUDA::copy(yy, zz));
3237:     else PetscCall(VecSeq_CUDA::set(zz, 0));
3238:     PetscFunctionReturn(PETSC_SUCCESS);
3239:   }
3240:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3241:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3242:   if (!trans) {
3243:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3244:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3245:   } else {
3246:     if (herm || !A->form_explicit_transpose) {
3247:       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3248:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3249:     } else {
3250:       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3251:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3252:     }
3253:   }
3254:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3255:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

3257:   try {
3258:     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3259:     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3260:     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

3262:     PetscCall(PetscLogGpuTimeBegin());
3263:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3264:       /* z = A x + beta y.
3265:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3266:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3267:       */
3268:       xptr = xarray;
3269:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3270:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3271: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3272:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3273:           allocated to accommodate different uses. So we get the length info directly from mat.
3274:        */
3275:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3276:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3277:         nx             = mat->num_cols;
3278:         ny             = mat->num_rows;
3279:       }
3280: #endif
3281:     } else {
3282:       /* z = A^T x + beta y
3283:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3284:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3285:        */
3286:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3287:       dptr = zarray;
3288:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3289:       if (compressed) { /* Scatter x to work vector */
3290:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

3292:         thrust::for_each(
3293: #if PetscDefined(HAVE_THRUST_ASYNC)
3294:           thrust::cuda::par.on(PetscDefaultCudaStream),
3295: #endif
3296:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3297:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3298:       }
3299: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3300:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3301:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3302:         nx             = mat->num_rows;
3303:         ny             = mat->num_cols;
3304:       }
3305: #endif
3306:     }

3308:     /* csr_spmv does y = alpha op(A) x + beta y */
3309:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3310: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3311:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3312:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3313:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3314:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3315:         PetscCallCUSPARSE(
3316:           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3317:         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

3319:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3320:       } else {
3321:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3322:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3323:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3324:       }

3326:       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3327:                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3328: #else
3329:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3330:       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3331: #endif
3332:     } else {
3333:       if (cusparsestruct->nrows) {
3334: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3335:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3336: #else
3337:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3338:         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3339: #endif
3340:       }
3341:     }
3342:     PetscCall(PetscLogGpuTimeEnd());

3344:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3345:       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3346:         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3347:           PetscCall(VecSeq_CUDA::copy(yy, zz));      /* zz = yy */
3348:         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3349:           PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */
3350:         }
3351:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3352:         PetscCall(VecSeq_CUDA::set(zz, 0));
3353:       }

3355:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3356:       if (compressed) {
3357:         PetscCall(PetscLogGpuTimeBegin());
3358:         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3359:            and when that event goes out of scope its destructor calls cudaStreamSynchronize() on the stream. One would have to keep all
3360:            events alive to prevent that, so instead we use a plain ScatterAdd kernel.
3361:          */
3362: #if 0
3363:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3364:         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3365:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3366:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3367:                          VecCUDAPlusEquals());
3368: #else
3369:         PetscInt n = matstruct->cprowIndices->size();
3370:         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3371: #endif
3372:         PetscCall(PetscLogGpuTimeEnd());
3373:       }
3374:     } else {
3375:       if (yy && yy != zz) PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */
3376:     }
3377:     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3378:     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3379:     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3380:   } catch (char *ex) {
3381:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3382:   }
3383:   if (yy) {
3384:     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3385:   } else {
3386:     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3387:   }
3388:   PetscFunctionReturn(PETSC_SUCCESS);
3389: }

3391: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3392: {
3393:   PetscFunctionBegin;
3394:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3395:   PetscFunctionReturn(PETSC_SUCCESS);
3396: }

3398: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3399: {
3400:   PetscObjectState    onnz = A->nonzerostate;
3401:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

3403:   PetscFunctionBegin;
3404:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3405:   if (onnz != A->nonzerostate && cusp->deviceMat) {
3406:     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
3407:     PetscCallCUDA(cudaFree(cusp->deviceMat));
3408:     cusp->deviceMat = NULL;
3409:   }
3410:   PetscFunctionReturn(PETSC_SUCCESS);
3411: }

3413: /*@
3414:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATSEQAIJCUSPARSE` (compressed row) format,
3415:    the GPU variant of the default sequential PETSc format. This matrix will ultimately be pushed down
3416:    to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
3417:    assembly performance the user should preallocate the matrix storage by setting
3418:    the parameter nz (or the array nnz).  By setting these parameters accurately,
3419:    performance during matrix assembly can be increased by more than a factor of 50.

3421:    Collective

3423:    Input Parameters:
3424: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3425: .  m - number of rows
3426: .  n - number of columns
3427: .  nz - number of nonzeros per row (same for all rows)
3428: -  nnz - array containing the number of nonzeros in the various rows
3429:          (possibly different for each row) or `NULL`

3431:    Output Parameter:
3432: .  A - the matrix

3434:    Level: intermediate

3436:    Notes:
3437:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3438:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3439:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3441:    If `nnz` is given then `nz` is ignored

3443:    The AIJ format, also called
3444:    compressed row storage, is fully compatible with standard Fortran
3445:    storage.  That is, the stored row and column indices can begin at
3446:    either one (as in Fortran) or zero.  See the users' manual for details.

3448:    Specify the preallocated storage with either nz or nnz (not both).
3449:    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3450:    allocation.  For large problems you MUST preallocate memory or you
3451:    will get TERRIBLE performance, see the users' manual chapter on matrices.
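
   A minimal sketch of the recommended paradigm (illustrative only; error checking with `PetscCall()` is omitted, and `m`, `n`, `nz` are placeholders):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF, &A);
   MatSetSizes(A, m, n, m, n);
   MatSetType(A, MATSEQAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A, nz, NULL);  // or pass an nnz[] array instead of nz
   // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), then use A, e.g. in MatMult()
.ve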

3453:    By default, this format uses inodes (identical nodes) when possible, to
3454:    improve numerical efficiency of matrix-vector products and solves. We
3455:    search for consecutive rows with the same nonzero structure, thereby
3456:    reusing matrix information to achieve increased efficiency.

3458: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3459: @*/
3460: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3461: {
3462:   PetscFunctionBegin;
3463:   PetscCall(MatCreate(comm, A));
3464:   PetscCall(MatSetSizes(*A, m, n, m, n));
3465:   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3466:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3467:   PetscFunctionReturn(PETSC_SUCCESS);
3468: }

3470: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3471: {
3472:   PetscFunctionBegin;
3473:   if (A->factortype == MAT_FACTOR_NONE) {
3474:     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
3475:   } else {
3476:     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3477:   }
3478:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3479:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3480:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3481:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3482:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3483:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3484:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3485:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3486:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3487:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3488:   PetscCall(MatDestroy_SeqAIJ(A));
3489:   PetscFunctionReturn(PETSC_SUCCESS);
3490: }

3492: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3493: static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3494: static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3495: {
3496:   PetscFunctionBegin;
3497:   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3498:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3499:   PetscFunctionReturn(PETSC_SUCCESS);
3500: }

3502: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3503: {
3504:   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3505:   Mat_SeqAIJCUSPARSE *cy;
3506:   Mat_SeqAIJCUSPARSE *cx;
3507:   PetscScalar        *ay;
3508:   const PetscScalar  *ax;
3509:   CsrMatrix          *csry, *csrx;

3511:   PetscFunctionBegin;
3512:   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3513:   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3514:   if (X->ops->axpy != Y->ops->axpy) {
3515:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3516:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3517:     PetscFunctionReturn(PETSC_SUCCESS);
3518:   }
3519:   /* if we are here, it means both matrices are bound to GPU */
3520:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3521:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3522:   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3523:   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3524:   csry = (CsrMatrix *)cy->mat->mat;
3525:   csrx = (CsrMatrix *)cx->mat->mat;
3526:   /* see if we can turn this into a cublas axpy */
3527:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3528:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3529:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3530:     if (eq) str = SAME_NONZERO_PATTERN;
3531:   }
3532:   /* spgeam is buggy with one column */
3533:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3535:   if (str == SUBSET_NONZERO_PATTERN) {
3536:     PetscScalar b = 1.0;
3537: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3538:     size_t bufferSize;
3539:     void  *buffer;
3540: #endif

3542:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3543:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3544:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3545: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3546:     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3547:                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3548:     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3549:     PetscCall(PetscLogGpuTimeBegin());
3550:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3551:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3552:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3553:     PetscCall(PetscLogGpuTimeEnd());
3554:     PetscCallCUDA(cudaFree(buffer));
3555: #else
3556:     PetscCall(PetscLogGpuTimeBegin());
3557:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3558:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3559:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3560:     PetscCall(PetscLogGpuTimeEnd());
3561: #endif
3562:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3563:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3564:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3565:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3566:   } else if (str == SAME_NONZERO_PATTERN) {
3567:     cublasHandle_t cublasv2handle;
3568:     PetscBLASInt   one = 1, bnz = 1;

3570:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3571:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3572:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3573:     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3574:     PetscCall(PetscLogGpuTimeBegin());
3575:     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3576:     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3577:     PetscCall(PetscLogGpuTimeEnd());
3578:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3579:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3580:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3581:   } else {
3582:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3583:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3584:   }
3585:   PetscFunctionReturn(PETSC_SUCCESS);
3586: }

3588: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3589: {
3590:   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3591:   PetscScalar   *ay;
3592:   cublasHandle_t cublasv2handle;
3593:   PetscBLASInt   one = 1, bnz = 1;

3595:   PetscFunctionBegin;
3596:   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3597:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3598:   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3599:   PetscCall(PetscLogGpuTimeBegin());
3600:   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3601:   PetscCall(PetscLogGpuFlops(bnz));
3602:   PetscCall(PetscLogGpuTimeEnd());
3603:   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3604:   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3605:   PetscFunctionReturn(PETSC_SUCCESS);
3606: }

3608: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3609: {
3610:   PetscBool   both = PETSC_FALSE;
3611:   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

3613:   PetscFunctionBegin;
3614:   if (A->factortype == MAT_FACTOR_NONE) {
3615:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3616:     if (spptr->mat) {
3617:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3618:       if (matrix->values) {
3619:         both = PETSC_TRUE;
3620:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3621:       }
3622:     }
3623:     if (spptr->matTranspose) {
3624:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3625:       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3626:     }
3627:   }
3628:   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3629:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3630:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3631:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3632:   PetscFunctionReturn(PETSC_SUCCESS);
3633: }

3635: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3636: {
3637:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3639:   PetscFunctionBegin;
3640:   if (A->factortype != MAT_FACTOR_NONE) {
3641:     A->boundtocpu = flg;
3642:     PetscFunctionReturn(PETSC_SUCCESS);
3643:   }
3644:   if (flg) {
3645:     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

3647:     A->ops->scale                     = MatScale_SeqAIJ;
3648:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3649:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3650:     A->ops->mult                      = MatMult_SeqAIJ;
3651:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3652:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3653:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3654:     A->ops->multhermitiantranspose    = NULL;
3655:     A->ops->multhermitiantransposeadd = NULL;
3656:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3657:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3658:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3659:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3660:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3661:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3662:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3663:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3664:   } else {
3665:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3666:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3667:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3668:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3669:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3670:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3671:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3672:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3673:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3674:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3675:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3676:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3677:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3678:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3679:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3680:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3681:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

3683:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
3684:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3685:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3686:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
3687:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
3688:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3689:   }
3690:   A->boundtocpu = flg;
3691:   if (flg && a->inode.size) {
3692:     a->inode.use = PETSC_TRUE;
3693:   } else {
3694:     a->inode.use = PETSC_FALSE;
3695:   }
3696:   PetscFunctionReturn(PETSC_SUCCESS);
3697: }

3699: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
3700: {
3701:   Mat B;

3703:   PetscFunctionBegin;
3704:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
3705:   if (reuse == MAT_INITIAL_MATRIX) {
3706:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3707:   } else if (reuse == MAT_REUSE_MATRIX) {
3708:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3709:   }
3710:   B = *newmat;

3712:   PetscCall(PetscFree(B->defaultvectype));
3713:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

3715:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3716:     if (B->factortype == MAT_FACTOR_NONE) {
3717:       Mat_SeqAIJCUSPARSE *spptr;
3718:       PetscCall(PetscNew(&spptr));
3719:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
3720:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
3721:       spptr->format = MAT_CUSPARSE_CSR;
3722: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3723:   #if CUSPARSE_VERSION > 11301
3724:       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3725:   #else
3726:       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3727:   #endif
3728:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3729:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3730: #endif
3731:       B->spptr = spptr;
3732:     } else {
3733:       Mat_SeqAIJCUSPARSETriFactors *spptr;

3735:       PetscCall(PetscNew(&spptr));
3736:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
3737:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
3738:       B->spptr = spptr;
3739:     }
3740:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3741:   }
3742:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3743:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3744:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3745:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3746:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3747:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

3749:   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
3750:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
3751:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
3752: #if defined(PETSC_HAVE_HYPRE)
3753:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
3754: #endif
3755:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
3756:   PetscFunctionReturn(PETSC_SUCCESS);
3757: }

3759: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3760: {
3761:   PetscFunctionBegin;
3762:   PetscCall(MatCreate_SeqAIJ(B));
3763:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
3764:   PetscFunctionReturn(PETSC_SUCCESS);
3765: }

3767: /*MC
3768:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

3770:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in CSR, ELL,
3771:    or Hybrid format (ELL and HYB are only supported with CUDA versions prior to 11.0).
3772:    All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

3774:    Options Database Keys:
3775: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3776: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3777:                                       Other options include ell (ellpack) or hyb (hybrid).
3778: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3779: -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

3781:   Level: beginner

3783: .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3784: M*/
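
/* A minimal usage sketch (an assumption about typical usage, not code taken from this file): the type is usually
   selected at runtime via the options database, e.g.

     ./app -mat_type aijcusparse -vec_type cuda

   or set programmatically with MatSetType(A, MATSEQAIJCUSPARSE) before assembling the matrix. */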

3786: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

3788: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3789: {
3790:   PetscFunctionBegin;
3791:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
3792:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
3793:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
3794:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
3795:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

3797:   PetscFunctionReturn(PETSC_SUCCESS);
3798: }
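
/* Note (assumption, not taken from this file): once registered above, these GPU factorizations can typically be
   selected at runtime with an option such as -pc_factor_mat_solver_type cusparse on a MATSEQAIJCUSPARSE matrix. */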

3800: static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3801: {
3802:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

3804:   PetscFunctionBegin;
3805:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3806:   delete cusp->cooPerm;
3807:   delete cusp->cooPerm_a;
3808:   cusp->cooPerm   = NULL;
3809:   cusp->cooPerm_a = NULL;
3810:   if (cusp->use_extended_coo) {
3811:     PetscCallCUDA(cudaFree(cusp->jmap_d));
3812:     PetscCallCUDA(cudaFree(cusp->perm_d));
3813:   }
3814:   cusp->use_extended_coo = PETSC_FALSE;
3815:   PetscFunctionReturn(PETSC_SUCCESS);
3816: }

3818: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3819: {
3820:   PetscFunctionBegin;
3821:   if (*cusparsestruct) {
3822:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
3823:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
3824:     delete (*cusparsestruct)->workVector;
3825:     delete (*cusparsestruct)->rowoffsets_gpu;
3826:     delete (*cusparsestruct)->cooPerm;
3827:     delete (*cusparsestruct)->cooPerm_a;
3828:     delete (*cusparsestruct)->csr2csc_i;
3829:     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
3830:     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
3831:     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
3832:     PetscCall(PetscFree(*cusparsestruct));
3833:   }
3834:   PetscFunctionReturn(PETSC_SUCCESS);
3835: }

3837: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3838: {
3839:   PetscFunctionBegin;
3840:   if (*mat) {
3841:     delete (*mat)->values;
3842:     delete (*mat)->column_indices;
3843:     delete (*mat)->row_offsets;
3844:     delete *mat;
3845:     *mat = 0;
3846:   }
3847:   PetscFunctionReturn(PETSC_SUCCESS);
3848: }

3850: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3851: {
3852:   PetscFunctionBegin;
3853:   if (*trifactor) {
3854:     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
3855:     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3856:     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3857:     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
3858:     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
3859: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3860:     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
3861: #endif
3862:     PetscCall(PetscFree(*trifactor));
3863:   }
3864:   PetscFunctionReturn(PETSC_SUCCESS);
3865: }

3867: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
3868: {
3869:   CsrMatrix *mat;

3871:   PetscFunctionBegin;
3872:   if (*matstruct) {
3873:     if ((*matstruct)->mat) {
3874:       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3875: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3876:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3877: #else
3878:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3879:         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
3880: #endif
3881:       } else {
3882:         mat = (CsrMatrix *)(*matstruct)->mat;
3883:         PetscCall(CsrMatrix_Destroy(&mat));
3884:       }
3885:     }
3886:     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
3887:     delete (*matstruct)->cprowIndices;
3888:     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
3889:     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
3890:     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

3892: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3893:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3894:     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
3895:     for (int i = 0; i < 3; i++) {
3896:       if (mdata->cuSpMV[i].initialized) {
3897:         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
3898:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
3899:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
3900:       }
3901:     }
3902: #endif
3903:     delete *matstruct;
3904:     *matstruct = NULL;
3905:   }
3906:   PetscFunctionReturn(PETSC_SUCCESS);
3907: }

3909: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
3910: {
3911:   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

3913:   PetscFunctionBegin;
3914:   if (fs) {
3915:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3916:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3917:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3918:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3919:     delete fs->rpermIndices;
3920:     delete fs->cpermIndices;
3921:     delete fs->workVector;
3922:     fs->rpermIndices = NULL;
3923:     fs->cpermIndices = NULL;
3924:     fs->workVector   = NULL;
3925:     if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
3926:     if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
3927:     fs->init_dev_prop = PETSC_FALSE;
3928: #if CUSPARSE_VERSION >= 11500
3929:     PetscCallCUDA(cudaFree(fs->csrRowPtr));
3930:     PetscCallCUDA(cudaFree(fs->csrColIdx));
3931:     PetscCallCUDA(cudaFree(fs->csrVal));
3932:     PetscCallCUDA(cudaFree(fs->X));
3933:     PetscCallCUDA(cudaFree(fs->Y));
3934:     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares memory with one of spsvBuffer_L/U */
3935:     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3936:     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
3937:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3938:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3939:     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3940:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3941:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3942:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3943:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3944:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3945:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3946:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3947:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3948:     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3949:     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

3951:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3952:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3953: #endif
3954:   }
3955:   PetscFunctionReturn(PETSC_SUCCESS);
3956: }

3958: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3959: {
3960:   PetscFunctionBegin;
3961:   if (*trifactors) {
3962:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3963:     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
3964:     PetscCall(PetscFree(*trifactors));
3965:   }
3966:   PetscFunctionReturn(PETSC_SUCCESS);
3967: }

3969: struct IJCompare {
3970:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3971:   {
3972:     if (t1.get<0>() < t2.get<0>()) return true;
3973:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3974:     return false;
3975:   }
3976: };

3978: struct IJEqual {
3979:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3980:   {
3981:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3982:     return true;
3983:   }
3984: };

3986: struct IJDiff {
3987:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3988: };

3990: struct IJSum {
3991:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3992: };

3994: #include <thrust/iterator/discard_iterator.h>
3995: /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
3996: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3997: {
3998:   Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
3999:   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
4000:   THRUSTARRAY                          *cooPerm_v = NULL;
4001:   thrust::device_ptr<const PetscScalar> d_v;
4002:   CsrMatrix                            *matrix;
4003:   PetscInt                              n;

4005:   PetscFunctionBegin;
4006:   PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
4007:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
4008:   if (!cusp->cooPerm) {
4009:     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
4010:     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
4011:     PetscFunctionReturn(PETSC_SUCCESS);
4012:   }
4013:   matrix = (CsrMatrix *)cusp->mat->mat;
4014:   PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4015:   if (!v) {
4016:     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4017:     goto finalize;
4018:   }
4019:   n = cusp->cooPerm->size();
4020:   if (isCudaMem(v)) {
4021:     d_v = thrust::device_pointer_cast(v);
4022:   } else {
4023:     cooPerm_v = new THRUSTARRAY(n);
4024:     cooPerm_v->assign(v, v + n);
4025:     d_v = cooPerm_v->data();
4026:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4027:   }
4028:   PetscCall(PetscLogGpuTimeBegin());
4029:   if (imode == ADD_VALUES) { /* ADD_VALUES means add to the existing entries */
4030:     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them up */
4031:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4032:       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4033:       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4034:         cooPerm_a = [0,0,1,2,3,4]. Its length is n, the number of entries in d_v[].
4035:         cooPerm_a is sorted; the permuted value d_v[cooPerm[i]] contributes to the cooPerm_a[i]-th unique nonzero.
4036:       */
4037:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4038:       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
4039:       delete cooPerm_w;
4040:     } else {
4041:       /* all nonzeros in d_v[] are unique entries */
4042:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4043:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4044:       thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
4045:     }
4046:   } else {
4047:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4048:       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4049:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4050:     } else {
4051:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
4052:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4053:       thrust::for_each(zibit, zieit, VecCUDAEquals());
4054:     }
4055:   }
4056:   PetscCall(PetscLogGpuTimeEnd());
4057: finalize:
4058:   delete cooPerm_v;
4059:   A->offloadmask = PETSC_OFFLOAD_GPU;
4060:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4061:   /* shorter version of MatAssemblyEnd_SeqAIJ */
4062:   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
4063:   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
4064:   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
4065:   a->reallocs = 0;
4066:   A->info.mallocs += 0;
4067:   A->info.nz_unneeded = 0;
4068:   A->assembled = A->was_assembled = PETSC_TRUE;
4069:   A->num_ass++;
4070:   PetscFunctionReturn(PETSC_SUCCESS);
4071: }

4073: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4074: {
4075:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

4077:   PetscFunctionBegin;
4078:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4079:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4080:   if (destroy) {
4081:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4082:     delete cusp->csr2csc_i;
4083:     cusp->csr2csc_i = NULL;
4084:   }
4085:   A->transupdated = PETSC_FALSE;
4086:   PetscFunctionReturn(PETSC_SUCCESS);
4087: }

4089: #include <thrust/binary_search.h>
4090: /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4091: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4092: {
4093:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4094:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
4095:   PetscInt            cooPerm_n, nzr = 0;

4097:   PetscFunctionBegin;
4098:   PetscCall(PetscLayoutSetUp(A->rmap));
4099:   PetscCall(PetscLayoutSetUp(A->cmap));
4100:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4101:   if (n != cooPerm_n) {
4102:     delete cusp->cooPerm;
4103:     delete cusp->cooPerm_a;
4104:     cusp->cooPerm   = NULL;
4105:     cusp->cooPerm_a = NULL;
4106:   }
4107:   if (n) {
4108:     thrust::device_ptr<PetscInt> d_i, d_j;
4109:     PetscInt                    *d_raw_i, *d_raw_j;
4110:     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4111:     PetscMemType                 imtype, jmtype;

4113:     PetscCall(PetscGetMemType(coo_i, &imtype));
4114:     if (PetscMemTypeHost(imtype)) {
4115:       PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4116:       PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4117:       d_i        = thrust::device_pointer_cast(d_raw_i);
4118:       free_raw_i = PETSC_TRUE;
4119:       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4120:     } else {
4121:       d_i = thrust::device_pointer_cast(coo_i);
4122:     }

4124:     PetscCall(PetscGetMemType(coo_j, &jmtype));
4125:     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4126:       PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4127:       PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4128:       d_j        = thrust::device_pointer_cast(d_raw_j);
4129:       free_raw_j = PETSC_TRUE;
4130:       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4131:     } else {
4132:       d_j = thrust::device_pointer_cast(coo_j);
4133:     }

4135:     THRUSTINTARRAY ii(A->rmap->n);

4137:     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4138:     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

4140:     /* Ex.
4141:       n = 6
4142:       coo_i = [3,3,1,4,1,4]
4143:       coo_j = [3,2,2,5,2,6]
4144:     */
4145:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4146:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

4148:     PetscCall(PetscLogGpuTimeBegin());
4149:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4150:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4151:     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4152:     THRUSTINTARRAY w(d_j, d_j + n);

4154:     /*
4155:       d_i     = [1,1,3,3,4,4]
4156:       d_j     = [2,2,2,3,5,6]
4157:       cooPerm = [2,4,1,0,3,5]
4158:     */
4159:     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

4161:     /*
4162:       d_i     = [1,3,3,4,4,x]
4163:                             ^ekey
4164:       d_j     = [2,2,3,5,6,x]
4165:                            ^nekey
4166:     */
4167:     if (nekey == ekey) { /* all entries are unique */
4168:       delete cusp->cooPerm_a;
4169:       cusp->cooPerm_a = NULL;
4170:     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4171:       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4172:       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4173:       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4174:       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4175:       w[0]                  = 0;
4176:       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4177:       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4178:     }
4179:     thrust::counting_iterator<PetscInt> search_begin(0);
4180:     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4181:                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4182:                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4183:     PetscCall(PetscLogGpuTimeEnd());

4185:     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
4186:     a->singlemalloc = PETSC_FALSE;
4187:     a->free_a       = PETSC_TRUE;
4188:     a->free_ij      = PETSC_TRUE;
4189:     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4190:     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4191:     PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4192:     a->nz = a->maxnz = a->i[A->rmap->n];
4193:     a->rmax          = 0;
4194:     PetscCall(PetscMalloc1(a->nz, &a->a));
4195:     PetscCall(PetscMalloc1(a->nz, &a->j));
4196:     PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4197:     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
4198:     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
4199:     for (PetscInt i = 0; i < A->rmap->n; i++) {
4200:       const PetscInt nnzr = a->i[i + 1] - a->i[i];
4201:       nzr += (PetscInt) !!(nnzr);
4202:       a->ilen[i] = a->imax[i] = nnzr;
4203:       a->rmax                 = PetscMax(a->rmax, nnzr);
4204:     }
4205:     a->nonzerorowcnt = nzr;
4206:     A->preallocated  = PETSC_TRUE;
4207:     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
4208:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4209:     if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4210:     if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
4211:   } else {
4212:     PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
4213:   }
4214:   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

4216:   /* We want to allocate the CUSPARSE struct for matvec now.
4217:      The code is so convoluted now that I prefer to copy zeros */
4218:   PetscCall(PetscArrayzero(a->a, a->nz));
4219:   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
4220:   A->offloadmask = PETSC_OFFLOAD_CPU;
4221:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4222:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
4223:   PetscFunctionReturn(PETSC_SUCCESS);
4224: }

4226: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4227: {
4228:   Mat_SeqAIJ         *seq;
4229:   Mat_SeqAIJCUSPARSE *dev;
4230:   PetscBool           coo_basic = PETSC_TRUE;
4231:   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

4233:   PetscFunctionBegin;
4234:   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
4235:   PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
4236:   if (coo_i) {
4237:     PetscCall(PetscGetMemType(coo_i, &mtype));
4238:     if (PetscMemTypeHost(mtype)) {
4239:       for (PetscCount k = 0; k < coo_n; k++) {
4240:         if (coo_i[k] < 0 || coo_j[k] < 0) {
4241:           coo_basic = PETSC_FALSE;
4242:           break;
4243:         }
4244:       }
4245:     }
4246:   }

4248:   if (coo_basic) { /* i,j are on device or do not contain negative indices */
4249:     PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
4250:   } else {
4251:     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
4252:     mat->offloadmask = PETSC_OFFLOAD_CPU;
4253:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4254:     seq = static_cast<Mat_SeqAIJ *>(mat->data);
4255:     dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4256:     PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
4257:     PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4258:     PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
4259:     PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4260:     dev->use_extended_coo = PETSC_TRUE;
4261:   }
4262:   PetscFunctionReturn(PETSC_SUCCESS);
4263: }

4265: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4266: {
4267:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4268:   const PetscCount grid_size = gridDim.x * blockDim.x;
4269:   for (; i < nnz; i += grid_size) {
4270:     PetscScalar sum = 0.0;
4271:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4272:     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4273:   }
4274: }
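/* Illustrative note (not part of the original source): jmap[] and perm[] encode how the user's COO
   entries map onto the stored nonzeros. For the i-th nonzero, perm[jmap[i]..jmap[i+1]) lists the
   positions in the user-provided v[] that land on that nonzero, so repeated COO entries get summed.
   A minimal sketch with hypothetical values, assuming 4 COO entries where the 2nd and 4th hit the
   same nonzero:

     v    = [10, 1, 20, 2]   // user values, in the order the COO indices were given
     jmap = [0, 1, 3, 4]     // nonzero 0 <- 1 entry, nonzero 1 <- 2 entries, nonzero 2 <- 1 entry
     perm = [0, 1, 3, 2]     // entries 1 and 3 of v[] both map to nonzero 1

   With INSERT_VALUES the kernel above then produces a[] = [10, 1+2, 20]; with ADD_VALUES the sums
   are added to the existing a[]. */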

4276: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4277: {
4278:   Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
4279:   Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4280:   PetscCount          Annz = seq->nz;
4281:   PetscMemType        memtype;
4282:   const PetscScalar  *v1 = v;
4283:   PetscScalar        *Aa;

4285:   PetscFunctionBegin;
4286:   if (dev->use_extended_coo) {
4287:     PetscCall(PetscGetMemType(v, &memtype));
4288:     if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device */
4289:       PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
4290:       PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4291:     }

4293:     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4294:     else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

4296:     if (Annz) {
4297:       MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
4298:       PetscCallCUDA(cudaPeekAtLastError());
4299:     }

4301:     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4302:     else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

4304:     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4305:   } else {
4306:     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
4307:   }
4308:   PetscFunctionReturn(PETSC_SUCCESS);
4309: }
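/* A minimal usage sketch (hypothetical values, PetscCall() error checking omitted for brevity):
   assemble a 2x2 MATSEQAIJCUSPARSE matrix through the COO interface implemented above.

     Mat         A;
     PetscInt    coo_i[] = {0, 0, 1, 1};         // row indices of the COO entries
     PetscInt    coo_j[] = {0, 1, 0, 1};         // column indices of the COO entries
     PetscScalar v[]     = {1.0, 2.0, 3.0, 4.0}; // values, in the same order as coo_i/coo_j

     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, 2, 2, 2, 2);
     MatSetType(A, MATSEQAIJCUSPARSE);
     MatSetPreallocationCOO(A, 4, coo_i, coo_j); // ends up in MatSetPreallocationCOO_SeqAIJCUSPARSE()
     MatSetValuesCOO(A, v, INSERT_VALUES);       // ends up in MatSetValuesCOO_SeqAIJCUSPARSE()
     MatDestroy(&A);
*/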

4311: /*@C
4312:     MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

4314:    Not Collective

4316:     Input Parameters:
4317: +   A - the matrix
4318: -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

4320:     Output Parameters:
4321: +   ia - the CSR row pointers
4322: -   ja - the CSR column indices

4324:     Level: developer

4326:     Note:
4327:       When compressed is true, the CSR structure does not contain empty rows

4329: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4330: @*/
4331: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4332: {
4333:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4334:   CsrMatrix          *csr;
4335:   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

4337:   PetscFunctionBegin;
4339:   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4340:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4341:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4342:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4343:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4344:   csr = (CsrMatrix *)cusp->mat->mat;
4345:   if (i) {
4346:     if (!compressed && a->compressedrow.use) { /* need full row offset */
4347:       if (!cusp->rowoffsets_gpu) {
4348:         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4349:         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4350:         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4351:       }
4352:       *i = cusp->rowoffsets_gpu->data().get();
4353:     } else *i = csr->row_offsets->data().get();
4354:   }
4355:   if (j) *j = csr->column_indices->data().get();
4356:   PetscFunctionReturn(PETSC_SUCCESS);
4357: }

4359: /*@C
4360:     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

4362:    Not Collective

4364:     Input Parameters:
4365: +   A - the matrix
4366: .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4367: .   ia - the CSR row pointers
4368: -   ja - the CSR column indices

4370:     Level: developer

4372: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4373: @*/
4374: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool, const int **i, const int **j)
4375: {
4376:   PetscFunctionBegin;
4378:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4379:   if (i) *i = NULL;
4380:   if (j) *j = NULL;
4381:   PetscFunctionReturn(PETSC_SUCCESS);
4382: }
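/* A minimal usage sketch (illustrative only): fetch the device CSR indices of a MATSEQAIJCUSPARSE
   matrix A, use them (e.g. to drive a custom CUDA kernel), then restore them.

     const int *ia, *ja;
     MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &ia, &ja); // PETSC_FALSE: full (uncompressed) row offsets
     // ia and ja are device pointers; pass them to device code, do not dereference them on the host
     MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &ia, &ja);
*/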

4384: /*@C
4385:    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4387:    Not Collective

4389:    Input Parameter:
4390: .   A - a `MATSEQAIJCUSPARSE` matrix

4392:    Output Parameter:
4393: .   a - pointer to the device data

4395:    Level: developer

4397:    Note:
4398:    May trigger host-device copies if up-to-date matrix data is on host

4400: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4401: @*/
4402: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4403: {
4404:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4405:   CsrMatrix          *csr;

4407:   PetscFunctionBegin;
4410:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4411:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4412:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4413:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4414:   csr = (CsrMatrix *)cusp->mat->mat;
4415:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4416:   *a = csr->values->data().get();
4417:   PetscFunctionReturn(PETSC_SUCCESS);
4418: }

4420: /*@C
4421:    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

4423:    Not Collective

4425:    Input Parameters:
4426: +   A - a `MATSEQAIJCUSPARSE` matrix
4427: -   a - pointer to the device data

4429:    Level: developer

4431: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4432: @*/
4433: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4434: {
4435:   PetscFunctionBegin;
4438:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4439:   *a = NULL;
4440:   PetscFunctionReturn(PETSC_SUCCESS);
4441: }
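/* A minimal usage sketch (illustrative only): read-only access to the device value array of a
   MATSEQAIJCUSPARSE matrix A.

     const PetscScalar *aa;
     MatSeqAIJCUSPARSEGetArrayRead(A, &aa);
     // aa points to the CSR values in device memory; hand it to a kernel or to cuSPARSE/cuBLAS calls
     MatSeqAIJCUSPARSERestoreArrayRead(A, &aa);
*/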

4443: /*@C
4444:    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4446:    Not Collective

4448:    Input Parameter:
4449: .   A - a `MATSEQAIJCUSPARSE` matrix

4451:    Output Parameter:
4452: .   a - pointer to the device data

4454:    Level: developer

4456:    Note:
4457:    May trigger host-device copies if up-to-date matrix data is on host

4459: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4460: @*/
4461: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4462: {
4463:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4464:   CsrMatrix          *csr;

4466:   PetscFunctionBegin;
4469:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4470:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4471:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4472:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4473:   csr = (CsrMatrix *)cusp->mat->mat;
4474:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4475:   *a             = csr->values->data().get();
4476:   A->offloadmask = PETSC_OFFLOAD_GPU;
4477:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4478:   PetscFunctionReturn(PETSC_SUCCESS);
4479: }
4480: /*@C
4481:    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

4483:    Not Collective

4485:    Input Parameters:
4486: +   A - a `MATSEQAIJCUSPARSE` matrix
4487: -   a - pointer to the device data

4489:    Level: developer

4491: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4492: @*/
4493: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4494: {
4495:   PetscFunctionBegin;
4498:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4499:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4500:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4501:   *a = NULL;
4502:   PetscFunctionReturn(PETSC_SUCCESS);
4503: }

4505: /*@C
4506:    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4508:    Not Collective

4510:    Input Parameter:
4511: .   A - a `MATSEQAIJCUSPARSE` matrix

4513:    Output Parameter:
4514: .   a - pointer to the device data

4516:    Level: developer

4518:    Note:
4519:    Does not trigger host-device copies and marks the GPU copy of the data as the valid one (`PETSC_OFFLOAD_GPU`)

4521: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4522: @*/
4523: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4524: {
4525:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4526:   CsrMatrix          *csr;

4528:   PetscFunctionBegin;
4531:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4532:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4533:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4534:   csr = (CsrMatrix *)cusp->mat->mat;
4535:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4536:   *a             = csr->values->data().get();
4537:   A->offloadmask = PETSC_OFFLOAD_GPU;
4538:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4539:   PetscFunctionReturn(PETSC_SUCCESS);
4540: }

4542: /*@C
4543:    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

4545:    Not Collective

4547:    Input Parameters:
4548: +   A - a `MATSEQAIJCUSPARSE` matrix
4549: -   a - pointer to the device data

4551:    Level: developer

4553: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4554: @*/
4555: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4556: {
4557:   PetscFunctionBegin;
4560:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4561:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4562:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4563:   *a = NULL;
4564:   PetscFunctionReturn(PETSC_SUCCESS);
4565: }
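/* A minimal usage sketch (illustrative only): overwrite the device values of a MATSEQAIJCUSPARSE
   matrix A without triggering a host-to-device copy; here every stored nonzero is set to 1.0 with
   thrust, and nz is assumed to be the number of stored nonzeros obtained elsewhere.

     PetscScalar *aa;
     MatSeqAIJCUSPARSEGetArrayWrite(A, &aa);
     thrust::fill(thrust::device, aa, aa + nz, (PetscScalar)1.0);
     MatSeqAIJCUSPARSERestoreArrayWrite(A, &aa);
*/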

4567: struct IJCompare4 {
4568:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4569:   {
4570:     if (t1.get<0>() < t2.get<0>()) return true;
4571:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4572:     return false;
4573:   }
4574: };

4576: struct Shift {
4577:   int _shift;

4579:   Shift(int shift) : _shift(shift) { }
4580:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4581: };

4583: /* merges two SeqAIJCUSPARSE matrices A, B side by side, concatenating the corresponding rows, i.e. C = [A, B] ([A';B']' in MATLAB notation) */
4584: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4585: {
4586:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4587:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4588:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4589:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4590:   PetscInt                      Annz, Bnnz;
4591:   cusparseStatus_t              stat;
4592:   PetscInt                      i, m, n, zero = 0;

4594:   PetscFunctionBegin;
4598:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4599:   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4600:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4601:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4602:   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4603:   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4604:   if (reuse == MAT_INITIAL_MATRIX) {
4605:     m = A->rmap->n;
4606:     n = A->cmap->n + B->cmap->n;
4607:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4608:     PetscCall(MatSetSizes(*C, m, n, m, n));
4609:     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4610:     c                       = (Mat_SeqAIJ *)(*C)->data;
4611:     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4612:     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4613:     Ccsr                    = new CsrMatrix;
4614:     Cmat->cprowIndices      = NULL;
4615:     c->compressedrow.use    = PETSC_FALSE;
4616:     c->compressedrow.nrows  = 0;
4617:     c->compressedrow.i      = NULL;
4618:     c->compressedrow.rindex = NULL;
4619:     Ccusp->workVector       = NULL;
4620:     Ccusp->nrows            = m;
4621:     Ccusp->mat              = Cmat;
4622:     Ccusp->mat->mat         = Ccsr;
4623:     Ccsr->num_rows          = m;
4624:     Ccsr->num_cols          = n;
4625:     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4626:     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4627:     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4628:     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4629:     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4630:     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4631:     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4632:     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4633:     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4634:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4635:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4636:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4637:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

4639:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4640:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4641:     Annz                 = (PetscInt)Acsr->column_indices->size();
4642:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4643:     c->nz                = Annz + Bnnz;
4644:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4645:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4646:     Ccsr->values         = new THRUSTARRAY(c->nz);
4647:     Ccsr->num_entries    = c->nz;
4648:     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4649:     if (c->nz) {
4650:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4651:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4652:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4653:       THRUSTINTARRAY32 *Aroff, *Broff;

4655:       if (a->compressedrow.use) { /* need full row offset */
4656:         if (!Acusp->rowoffsets_gpu) {
4657:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4658:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4659:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4660:         }
4661:         Aroff = Acusp->rowoffsets_gpu;
4662:       } else Aroff = Acsr->row_offsets;
4663:       if (b->compressedrow.use) { /* need full row offset */
4664:         if (!Bcusp->rowoffsets_gpu) {
4665:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4666:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4667:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4668:         }
4669:         Broff = Bcusp->rowoffsets_gpu;
4670:       } else Broff = Bcsr->row_offsets;
4671:       PetscCall(PetscLogGpuTimeBegin());
4672:       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4673:       PetscCallCUSPARSE(stat);
4674:       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4675:       PetscCallCUSPARSE(stat);
4676:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4677:       auto Aperm = thrust::make_constant_iterator(1);
4678:       auto Bperm = thrust::make_constant_iterator(0);
4679: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4680:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4681:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4682: #else
4683:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4684:       auto Bcib = Bcsr->column_indices->begin();
4685:       auto Bcie = Bcsr->column_indices->end();
4686:       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4687: #endif
4688:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4689:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4690:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4691:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4692:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4693:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4694:       auto p1    = Ccusp->cooPerm->begin();
4695:       auto p2    = Ccusp->cooPerm->begin();
4696:       thrust::advance(p2, Annz);
4697:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4698: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4699:       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4700: #endif
4701:       auto cci = thrust::make_counting_iterator(zero);
4702:       auto cce = thrust::make_counting_iterator(c->nz);
4703: #if 0 //Errors on SUMMIT cuda 11.1.0
4704:       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4705: #else
4706:       auto pred = thrust::identity<int>();
4707:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4708:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4709: #endif
4710:       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4711:       PetscCallCUSPARSE(stat);
4712:       PetscCall(PetscLogGpuTimeEnd());
4713:       delete wPerm;
4714:       delete Acoo;
4715:       delete Bcoo;
4716:       delete Ccoo;
4717: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4718:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4719:       PetscCallCUSPARSE(stat);
4720: #endif
4721:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4722:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4723:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4724:         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4725:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4726:         CsrMatrix                    *CcsrT = new CsrMatrix;
4727:         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4728:         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4730:         (*C)->form_explicit_transpose = PETSC_TRUE;
4731:         (*C)->transupdated            = PETSC_TRUE;
4732:         Ccusp->rowoffsets_gpu         = NULL;
4733:         CmatT->cprowIndices           = NULL;
4734:         CmatT->mat                    = CcsrT;
4735:         CcsrT->num_rows               = n;
4736:         CcsrT->num_cols               = m;
4737:         CcsrT->num_entries            = c->nz;

4739:         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4740:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4741:         CcsrT->values         = new THRUSTARRAY(c->nz);

4743:         PetscCall(PetscLogGpuTimeBegin());
4744:         auto rT = CcsrT->row_offsets->begin();
4745:         if (AT) {
4746:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4747:           thrust::advance(rT, -1);
4748:         }
4749:         if (BT) {
4750:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4751:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4752:           thrust::copy(titb, tite, rT);
4753:         }
4754:         auto cT = CcsrT->column_indices->begin();
4755:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4756:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4757:         auto vT = CcsrT->values->begin();
4758:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4759:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4760:         PetscCall(PetscLogGpuTimeEnd());

4762:         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4763:         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4764:         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4765:         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
4766:         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
4767:         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
4768:         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4769:         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4770:         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4771: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4772:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4773:         PetscCallCUSPARSE(stat);
4774: #endif
4775:         Ccusp->matTranspose = CmatT;
4776:       }
4777:     }

4779:     c->singlemalloc = PETSC_FALSE;
4780:     c->free_a       = PETSC_TRUE;
4781:     c->free_ij      = PETSC_TRUE;
4782:     PetscCall(PetscMalloc1(m + 1, &c->i));
4783:     PetscCall(PetscMalloc1(c->nz, &c->j));
4784:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4785:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4786:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4787:       ii = *Ccsr->row_offsets;
4788:       jj = *Ccsr->column_indices;
4789:       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4790:       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4791:     } else {
4792:       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4793:       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4794:     }
4795:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4796:     PetscCall(PetscMalloc1(m, &c->ilen));
4797:     PetscCall(PetscMalloc1(m, &c->imax));
4798:     c->maxnz         = c->nz;
4799:     c->nonzerorowcnt = 0;
4800:     c->rmax          = 0;
4801:     for (i = 0; i < m; i++) {
4802:       const PetscInt nn = c->i[i + 1] - c->i[i];
4803:       c->ilen[i] = c->imax[i] = nn;
4804:       c->nonzerorowcnt += (PetscInt) !!nn;
4805:       c->rmax = PetscMax(c->rmax, nn);
4806:     }
4807:     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4808:     PetscCall(PetscMalloc1(c->nz, &c->a));
4809:     (*C)->nonzerostate++;
4810:     PetscCall(PetscLayoutSetUp((*C)->rmap));
4811:     PetscCall(PetscLayoutSetUp((*C)->cmap));
4812:     Ccusp->nonzerostate = (*C)->nonzerostate;
4813:     (*C)->preallocated  = PETSC_TRUE;
4814:   } else {
4815:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4816:     c = (Mat_SeqAIJ *)(*C)->data;
4817:     if (c->nz) {
4818:       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4819:       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4820:       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4821:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4822:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4823:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4824:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4825:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4826:       Acsr = (CsrMatrix *)Acusp->mat->mat;
4827:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4828:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4829:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4830:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4831:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4832:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4833:       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4834:       auto pmid = Ccusp->cooPerm->begin();
4835:       thrust::advance(pmid, Acsr->num_entries);
4836:       PetscCall(PetscLogGpuTimeBegin());
4837:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
4838:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4839:       thrust::for_each(zibait, zieait, VecCUDAEquals());
4840:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4841:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4842:       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4843:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4844:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4845:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4846:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4847:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4848:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4849:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4850:         auto       vT    = CcsrT->values->begin();
4851:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4852:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4853:         (*C)->transupdated = PETSC_TRUE;
4854:       }
4855:       PetscCall(PetscLogGpuTimeEnd());
4856:     }
4857:   }
4858:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4859:   (*C)->assembled     = PETSC_TRUE;
4860:   (*C)->was_assembled = PETSC_FALSE;
4861:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4862:   PetscFunctionReturn(PETSC_SUCCESS);
4863: }
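/* A minimal usage sketch (illustrative only): concatenate two MATSEQAIJCUSPARSE matrices A (m x nA)
   and B (m x nB) into C = [A, B] of size m x (nA + nB), then refresh the values of C after A and/or B
   changed numerically while keeping their nonzero pattern.

     Mat C;
     MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C); // allocate and fill C
     // ... modify the values (not the pattern) of A and/or B ...
     MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C);   // reuse the structure, copy new values
     MatDestroy(&C);
*/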

4865: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4866: {
4867:   bool               dmem;
4868:   const PetscScalar *av;

4870:   PetscFunctionBegin;
4871:   dmem = isCudaMem(v);
4872:   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4873:   if (n && idx) {
4874:     THRUSTINTARRAY widx(n);
4875:     widx.assign(idx, idx + n);
4876:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

4878:     THRUSTARRAY                    *w = NULL;
4879:     thrust::device_ptr<PetscScalar> dv;
4880:     if (dmem) {
4881:       dv = thrust::device_pointer_cast(v);
4882:     } else {
4883:       w  = new THRUSTARRAY(n);
4884:       dv = w->data();
4885:     }
4886:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4888:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4889:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4890:     thrust::for_each(zibit, zieit, VecCUDAEquals());
4891:     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4892:     delete w;
4893:   } else {
4894:     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4895:   }
4896:   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4897:   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4898:   PetscFunctionReturn(PETSC_SUCCESS);
4899: }