Actual source code: aijhipsparse.hip.cpp

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the HIPSPARSE library.
  4:   Portions of this code are under:
  5:   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  6: */
  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
 14: #include <thrust/adjacent_difference.h>
 15: #include <thrust/iterator/transform_iterator.h>
 16: #if PETSC_CPP_VERSION >= 14
 17:   #define PETSC_HAVE_THRUST_ASYNC 1
 18:   #include <thrust/async/for_each.h>
 19: #endif
 20: #include <thrust/iterator/constant_iterator.h>
 21: #include <thrust/iterator/discard_iterator.h>
 22: #include <thrust/binary_search.h>
 23: #include <thrust/remove.h>
 24: #include <thrust/sort.h>
 25: #include <thrust/unique.h>

 27: const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
 28: const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
 29: const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
 30: //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};

 32: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 33: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 34: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 35: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 36: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 37: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 38: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 39: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
 40: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 41: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
 42: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
 43: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
 44: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
 45: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 46: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 47: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 48: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 49: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 50: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 51: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
 52: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 53: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
 54: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
 55: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
 56: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat_SeqAIJHIPSPARSE **);
 57: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
 58: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
 59: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
 60: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 61: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
 62: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 63: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);

 65: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(Mat, Mat, Mat, PetscBool, PetscBool);
 66: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
 67: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
 68: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse_band(Mat, MatFactorType, Mat *);

 70: /*
 71: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
 72: {
 73:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 75:   PetscFunctionBegin;
 76:   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
 77:   hipsparsestruct->stream = stream;
 78:   PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
 79:   PetscFunctionReturn(PETSC_SUCCESS);
 80: }

 82: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
 83: {
 84:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 86:   PetscFunctionBegin;
 87:   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
 88:   if (hipsparsestruct->handle != handle) {
 89:     if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
 90:     hipsparsestruct->handle = handle;
 91:   }
 92:   PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
 93:   PetscFunctionReturn(PETSC_SUCCESS);
 94: }

 96: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
 97: {
 98:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
 99:   PetscBool            flg;

101:   PetscFunctionBegin;
102:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
103:   if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
104:   if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
105:   PetscFunctionReturn(PETSC_SUCCESS);
106: }
107: */

109: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
110: {
111:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

113:   PetscFunctionBegin;
114:   switch (op) {
115:   case MAT_HIPSPARSE_MULT:
116:     hipsparsestruct->format = format;
117:     break;
118:   case MAT_HIPSPARSE_ALL:
119:     hipsparsestruct->format = format;
120:     break;
121:   default:
122:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
123:   }
124:   PetscFunctionReturn(PETSC_SUCCESS);
125: }

127: /*@
128:    MatHIPSPARSESetFormat - Sets the storage format of `MATSEQAIJHIPSPARSE` matrices for a particular
129:    operation. Only the `MatMult()` operation can use different GPU storage formats.

131:    Not Collective

133:    Input Parameters:
134: +  A - Matrix of type `MATSEQAIJHIPSPARSE`
135: .  op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
136:          `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
137: -  format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`).

139:    Level: intermediate

141: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
142: @*/
143: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
144: {
145:   PetscFunctionBegin;
147:   PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
148:   PetscFunctionReturn(PETSC_SUCCESS);
149: }
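
/* A minimal usage sketch for MatHIPSPARSESetFormat(); the sizes and preallocation below are
   illustrative and error checking is omitted:

     Mat A;
     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, n, n, n, n);
     MatSetType(A, MATSEQAIJHIPSPARSE);
     MatSeqAIJSetPreallocation(A, 5, NULL);
     // ... insert values and assemble A ...
     MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, MAT_HIPSPARSE_ELL); // use ELL storage for MatMult()

   The same choice can be made at runtime with -mat_hipsparse_mult_storage_format ell. */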

151: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
152: {
153:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

155:   PetscFunctionBegin;
156:   hipsparsestruct->use_cpu_solve = use_cpu;
157:   PetscFunctionReturn(PETSC_SUCCESS);
158: }

160: /*@
161:    MatHIPSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

163:    Input Parameters:
164: +  A - Matrix of type `MATSEQAIJHIPSPARSE`
165: -  use_cpu - flag for using the built-in CPU `MatSolve()`

167:    Level: intermediate

169:    Notes:
170:    The hipSPARSE LU solver currently computes the factors with the built-in CPU method
171:    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
172:    This method specifies whether the solve is done on the CPU or the GPU (the GPU is the default).

174: .seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
175: @*/
176: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
177: {
178:   PetscFunctionBegin;
180:   PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
181:   PetscFunctionReturn(PETSC_SUCCESS);
182: }
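
/* A minimal sketch of selecting the CPU (I)LU solve, assuming A is already a MATSEQAIJHIPSPARSE
   factor matrix (error checking omitted):

     MatHIPSPARSESetUseCPUSolve(A, PETSC_TRUE); // keep the factors on the CPU for MatSolve()

   The runtime option -mat_hipsparse_use_cpu_solve selects the same behavior. */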

184: PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
185: {
186:   PetscFunctionBegin;
187:   switch (op) {
188:   case MAT_FORM_EXPLICIT_TRANSPOSE:
189:     /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
190:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
191:     A->form_explicit_transpose = flg;
192:     break;
193:   default:
194:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
195:     break;
196:   }
197:   PetscFunctionReturn(PETSC_SUCCESS);
198: }

200: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
201: {
202:   PetscBool            row_identity, col_identity;
203:   Mat_SeqAIJ          *b     = (Mat_SeqAIJ *)B->data;
204:   IS                   isrow = b->row, iscol = b->col;
205:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;

207:   PetscFunctionBegin;
208:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
209:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
210:   B->offloadmask = PETSC_OFFLOAD_CPU;
211:   /* determine which version of MatSolve needs to be used. */
212:   PetscCall(ISIdentity(isrow, &row_identity));
213:   PetscCall(ISIdentity(iscol, &col_identity));
214:   if (!hipsparsestruct->use_cpu_solve) {
215:     if (row_identity && col_identity) {
216:       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
217:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
218:     } else {
219:       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE;
220:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
221:     }
222:   }
223:   B->ops->matsolve          = NULL;
224:   B->ops->matsolvetranspose = NULL;

226:   /* get the triangular factors */
227:   if (!hipsparsestruct->use_cpu_solve) { PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B)); }
228:   PetscFunctionReturn(PETSC_SUCCESS);
229: }

231: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
232: {
233:   MatHIPSPARSEStorageFormat format;
234:   PetscBool                 flg;
235:   Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

237:   PetscFunctionBegin;
238:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
239:   if (A->factortype == MAT_FACTOR_NONE) {
240:     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
241:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
242:     PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
243:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
244:     PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
245:     if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
246:     PetscCall(
247:       PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
248:     /* If the user set this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
249:     PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
250:     PetscCall(
251:       PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
252:     PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
253:     /*
254:     PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
255:     PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
256:     */
257:   }
258:   PetscOptionsHeadEnd();
259:   PetscFunctionReturn(PETSC_SUCCESS);
260: }
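
/* An illustrative sketch of the runtime options handled above; option values are matched against
   the string lists defined at the top of this file:

     -mat_hipsparse_mult_storage_format ell   (storage format used for SpMV only)
     -mat_hipsparse_storage_format hyb        (storage format used for SpMV and TriSolve)
     -mat_hipsparse_use_cpu_solve             (perform the (I)LU solve on the CPU)
     -mat_hipsparse_spmv_alg spmv_csr_alg1    (hipSPARSE SpMV algorithm)
     -mat_hipsparse_spmm_alg csr_alg2         (hipSPARSE SpMM algorithm)
*/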

262: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
263: {
264:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
265:   PetscInt                            n                   = A->rmap->n;
266:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
267:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
268:   const PetscInt                     *ai = a->i, *aj = a->j, *vi;
269:   const MatScalar                    *aa = a->a, *v;
270:   PetscInt                           *AiLo, *AjLo;
271:   PetscInt                            i, nz, nzLower, offset, rowOffset;

273:   PetscFunctionBegin;
274:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
275:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
276:     try {
277:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
278:       nzLower = n + ai[n] - ai[1];
279:       if (!loTriFactor) {
280:         PetscScalar *AALo;
281:         PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));

283:         /* Allocate Space for the lower triangular matrix */
284:         PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
285:         PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));

287:         /* Fill the lower triangular matrix */
288:         AiLo[0]   = (PetscInt)0;
289:         AiLo[n]   = nzLower;
290:         AjLo[0]   = (PetscInt)0;
291:         AALo[0]   = (MatScalar)1.0;
292:         v         = aa;
293:         vi        = aj;
294:         offset    = 1;
295:         rowOffset = 1;
296:         for (i = 1; i < n; i++) {
297:           nz = ai[i + 1] - ai[i];
298:           /* additional 1 for the term on the diagonal */
299:           AiLo[i] = rowOffset;
300:           rowOffset += nz + 1;

302:           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
303:           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
304:           offset += nz;
305:           AjLo[offset] = (PetscInt)i;
306:           AALo[offset] = (MatScalar)1.0;
307:           offset += 1;
308:           v += nz;
309:           vi += nz;
310:         }

312:         /* allocate space for the triangular factor information */
313:         PetscCall(PetscNew(&loTriFactor));
314:         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
315:         /* Create the matrix description */
316:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
317:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
318:         PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
319:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
320:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

322:         /* set the operation */
323:         loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

325:         /* set the matrix */
326:         loTriFactor->csrMat                 = new CsrMatrix;
327:         loTriFactor->csrMat->num_rows       = n;
328:         loTriFactor->csrMat->num_cols       = n;
329:         loTriFactor->csrMat->num_entries    = nzLower;
330:         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
331:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
332:         loTriFactor->csrMat->values         = new THRUSTARRAY(nzLower);

334:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
335:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
336:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

338:         /* Create the solve analysis information */
339:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
340:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
341:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
342:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
343:         PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

345:         /* perform the solve analysis */
346:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
347:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

349:         PetscCallHIP(WaitForHIP());
350:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

352:         /* assign the pointer */
353:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
354:         loTriFactor->AA_h                                           = AALo;
355:         PetscCallHIP(hipHostFree(AiLo));
356:         PetscCallHIP(hipHostFree(AjLo));
357:         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
358:       } else { /* update values only */
359:         if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
360:         /* Fill the lower triangular matrix */
361:         loTriFactor->AA_h[0] = 1.0;
362:         v                    = aa;
363:         vi                   = aj;
364:         offset               = 1;
365:         for (i = 1; i < n; i++) {
366:           nz = ai[i + 1] - ai[i];
367:           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
368:           offset += nz;
369:           loTriFactor->AA_h[offset] = 1.0;
370:           offset += 1;
371:           v += nz;
372:         }
373:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
374:         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
375:       }
376:     } catch (char *ex) {
377:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
378:     }
379:   }
380:   PetscFunctionReturn(PETSC_SUCCESS);
381: }

383: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
384: {
385:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
386:   PetscInt                            n                   = A->rmap->n;
387:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
388:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
389:   const PetscInt                     *aj = a->j, *adiag = a->diag, *vi;
390:   const MatScalar                    *aa = a->a, *v;
391:   PetscInt                           *AiUp, *AjUp;
392:   PetscInt                            i, nz, nzUpper, offset;

394:   PetscFunctionBegin;
395:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
396:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
397:     try {
398:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
399:       nzUpper = adiag[0] - adiag[n];
400:       if (!upTriFactor) {
401:         PetscScalar *AAUp;
402:         PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

404:         /* Allocate Space for the upper triangular matrix */
405:         PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
406:         PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

408:         /* Fill the upper triangular matrix */
409:         AiUp[0] = (PetscInt)0;
410:         AiUp[n] = nzUpper;
411:         offset  = nzUpper;
412:         for (i = n - 1; i >= 0; i--) {
413:           v  = aa + adiag[i + 1] + 1;
414:           vi = aj + adiag[i + 1] + 1;
415:           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
416:           offset -= (nz + 1);               /* decrement the offset */

418:           /* first, set the diagonal elements */
419:           AjUp[offset] = (PetscInt)i;
420:           AAUp[offset] = (MatScalar)1. / v[nz];
421:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

423:           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
424:           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
425:         }

427:         /* allocate space for the triangular factor information */
428:         PetscCall(PetscNew(&upTriFactor));
429:         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

431:         /* Create the matrix description */
432:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
433:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
434:         PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
435:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
436:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

438:         /* set the operation */
439:         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

441:         /* set the matrix */
442:         upTriFactor->csrMat                 = new CsrMatrix;
443:         upTriFactor->csrMat->num_rows       = n;
444:         upTriFactor->csrMat->num_cols       = n;
445:         upTriFactor->csrMat->num_entries    = nzUpper;
446:         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
447:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
448:         upTriFactor->csrMat->values         = new THRUSTARRAY(nzUpper);
449:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
450:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
451:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

453:         /* Create the solve analysis information */
454:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
455:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
456:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
457:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
458:         PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

460:         /* perform the solve analysis */
461:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
462:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

464:         PetscCallHIP(WaitForHIP());
465:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

467:         /* assign the pointer */
468:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
469:         upTriFactor->AA_h                                           = AAUp;
470:         PetscCallHIP(hipHostFree(AiUp));
471:         PetscCallHIP(hipHostFree(AjUp));
472:         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
473:       } else {
474:         if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
475:         /* Fill the upper triangular matrix */
476:         offset = nzUpper;
477:         for (i = n - 1; i >= 0; i--) {
478:           v  = aa + adiag[i + 1] + 1;
479:           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
480:           offset -= (nz + 1);               /* decrement the offset */

482:           /* first, set the diagonal elements */
483:           upTriFactor->AA_h[offset] = 1. / v[nz];
484:           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
485:         }
486:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
487:         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
488:       }
489:     } catch (char *ex) {
490:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
491:     }
492:   }
493:   PetscFunctionReturn(PETSC_SUCCESS);
494: }

496: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
497: {
498:   PetscBool                      row_identity, col_identity;
499:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
500:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
501:   IS                             isrow = a->row, iscol = a->icol;
502:   PetscInt                       n = A->rmap->n;

504:   PetscFunctionBegin;
505:   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
506:   PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
507:   PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));

509:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
510:   hipsparseTriFactors->nnz = a->nz;

512:   A->offloadmask = PETSC_OFFLOAD_BOTH;
513:   /* lower triangular indices */
514:   PetscCall(ISIdentity(isrow, &row_identity));
515:   if (!row_identity && !hipsparseTriFactors->rpermIndices) {
516:     const PetscInt *r;

518:     PetscCall(ISGetIndices(isrow, &r));
519:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
520:     hipsparseTriFactors->rpermIndices->assign(r, r + n);
521:     PetscCall(ISRestoreIndices(isrow, &r));
522:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
523:   }
524:   /* upper triangular indices */
525:   PetscCall(ISIdentity(iscol, &col_identity));
526:   if (!col_identity && !hipsparseTriFactors->cpermIndices) {
527:     const PetscInt *c;

529:     PetscCall(ISGetIndices(iscol, &c));
530:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
531:     hipsparseTriFactors->cpermIndices->assign(c, c + n);
532:     PetscCall(ISRestoreIndices(iscol, &c));
533:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
534:   }
535:   PetscFunctionReturn(PETSC_SUCCESS);
536: }

538: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
539: {
540:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
541:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
542:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
543:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
544:   PetscInt                           *AiUp, *AjUp;
545:   PetscScalar                        *AAUp;
546:   PetscScalar                        *AALo;
547:   PetscInt                            nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
548:   Mat_SeqSBAIJ                       *b  = (Mat_SeqSBAIJ *)A->data;
549:   const PetscInt                     *ai = b->i, *aj = b->j, *vj;
550:   const MatScalar                    *aa = b->a, *v;

552:   PetscFunctionBegin;
553:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
554:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
555:     try {
556:       PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
557:       PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
558:       if (!upTriFactor && !loTriFactor) {
559:         /* Allocate Space for the upper triangular matrix */
560:         PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
561:         PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

563:         /* Fill the upper triangular matrix */
564:         AiUp[0] = (PetscInt)0;
565:         AiUp[n] = nzUpper;
566:         offset  = 0;
567:         for (i = 0; i < n; i++) {
568:           /* set the pointers */
569:           v  = aa + ai[i];
570:           vj = aj + ai[i];
571:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

573:           /* first, set the diagonal elements */
574:           AjUp[offset] = (PetscInt)i;
575:           AAUp[offset] = (MatScalar)1.0 / v[nz];
576:           AiUp[i]      = offset;
577:           AALo[offset] = (MatScalar)1.0 / v[nz];

579:           offset += 1;
580:           if (nz > 0) {
581:             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
582:             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
583:             for (j = offset; j < offset + nz; j++) {
584:               AAUp[j] = -AAUp[j];
585:               AALo[j] = AAUp[j] / v[nz];
586:             }
587:             offset += nz;
588:           }
589:         }

591:         /* allocate space for the triangular factor information */
592:         PetscCall(PetscNew(&upTriFactor));
593:         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

595:         /* Create the matrix description */
596:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
597:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
598:         PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
599:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
600:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

602:         /* set the matrix */
603:         upTriFactor->csrMat                 = new CsrMatrix;
604:         upTriFactor->csrMat->num_rows       = A->rmap->n;
605:         upTriFactor->csrMat->num_cols       = A->cmap->n;
606:         upTriFactor->csrMat->num_entries    = a->nz;
607:         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
608:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
609:         upTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
610:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
611:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
612:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

614:         /* set the operation */
615:         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

617:         /* Create the solve analysis information */
618:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
619:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
620:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
621:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
622:         PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

624:         /* perform the solve analysis */
625:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
626:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

628:         PetscCallHIP(WaitForHIP());
629:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

631:         /* assign the pointer */
632:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

634:         /* allocate space for the triangular factor information */
635:         PetscCall(PetscNew(&loTriFactor));
636:         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

638:         /* Create the matrix description */
639:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
640:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
641:         PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
642:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
643:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

645:         /* set the operation */
646:         loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;

648:         /* set the matrix */
649:         loTriFactor->csrMat                 = new CsrMatrix;
650:         loTriFactor->csrMat->num_rows       = A->rmap->n;
651:         loTriFactor->csrMat->num_cols       = A->cmap->n;
652:         loTriFactor->csrMat->num_entries    = a->nz;
653:         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
654:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
655:         loTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
656:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
657:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
658:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

660:         /* Create the solve analysis information */
661:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
662:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
663:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
664:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
665:         PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

667:         /* perform the solve analysis */
668:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
669:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

671:         PetscCallHIP(WaitForHIP());
672:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

674:         /* assign the pointer */
675:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

677:         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
678:         PetscCallHIP(hipHostFree(AiUp));
679:         PetscCallHIP(hipHostFree(AjUp));
680:       } else {
681:         /* Fill the upper triangular matrix */
682:         offset = 0;
683:         for (i = 0; i < n; i++) {
684:           /* set the pointers */
685:           v  = aa + ai[i];
686:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

688:           /* first, set the diagonal elements */
689:           AAUp[offset] = 1.0 / v[nz];
690:           AALo[offset] = 1.0 / v[nz];

692:           offset += 1;
693:           if (nz > 0) {
694:             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
695:             for (j = offset; j < offset + nz; j++) {
696:               AAUp[j] = -AAUp[j];
697:               AALo[j] = AAUp[j] / v[nz];
698:             }
699:             offset += nz;
700:           }
701:         }
702:         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
703:         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
704:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
705:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
706:         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
707:       }
708:       PetscCallHIP(hipHostFree(AAUp));
709:       PetscCallHIP(hipHostFree(AALo));
710:     } catch (char *ex) {
711:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
712:     }
713:   }
714:   PetscFunctionReturn(PETSC_SUCCESS);
715: }

717: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
718: {
719:   PetscBool                      perm_identity;
720:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
721:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
722:   IS                             ip                  = a->row;
723:   PetscInt                       n                   = A->rmap->n;

725:   PetscFunctionBegin;
726:   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
727:   PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
728:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
729:   hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;

731:   A->offloadmask = PETSC_OFFLOAD_BOTH;
732:   /* lower triangular indices */
733:   PetscCall(ISIdentity(ip, &perm_identity));
734:   if (!perm_identity) {
735:     IS              iip;
736:     const PetscInt *irip, *rip;

738:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
739:     PetscCall(ISGetIndices(iip, &irip));
740:     PetscCall(ISGetIndices(ip, &rip));
741:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
742:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
743:     hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
744:     hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
745:     PetscCall(ISRestoreIndices(iip, &irip));
746:     PetscCall(ISDestroy(&iip));
747:     PetscCall(ISRestoreIndices(ip, &rip));
748:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
749:   }
750:   PetscFunctionReturn(PETSC_SUCCESS);
751: }

753: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
754: {
755:   PetscBool   perm_identity;
756:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
757:   IS          ip = b->row;

759:   PetscFunctionBegin;
760:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
761:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
762:   B->offloadmask = PETSC_OFFLOAD_CPU;
763:   /* determine which version of MatSolve needs to be used. */
764:   PetscCall(ISIdentity(ip, &perm_identity));
765:   if (perm_identity) {
766:     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
767:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
768:     B->ops->matsolve          = NULL;
769:     B->ops->matsolvetranspose = NULL;
770:   } else {
771:     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE;
772:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE;
773:     B->ops->matsolve          = NULL;
774:     B->ops->matsolvetranspose = NULL;
775:   }

777:   /* get the triangular factors */
778:   PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
779:   PetscFunctionReturn(PETSC_SUCCESS);
780: }

782: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
783: {
784:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
785:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
786:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
787:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
788:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
789:   hipsparseIndexBase_t                indexBase;
790:   hipsparseMatrixType_t               matrixType;
791:   hipsparseFillMode_t                 fillMode;
792:   hipsparseDiagType_t                 diagType;

794:   PetscFunctionBegin;
795:   /* allocate space for the transpose of the lower triangular factor */
796:   PetscCall(PetscNew(&loTriFactorT));
797:   loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

799:   /* set the matrix descriptors of the lower triangular factor */
800:   matrixType = hipsparseGetMatType(loTriFactor->descr);
801:   indexBase  = hipsparseGetMatIndexBase(loTriFactor->descr);
802:   fillMode   = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
803:   diagType   = hipsparseGetMatDiagType(loTriFactor->descr);

805:   /* Create the matrix description */
806:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
807:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
808:   PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
809:   PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
810:   PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));

812:   /* set the operation */
813:   loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

815:   /* allocate GPU space for the CSC of the lower triangular factor*/
816:   loTriFactorT->csrMat                 = new CsrMatrix;
817:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
818:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
819:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
820:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
821:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
822:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

824:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
825:   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
826: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
827:   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
828:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
829:                                                   loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
830:   PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
831: #endif
832: */
833:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

835:   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows,
836:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
837:                           loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
838:                           loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
839: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
840:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
841:                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
842: #else
843:                                        loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
844: #endif

846:   PetscCallHIP(WaitForHIP());
847:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

849:   /* Create the solve analysis information */
850:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
851:   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
852:   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp,
853:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
854:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
855:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
856:                                 &loTriFactorT->solveBufferSize));
857:   PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));

859:   /* perform the solve analysis */
860:   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp,
861:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
862:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
863:                            loTriFactorT->csrMat->column_indices->data().get(),
864:                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

866:   PetscCallHIP(WaitForHIP());
867:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

869:   /* assign the pointer */
870:   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

872:   /*********************************************/
873:   /* Now the Transpose of the Upper Tri Factor */
874:   /*********************************************/

876:   /* allocate space for the transpose of the upper triangular factor */
877:   PetscCall(PetscNew(&upTriFactorT));
878:   upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

880:   /* set the matrix descriptors of the upper triangular factor */
881:   matrixType = hipsparseGetMatType(upTriFactor->descr);
882:   indexBase  = hipsparseGetMatIndexBase(upTriFactor->descr);
883:   fillMode   = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
884:   diagType   = hipsparseGetMatDiagType(upTriFactor->descr);

886:   /* Create the matrix description */
887:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
888:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
889:   PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
890:   PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
891:   PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));

893:   /* set the operation */
894:   upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

896:   /* allocate GPU space for the CSC of the upper triangular factor*/
897:   upTriFactorT->csrMat                 = new CsrMatrix;
898:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
899:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
900:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
901:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
902:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
903:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

905:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
906:   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
907: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
908:   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
909:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
910:                                                   upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
911:   PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
912: #endif
913: */
914:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
915:   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows,
916:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
917:                           upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
918:                           upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
919: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
920:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
921:                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
922: #else
923:                                        upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
924: #endif

926:   PetscCallHIP(WaitForHIP());
927:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

929:   /* Create the solve analysis information */
930:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
931:   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
932:   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp,
933:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
934:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
935:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
936:                                  &upTriFactorT->solveBufferSize));
937:   PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));

939:   /* perform the solve analysis */
940:   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp,
941:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
942:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
943:                            upTriFactorT->csrMat->column_indices->data().get(),
944:                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

946:   PetscCallHIP(WaitForHIP());
947:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

949:   /* assign the pointer */
950:   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
951:   PetscFunctionReturn(PETSC_SUCCESS);
952: }

954: struct PetscScalarToPetscInt {
955:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
956: };

958: static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
959: {
960:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
961:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
962:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
963:   hipsparseIndexBase_t           indexBase;

965:   PetscFunctionBegin;
966:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
967:   matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
968:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
969:   matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
970:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
971:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
972:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
973:   PetscCall(PetscLogGpuTimeBegin());
974:   if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
975:   if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
976:     matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
977:     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
978:     indexBase = hipsparseGetMatIndexBase(matstruct->descr);
979:     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
980:     PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

982:     /* set alpha and beta */
983:     PetscCallHIP(hipMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
984:     PetscCallHIP(hipMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
985:     PetscCallHIP(hipMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
986:     PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
987:     PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
988:     PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

990:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
991:       CsrMatrix *matrixT      = new CsrMatrix;
992:       matstructT->mat         = matrixT;
993:       matrixT->num_rows       = A->cmap->n;
994:       matrixT->num_cols       = A->rmap->n;
995:       matrixT->num_entries    = a->nz;
996:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
997:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
998:       matrixT->values         = new THRUSTARRAY(a->nz);

1000:       if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1001:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

1003:       PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1004:                                             indexBase, hipsparse_scalartype));
1005:     } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
1006:       CsrMatrix *temp  = new CsrMatrix;
1007:       CsrMatrix *tempT = new CsrMatrix;
1008:       /* First convert HYB to CSR */
1009:       temp->num_rows       = A->rmap->n;
1010:       temp->num_cols       = A->cmap->n;
1011:       temp->num_entries    = a->nz;
1012:       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1013:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1014:       temp->values         = new THRUSTARRAY(a->nz);

1016:       PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));

1018:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1019:       tempT->num_rows       = A->rmap->n;
1020:       tempT->num_cols       = A->cmap->n;
1021:       tempT->num_entries    = a->nz;
1022:       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1023:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1024:       tempT->values         = new THRUSTARRAY(a->nz);

1026:       PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1027:                                            tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

1029:       /* Last, convert CSC to HYB */
1030:       hipsparseHybMat_t hybMat;
1031:       PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
1032:       hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
1033:       PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));

1035:       /* assign the pointer */
1036:       matstructT->mat = hybMat;
1037:       A->transupdated = PETSC_TRUE;
1038:       /* delete temporaries */
1039:       if (tempT) {
1040:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1041:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1042:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1043:         delete (CsrMatrix *)tempT;
1044:       }
1045:       if (temp) {
1046:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1047:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1048:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1049:         delete (CsrMatrix *)temp;
1050:       }
1051:     }
1052:   }
1053:   if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1054:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1055:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1056:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1057:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1058:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1059:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1060:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1061:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1062:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1063:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1064:     if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1065:       hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1066:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1067:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1068:     }
1069:     if (!hipsparsestruct->csr2csc_i) {
1070:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1071:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1073:       indexBase = hipsparseGetMatIndexBase(matstruct->descr);
1074:       if (matrix->num_entries) {
1075:         /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
1076:            Need to verify this for ROCm.
1077:         */
1078:         PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1079:                                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1080:       } else {
1081:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1082:       }

1084:       hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1085:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1086:     }
1087:     PetscCallThrust(
1088:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1089:   }
1090:   PetscCall(PetscLogGpuTimeEnd());
1091:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
1092:   /* the compressed row indices are not used for matTranspose */
1093:   matstructT->cprowIndices = NULL;
1094:   /* assign the pointer */
1095:   ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
1096:   A->transupdated                                 = PETSC_TRUE;
1097:   PetscFunctionReturn(PETSC_SUCCESS);
1098: }

1100: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
1101: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1102: {
1103:   PetscInt                              n = xx->map->n;
1104:   const PetscScalar                    *barray;
1105:   PetscScalar                          *xarray;
1106:   thrust::device_ptr<const PetscScalar> bGPU;
1107:   thrust::device_ptr<PetscScalar>       xGPU;
1108:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1109:   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1110:   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1111:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1113:   PetscFunctionBegin;
1114:   /* Analyze the matrix and create the transpose ... on the fly */
1115:   if (!loTriFactorT && !upTriFactorT) {
1116:     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1117:     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1118:     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1119:   }

1121:   /* Get the GPU pointers */
1122:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1123:   PetscCall(VecHIPGetArrayRead(bb, &barray));
1124:   xGPU = thrust::device_pointer_cast(xarray);
1125:   bGPU = thrust::device_pointer_cast(barray);

1127:   PetscCall(PetscLogGpuTimeBegin());
1128:   /* First, reorder with the row permutation */
1129:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

1131:   /* Next, solve U */
1132:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1133:                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1135:   /* Then, solve L */
1136:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1137:                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1139:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1140:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

1142:   /* Copy the temporary to the full solution. */
1143:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

1145:   /* restore */
1146:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1147:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1148:   PetscCall(PetscLogGpuTimeEnd());
1149:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1150:   PetscFunctionReturn(PETSC_SUCCESS);
1151: }

1153: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1154: {
1155:   const PetscScalar                  *barray;
1156:   PetscScalar                        *xarray;
1157:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1158:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1159:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1160:   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1162:   PetscFunctionBegin;
1163:   /* Analyze the matrix and create the transpose ... on the fly */
1164:   if (!loTriFactorT && !upTriFactorT) {
1165:     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1166:     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1167:     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1168:   }

1170:   /* Get the GPU pointers */
1171:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1172:   PetscCall(VecHIPGetArrayRead(bb, &barray));

1174:   PetscCall(PetscLogGpuTimeBegin());
1175:   /* First, solve U */
1176:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1177:                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1179:   /* Then, solve L */
1180:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1181:                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1183:   /* restore */
1184:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1185:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1186:   PetscCall(PetscLogGpuTimeEnd());
1187:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1188:   PetscFunctionReturn(PETSC_SUCCESS);
1189: }
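/* Outline of the permuted solve below (a sketch of what the calls amount to): the factors were
   computed for the row/column permuted matrix, so with rpermIndices and cpermIndices denoting the
   stored permutations, MatSolve gathers b with the row permutation, solves L then U with csrsv,
   and finally gathers the result with the column permutation. The NaturalOrdering variant skips
   both permutations. */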

1191: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1192: {
1193:   const PetscScalar                    *barray;
1194:   PetscScalar                          *xarray;
1195:   thrust::device_ptr<const PetscScalar> bGPU;
1196:   thrust::device_ptr<PetscScalar>       xGPU;
1197:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1198:   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1199:   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1200:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1202:   PetscFunctionBegin;
1203:   /* Get the GPU pointers */
1204:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1205:   PetscCall(VecHIPGetArrayRead(bb, &barray));
1206:   xGPU = thrust::device_pointer_cast(xarray);
1207:   bGPU = thrust::device_pointer_cast(barray);

1209:   PetscCall(PetscLogGpuTimeBegin());
1210:   /* First, reorder with the row permutation */
1211:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());

1213:   /* Next, solve L */
1214:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1215:                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1217:   /* Then, solve U */
1218:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1219:                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1221:   /* Last, reorder with the column permutation */
1222:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

1224:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1225:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1226:   PetscCall(PetscLogGpuTimeEnd());
1227:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1228:   PetscFunctionReturn(PETSC_SUCCESS);
1229: }

1231: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1232: {
1233:   const PetscScalar                  *barray;
1234:   PetscScalar                        *xarray;
1235:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1236:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1237:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1238:   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1240:   PetscFunctionBegin;
1241:   /* Get the GPU pointers */
1242:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1243:   PetscCall(VecHIPGetArrayRead(bb, &barray));

1245:   PetscCall(PetscLogGpuTimeBegin());
1246:   /* First, solve L */
1247:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1248:                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1250:   /* Next, solve U */
1251:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1252:                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1254:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1255:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1256:   PetscCall(PetscLogGpuTimeEnd());
1257:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1258:   PetscFunctionReturn(PETSC_SUCCESS);
1259: }

1261: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1262: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
1263: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1264: {
1265:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1266:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1267:   const PetscScalar             *barray;
1268:   PetscScalar                   *xarray;

1270:   PetscFunctionBegin;
1271:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1272:   PetscCall(VecHIPGetArrayRead(b, &barray));
1273:   PetscCall(PetscLogGpuTimeBegin());

1275:   /* Solve L*y = b */
1276:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1277:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1278:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                                     /* L Y = X */
1279:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!

1281:   /* Solve U*x = y */
1282:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1283:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1284:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1286:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1287:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

1289:   PetscCall(PetscLogGpuTimeEnd());
1290:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1291:   PetscFunctionReturn(PETSC_SUCCESS);
1292: }
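/* Transpose solve with the same ILU(0) factors: since A is approximated by L*U, A^T = U^T * L^T,
   so the solve below is U^T y = b followed by L^T x = y. It reuses spMatDescr_U and spMatDescr_L
   with HIPSPARSE_OPERATION_TRANSPOSE, creating the dedicated _Ut/_Lt SpSV descriptors and buffers
   on first use. */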

1294: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1295: {
1296:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1297:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1298:   const PetscScalar             *barray;
1299:   PetscScalar                   *xarray;

1301:   PetscFunctionBegin;
1302:   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1303:     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1304:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1305:                                                 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1307:     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
1308:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1309:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1310:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1311:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1312:   }

1314:   if (!fs->updatedTransposeSpSVAnalysis) {
1315:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1317:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1318:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1319:   }

1321:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1322:   PetscCall(VecHIPGetArrayRead(b, &barray));
1323:   PetscCall(PetscLogGpuTimeBegin());

1325:   /* Solve Ut*y = b */
1326:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1327:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1328:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1329:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));

1331:   /* Solve Lt*x = y */
1332:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1333:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1334:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1336:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1337:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1338:   PetscCall(PetscLogGpuTimeEnd());
1339:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1340:   PetscFunctionReturn(PETSC_SUCCESS);
1341: }

1343: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1344: {
1345:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1346:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1347:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1348:   CsrMatrix                     *Acsr;
1349:   PetscInt                       m, nz;
1350:   PetscBool                      flg;

1352:   PetscFunctionBegin;
1353:   if (PetscDefined(USE_DEBUG)) {
1354:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1355:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1356:   }

1358:   /* Copy A's value to fact */
1359:   m  = fact->rmap->n;
1360:   nz = aij->nz;
1361:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1362:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1363:   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1365:   /* Factorize fact inplace */
1366:   if (m)
1367:     PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1368:                                           fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1369:   if (PetscDefined(USE_DEBUG)) {
1370:     int               numerical_zero;
1371:     hipsparseStatus_t status;
1372:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1373:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1374:   }

1376:   /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1377:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1379:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1381:   /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1382:   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

1384:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1385:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ILU0;
1386:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1387:   fact->ops->matsolve          = NULL;
1388:   fact->ops->matsolvetranspose = NULL;
1389:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1390:   PetscFunctionReturn(PETSC_SUCCESS);
1391: }

1393: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1394: {
1395:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1396:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1397:   PetscInt                       m, nz;

1399:   PetscFunctionBegin;
1400:   if (PetscDefined(USE_DEBUG)) {
1401:     PetscInt  i;
1402:     PetscBool flg, missing;

1404:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1405:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1406:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1407:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1408:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1409:   }

1411:   /* Free the old stale stuff */
1412:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

1414:   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host;
1415:      they will not be used, and are only allocated to make debugging easier.
1416:    */
1417:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1419:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1420:   fact->factortype             = MAT_FACTOR_ILU;
1421:   fact->info.factor_mallocs    = 0;
1422:   fact->info.fill_ratio_given  = info->fill;
1423:   fact->info.fill_ratio_needed = 1.0;

1425:   aij->row = NULL;
1426:   aij->col = NULL;

1428:   /* ====================================================================== */
1429:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1430:   /* We'll do in-place factorization on fact                                */
1431:   /* ====================================================================== */
1432:   const int *Ai, *Aj;

1434:   m  = fact->rmap->n;
1435:   nz = aij->nz;

1437:   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1438:   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1439:   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1440:   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1441:   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1442:   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1444:   /* ====================================================================== */
1445:   /* Create descriptors for M, L, U                                         */
1446:   /* ====================================================================== */
1447:   hipsparseFillMode_t fillMode;
1448:   hipsparseDiagType_t diagType;

1450:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1451:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1452:   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

1454:   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1455:     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1456:     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1457:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1458:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1459:   */
1460:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1461:   diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1462:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1463:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1464:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1466:   fillMode = HIPSPARSE_FILL_MODE_UPPER;
1467:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1468:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1469:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1470:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1472:   /* ========================================================================= */
1473:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1474:   /* ========================================================================= */
1475:   PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1476:   if (m)
1477:     PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1478:                                                      fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

1480:   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1481:   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1483:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1484:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

1486:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1487:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1489:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1490:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

1492:   /* It appears spsvBuffer_L and spsvBuffer_U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of them.
1493:      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1494:    */
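  /* For example (illustrative sizes only): with spsvBufferSize_L = 3 MB, spsvBufferSize_U = 1 MB and
     factBufferSize_M = 2 MB, the branch below allocates factBuffer_M with max(3 MB, 2 MB) = 3 MB,
     aliases it as spsvBuffer_L, and makes a separate 1 MB allocation for spsvBuffer_U. */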
1495:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1496:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1497:     fs->spsvBuffer_L = fs->factBuffer_M;
1498:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1499:   } else {
1500:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1501:     fs->spsvBuffer_U = fs->factBuffer_M;
1502:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1503:   }

1505:   /* ========================================================================== */
1506:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1507:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1508:   /* ========================================================================== */
1509:   int structural_zero;

1511:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1512:   if (m)
1513:     PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1514:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1515:   if (PetscDefined(USE_DEBUG)) {
1516:     /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1517:     hipsparseStatus_t status;
1518:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1519:     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1520:   }

1522:   /* Estimate FLOPs of the numeric factorization */
1523:   {
1524:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1525:     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1526:     PetscLogDouble flops = 0.0;

1528:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1529:     Ai    = Aseq->i;
1530:     Adiag = Aseq->diag;
1531:     for (PetscInt i = 0; i < m; i++) {
1532:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1533:         nzRow  = Ai[i + 1] - Ai[i];
1534:         nzLeft = Adiag[i] - Ai[i];
1535:         /* We eliminate the nonzeros to the left of the diagonal one at a time. Assume each elimination
1536:            updates the nonzeros to the right of, and including, the eliminated entry, and that each update
1537:            costs one multiplication and one addition. */
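        /* With that assumption, elimination k (k = 1..nzLeft) updates nzRow - k + 1 entries at 2 flops each,
           so the row costs sum_{k=1..nzLeft} 2*(nzRow - k + 1) = nzLeft*(2.0*nzRow - nzLeft + 1) flops,
           which is the expression accumulated below. */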
1538:         nzLeft = (nzRow - 1) / 2;
1539:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1540:       }
1541:     }
1542:     fs->numericFactFlops = flops;
1543:   }
1544:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1545:   PetscFunctionReturn(PETSC_SUCCESS);
1546: }

1548: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1549: {
1550:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1551:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1552:   const PetscScalar             *barray;
1553:   PetscScalar                   *xarray;

1555:   PetscFunctionBegin;
1556:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1557:   PetscCall(VecHIPGetArrayRead(b, &barray));
1558:   PetscCall(PetscLogGpuTimeBegin());

1560:   /* Solve L*y = b */
1561:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1562:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1563:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1564:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1566:   /* Solve Lt*x = y */
1567:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1568:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1569:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1571:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1572:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

1574:   PetscCall(PetscLogGpuTimeEnd());
1575:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1576:   PetscFunctionReturn(PETSC_SUCCESS);
1577: }

1579: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1580: {
1581:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1582:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1583:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1584:   CsrMatrix                     *Acsr;
1585:   PetscInt                       m, nz;
1586:   PetscBool                      flg;

1588:   PetscFunctionBegin;
1589:   if (PetscDefined(USE_DEBUG)) {
1590:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1591:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1592:   }

1594:   /* Copy A's value to fact */
1595:   m  = fact->rmap->n;
1596:   nz = aij->nz;
1597:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1598:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1599:   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1601:   /* Factorize fact inplace */
1602:   /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1603:      The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1604:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1605:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1606:    */
1607:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1608:   if (PetscDefined(USE_DEBUG)) {
1609:     int               numerical_zero;
1610:     hipsparseStatus_t status;
1611:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1612:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1613:   }

1615:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1617:   /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1618:     ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1619:   */
1620:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1622:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1623:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
1624:   fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0;
1625:   fact->ops->matsolve          = NULL;
1626:   fact->ops->matsolvetranspose = NULL;
1627:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1628:   PetscFunctionReturn(PETSC_SUCCESS);
1629: }

1631: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1632: {
1633:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1634:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1635:   PetscInt                       m, nz;

1637:   PetscFunctionBegin;
1638:   if (PetscDefined(USE_DEBUG)) {
1639:     PetscInt  i;
1640:     PetscBool flg, missing;

1642:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1643:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1644:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1645:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1646:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1647:   }

1649:   /* Free the old stale stuff */
1650:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

1652:   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host;
1653:      they will not be used, and are only allocated to make debugging easier.
1654:    */
1655:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1657:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1658:   fact->factortype             = MAT_FACTOR_ICC;
1659:   fact->info.factor_mallocs    = 0;
1660:   fact->info.fill_ratio_given  = info->fill;
1661:   fact->info.fill_ratio_needed = 1.0;

1663:   aij->row = NULL;
1664:   aij->col = NULL;

1666:   /* ====================================================================== */
1667:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1668:   /* We'll do in-place factorization on fact                                */
1669:   /* ====================================================================== */
1670:   const int *Ai, *Aj;

1672:   m  = fact->rmap->n;
1673:   nz = aij->nz;

1675:   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1676:   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1677:   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1678:   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1679:   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1680:   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1682:   /* ====================================================================== */
1683:   /* Create mat descriptors for M, L                                        */
1684:   /* ====================================================================== */
1685:   hipsparseFillMode_t fillMode;
1686:   hipsparseDiagType_t diagType;

1688:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1689:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1690:   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

1692:   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1693:     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1694:     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1695:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1696:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1697:   */
1698:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1699:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1700:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1701:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1702:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1704:   /* ========================================================================= */
1705:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1706:   /* ========================================================================= */
1707:   PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1708:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

1710:   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1711:   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1713:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1714:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

1716:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1717:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1719:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1720:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1722:   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1723:      See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1724:    */
1725:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1726:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1727:     fs->spsvBuffer_L = fs->factBuffer_M;
1728:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1729:   } else {
1730:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1731:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1732:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1733:   }

1735:   /* ========================================================================== */
1736:   /* Perform analysis of ic0 on M                                               */
1737:   /* The lower triangular part of M has the same sparsity pattern as L          */
1738:   /* ========================================================================== */
1739:   int structural_zero;

1741:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1742:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1743:   if (PetscDefined(USE_DEBUG)) {
1744:     hipsparseStatus_t status;
1745:     /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1746:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1747:     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1748:   }

1750:   /* Estimate FLOPs of the numeric factorization */
1751:   {
1752:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1753:     PetscInt      *Ai, nzRow, nzLeft;
1754:     PetscLogDouble flops = 0.0;

1756:     Ai = Aseq->i;
1757:     for (PetscInt i = 0; i < m; i++) {
1758:       nzRow = Ai[i + 1] - Ai[i];
1759:       if (nzRow > 1) {
1760:         /* We eliminate the nonzeros to the left of the diagonal one at a time. Assume each elimination
1761:            updates the nonzeros to the right of, and including, the eliminated entry, and that each update
1762:            costs one multiplication and one addition. */
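        /* Same closed form as in MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(): nzLeft*(2.0*nzRow - nzLeft + 1) flops per row. */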
1763:         nzLeft = (nzRow - 1) / 2;
1764:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1765:       }
1766:     }
1767:     fs->numericFactFlops = flops;
1768:   }
1769:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1770:   PetscFunctionReturn(PETSC_SUCCESS);
1771: }
1772: #endif

1774: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1775: {
1776:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1778:   PetscFunctionBegin;
1779: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1780:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1781:   if (hipsparseTriFactors->factorizeOnDevice) {
1782:     PetscCall(ISIdentity(isrow, &row_identity));
1783:     PetscCall(ISIdentity(iscol, &col_identity));
1784:   }
1785:   if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1786:   else
1787: #endif
1788:   {
1789:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1790:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1791:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1792:   }
1793:   PetscFunctionReturn(PETSC_SUCCESS);
1794: }

1796: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1797: {
1798:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1800:   PetscFunctionBegin;
1801:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1802:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1803:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1804:   PetscFunctionReturn(PETSC_SUCCESS);
1805: }

1807: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1808: {
1809:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1811:   PetscFunctionBegin;
1812: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1813:   PetscBool perm_identity = PETSC_FALSE;
1814:   if (hipsparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1815:   if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1816:   else
1817: #endif
1818:   {
1819:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1820:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1821:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1822:   }
1823:   PetscFunctionReturn(PETSC_SUCCESS);
1824: }

1826: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1827: {
1828:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1830:   PetscFunctionBegin;
1831:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1832:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1833:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1834:   PetscFunctionReturn(PETSC_SUCCESS);
1835: }

1837: PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1838: {
1839:   PetscFunctionBegin;
1840:   *type = MATSOLVERHIPSPARSE;
1841:   PetscFunctionReturn(PETSC_SUCCESS);
1842: }

1844: /*MC
1845:   MATSOLVERHIPSPARSE = "hipsparse" - A matrix solver type providing triangular solvers for sequential matrices
1846:   of type `MATSEQAIJHIPSPARSE` on a single GPU. The currently supported
1847:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1848:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1849:   hipSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
1850:   algorithms are not recommended. This class does NOT support direct solver operations.

1852:   Level: beginner

1854: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1855: M*/
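/*
   A minimal usage sketch (not part of this file's code): an application would typically select this solver
   through PCFactorSetMatSolverType() or the equivalent options-database key, assuming A has already been
   assembled as a MATSEQAIJHIPSPARSE matrix.

     KSP ksp;
     PC  pc;
     PetscCall(KSPCreate(PETSC_COMM_SELF, &ksp));
     PetscCall(KSPSetOperators(ksp, A, A));
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERHIPSPARSE));
     PetscCall(KSPSetFromOptions(ksp));

   or, equivalently, run with -pc_type ilu -pc_factor_mat_solver_type hipsparse.
*/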

1857: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1858: {
1859:   PetscInt  n = A->rmap->n;
1860:   PetscBool factOnDevice, factOnHost;
1861:   char     *prefix;
1862:   char      factPlace[32] = "device"; /* the default */

1864:   PetscFunctionBegin;
1865:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1866:   PetscCall(MatSetSizes(*B, n, n, n, n));
1867:   (*B)->factortype = ftype;
1868:   PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));

1870:   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1871:   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
1872:   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
1873:   PetscOptionsEnd();
1874:   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
1875:   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
1876:   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
1877:   ((Mat_SeqAIJHIPSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

1879:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1880:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1881:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1882:     if (!A->boundtocpu) {
1883:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1884:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1885:     } else {
1886:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1887:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1888:     }
1889:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1890:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1891:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1892:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1893:     if (!A->boundtocpu) {
1894:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1895:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1896:     } else {
1897:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1898:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1899:     }
1900:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1901:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1902:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

1904:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1905:   (*B)->canuseordering = PETSC_TRUE;
1906:   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1907:   PetscFunctionReturn(PETSC_SUCCESS);
1908: }
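/* Usage note (a sketch, not exercised in this file): the default factorization place is "device"; running with
     -mat_factor_bind_factorization host
   sets factorizeOnDevice to PETSC_FALSE, which steers the symbolic routines above away from the device
   ILU(0)/ICC(0) fast path. */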

1910: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1911: {
1912:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
1913:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1914: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1915:   Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1916: #endif

1918:   PetscFunctionBegin;
1919:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1920:     PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1921:     if (A->factortype == MAT_FACTOR_NONE) {
1922:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1923:       PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1924:     }
1925: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1926:     else if (fs->csrVal) {
1927:       /* We have a factorized matrix on device and are able to copy it to host */
1928:       PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1929:     }
1930: #endif
1931:     else
1932:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1933:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1934:     PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1935:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1936:   }
1937:   PetscFunctionReturn(PETSC_SUCCESS);
1938: }

1940: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1941: {
1942:   PetscFunctionBegin;
1943:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1944:   *array = ((Mat_SeqAIJ *)A->data)->a;
1945:   PetscFunctionReturn(PETSC_SUCCESS);
1946: }

1948: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1949: {
1950:   PetscFunctionBegin;
1951:   A->offloadmask = PETSC_OFFLOAD_CPU;
1952:   *array         = NULL;
1953:   PetscFunctionReturn(PETSC_SUCCESS);
1954: }

1956: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1957: {
1958:   PetscFunctionBegin;
1959:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1960:   *array = ((Mat_SeqAIJ *)A->data)->a;
1961:   PetscFunctionReturn(PETSC_SUCCESS);
1962: }

1964: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1965: {
1966:   PetscFunctionBegin;
1967:   *array = NULL;
1968:   PetscFunctionReturn(PETSC_SUCCESS);
1969: }

1971: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1972: {
1973:   PetscFunctionBegin;
1974:   *array = ((Mat_SeqAIJ *)A->data)->a;
1975:   PetscFunctionReturn(PETSC_SUCCESS);
1976: }

1978: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1979: {
1980:   PetscFunctionBegin;
1981:   A->offloadmask = PETSC_OFFLOAD_CPU;
1982:   *array         = NULL;
1983:   PetscFunctionReturn(PETSC_SUCCESS);
1984: }
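/* A minimal sketch of the accessor contract implemented above (hypothetical caller code, not part of this
   file): a read access pulls the current values back from the GPU, while relinquishing write access marks the
   host copy as authoritative so the next device operation re-uploads it.

     PetscScalar *vals;
     PetscCall(MatSeqAIJGetArray(A, &vals));     // triggers MatSeqAIJHIPSPARSECopyFromGPU() if needed
     vals[0] *= 2.0;                             // modify on the host
     PetscCall(MatSeqAIJRestoreArray(A, &vals)); // offloadmask becomes PETSC_OFFLOAD_CPU
*/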

1986: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1987: {
1988:   Mat_SeqAIJHIPSPARSE *cusp;
1989:   CsrMatrix           *matrix;

1991:   PetscFunctionBegin;
1992:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1993:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1994:   cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1995:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1996:   matrix = (CsrMatrix *)cusp->mat->mat;

1998:   if (i) {
1999: #if !defined(PETSC_USE_64BIT_INDICES)
2000:     *i = matrix->row_offsets->data().get();
2001: #else
2002:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE does not support 64-bit indices");
2003: #endif
2004:   }
2005:   if (j) {
2006: #if !defined(PETSC_USE_64BIT_INDICES)
2007:     *j = matrix->column_indices->data().get();
2008: #else
2009:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE does not support 64-bit indices");
2010: #endif
2011:   }
2012:   if (a) *a = matrix->values->data().get();
2013:   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2014:   PetscFunctionReturn(PETSC_SUCCESS);
2015: }
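/* A minimal sketch of how the routine above can be consumed (illustrative only): when mtype reports device
   memory, the returned CSR arrays live on the GPU and can be handed directly to HIP kernels or hipSPARSE calls.

     const PetscInt *ia, *ja;
     PetscScalar    *aa;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &ia, &ja, &aa, &mtype));
     if (PetscMemTypeDevice(mtype)) {
       // launch a HIP kernel or hipSPARSE routine on (ia, ja, aa) ...
     }
*/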

2017: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
2018: {
2019:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2020:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
2021:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
2022:   PetscBool                      both            = PETSC_TRUE;
2023:   PetscInt                       m               = A->rmap->n, *ii, *ridx, tmp;

2025:   PetscFunctionBegin;
2026:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2027:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2028:     if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
2029:       CsrMatrix *matrix;
2030:       matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

2032:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2033:       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2034:       matrix->values->assign(a->a, a->a + a->nz);
2035:       PetscCallHIP(WaitForHIP());
2036:       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
2037:       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2038:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
2039:     } else {
2040:       PetscInt nnz;
2041:       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2042:       PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
2043:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
2044:       delete hipsparsestruct->workVector;
2045:       delete hipsparsestruct->rowoffsets_gpu;
2046:       hipsparsestruct->workVector     = NULL;
2047:       hipsparsestruct->rowoffsets_gpu = NULL;
2048:       try {
2049:         if (a->compressedrow.use) {
2050:           m    = a->compressedrow.nrows;
2051:           ii   = a->compressedrow.i;
2052:           ridx = a->compressedrow.rindex;
2053:         } else {
2054:           m    = A->rmap->n;
2055:           ii   = a->i;
2056:           ridx = NULL;
2057:         }
2058:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2059:         if (!a->a) {
2060:           nnz  = ii[m];
2061:           both = PETSC_FALSE;
2062:         } else nnz = a->nz;
2063:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

2065:         /* create hipsparse matrix */
2066:         hipsparsestruct->nrows = m;
2067:         matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
2068:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
2069:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
2070:         PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

2072:         PetscCallHIP(hipMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
2073:         PetscCallHIP(hipMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
2074:         PetscCallHIP(hipMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2075:         PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2076:         PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2077:         PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2078:         PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));

2080:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2081:         if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2082:           /* set the matrix */
2083:           CsrMatrix *mat      = new CsrMatrix;
2084:           mat->num_rows       = m;
2085:           mat->num_cols       = A->cmap->n;
2086:           mat->num_entries    = nnz;
2087:           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2088:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2089:           mat->values         = new THRUSTARRAY(nnz);
2090:           mat->row_offsets->assign(ii, ii + m + 1);
2091:           mat->column_indices->assign(a->j, a->j + nnz);
2092:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2094:           /* assign the pointer */
2095:           matstruct->mat = mat;
2096:           if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2097:             PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2098:                                                   HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2099:           }
2100:         } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
2101:           CsrMatrix *mat      = new CsrMatrix;
2102:           mat->num_rows       = m;
2103:           mat->num_cols       = A->cmap->n;
2104:           mat->num_entries    = nnz;
2105:           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2106:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2107:           mat->values         = new THRUSTARRAY(nnz);
2108:           mat->row_offsets->assign(ii, ii + m + 1);
2109:           mat->column_indices->assign(a->j, a->j + nnz);
2110:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2112:           hipsparseHybMat_t hybMat;
2113:           PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
2114:           hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2115:           PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
2116:           /* assign the pointer */
2117:           matstruct->mat = hybMat;

2119:           if (mat) {
2120:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2121:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2122:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2123:             delete (CsrMatrix *)mat;
2124:           }
2125:         }

2127:         /* assign the compressed row indices */
2128:         if (a->compressedrow.use) {
2129:           hipsparsestruct->workVector = new THRUSTARRAY(m);
2130:           matstruct->cprowIndices     = new THRUSTINTARRAY(m);
2131:           matstruct->cprowIndices->assign(ridx, ridx + m);
2132:           tmp = m;
2133:         } else {
2134:           hipsparsestruct->workVector = NULL;
2135:           matstruct->cprowIndices     = NULL;
2136:           tmp                         = 0;
2137:         }
2138:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

2140:         /* assign the pointer */
2141:         hipsparsestruct->mat = matstruct;
2142:       } catch (char *ex) {
2143:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2144:       }
2145:       PetscCallHIP(WaitForHIP());
2146:       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2147:       hipsparsestruct->nonzerostate = A->nonzerostate;
2148:     }
2149:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2150:   }
2151:   PetscFunctionReturn(PETSC_SUCCESS);
2152: }
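/* A minimal sketch of how an application selects the ELL/HYB storage path handled above; MatHIPSPARSESetFormat()
   is documented for this class, while the MAT_HIPSPARSE_MULT operation value and the option name below are
   assumptions mirroring the CUSPARSE equivalents:

     PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, MAT_HIPSPARSE_ELL));

   or from the command line, e.g. -mat_hipsparse_storage_format ell (assumed option name). */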

2154: struct VecHIPPlusEquals {
2155:   template <typename Tuple>
2156:   __host__ __device__ void operator()(Tuple t)
2157:   {
2158:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2159:   }
2160: };

2162: struct VecHIPEquals {
2163:   template <typename Tuple>
2164:   __host__ __device__ void operator()(Tuple t)
2165:   {
2166:     thrust::get<1>(t) = thrust::get<0>(t);
2167:   }
2168: };

2170: struct VecHIPEqualsReverse {
2171:   template <typename Tuple>
2172:   __host__ __device__ void operator()(Tuple t)
2173:   {
2174:     thrust::get<0>(t) = thrust::get<1>(t);
2175:   }
2176: };
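/* A minimal sketch of how functors like the ones above are applied (illustrative; src/dst are hypothetical
   device iterators): a zip iterator pairs the two sequences and thrust::for_each applies the functor to each
   tuple, so VecHIPPlusEquals() performs dst[k] += src[k].

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src_begin, dst_begin)),
                      thrust::make_zip_iterator(thrust::make_tuple(src_end, dst_end)),
                      VecHIPPlusEquals());
*/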

2178: struct MatMatHipsparse {
2179:   PetscBool             cisdense;
2180:   PetscScalar          *Bt;
2181:   Mat                   X;
2182:   PetscBool             reusesym; /* hipSPARSE does not have split symbolic and numeric phases for sparse matmat operations */
2183:   PetscLogDouble        flops;
2184:   CsrMatrix            *Bcsr;
2185:   hipsparseSpMatDescr_t matSpBDescr;
2186:   PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
2187:   hipsparseDnMatDescr_t matBDescr;
2188:   hipsparseDnMatDescr_t matCDescr;
2189:   PetscInt              Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2190: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2191:   void *dBuffer4, *dBuffer5;
2192: #endif
2193:   size_t                 mmBufferSize;
2194:   void                  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2195:   hipsparseSpGEMMDescr_t spgemmDesc;
2196: };

2198: static PetscErrorCode MatDestroy_MatMatHipsparse(void *data)
2199: {
2200:   MatMatHipsparse *mmdata = (MatMatHipsparse *)data;

2202:   PetscFunctionBegin;
2203:   PetscCallHIP(hipFree(mmdata->Bt));
2204:   delete mmdata->Bcsr;
2205:   if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2206:   if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2207:   if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2208:   if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2209: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2210:   if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2211:   if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2212: #endif
2213:   if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2214:   if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2215:   PetscCall(MatDestroy(&mmdata->X));
2216:   PetscCall(PetscFree(data));
2217:   PetscFunctionReturn(PETSC_SUCCESS);
2218: }

2220: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2221: {
2222:   Mat_Product                   *product = C->product;
2223:   Mat                            A, B;
2224:   PetscInt                       m, n, blda, clda;
2225:   PetscBool                      flg, biship;
2226:   Mat_SeqAIJHIPSPARSE           *cusp;
2227:   hipsparseOperation_t           opA;
2228:   const PetscScalar             *barray;
2229:   PetscScalar                   *carray;
2230:   MatMatHipsparse               *mmdata;
2231:   Mat_SeqAIJHIPSPARSEMultStruct *mat;
2232:   CsrMatrix                     *csrmat;

2234:   PetscFunctionBegin;
2235:   MatCheckProduct(C, 1);
2236:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2237:   mmdata = (MatMatHipsparse *)product->data;
2238:   A      = product->A;
2239:   B      = product->B;
2240:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2241:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2242:   /* Currently CopyToGpu does not copy if the matrix is bound to the CPU.
2243:      Instead of silently accepting the wrong answer, I prefer to raise an error */
2244:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2245:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2246:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2247:   switch (product->type) {
2248:   case MATPRODUCT_AB:
2249:   case MATPRODUCT_PtAP:
2250:     mat = cusp->mat;
2251:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2252:     m   = A->rmap->n;
2253:     n   = B->cmap->n;
2254:     break;
2255:   case MATPRODUCT_AtB:
2256:     if (!A->form_explicit_transpose) {
2257:       mat = cusp->mat;
2258:       opA = HIPSPARSE_OPERATION_TRANSPOSE;
2259:     } else {
2260:       PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2261:       mat = cusp->matTranspose;
2262:       opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2263:     }
2264:     m = A->cmap->n;
2265:     n = B->cmap->n;
2266:     break;
2267:   case MATPRODUCT_ABt:
2268:   case MATPRODUCT_RARt:
2269:     mat = cusp->mat;
2270:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2271:     m   = A->rmap->n;
2272:     n   = B->rmap->n;
2273:     break;
2274:   default:
2275:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2276:   }
2277:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
2278:   csrmat = (CsrMatrix *)mat->mat;
2279:   /* if the user passed a CPU matrix, copy the data to the GPU */
2280:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
2281:   if (!biship) { PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B)); }
2282:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2283:   PetscCall(MatDenseGetLDA(B, &blda));
2284:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2285:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2286:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2287:   } else {
2288:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2289:     PetscCall(MatDenseGetLDA(C, &clda));
2290:   }

2292:   PetscCall(PetscLogGpuTimeBegin());
2293:   hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2294:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2295:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2296:     size_t mmBufferSize;
2297:     if (mmdata->initialized && mmdata->Blda != blda) {
2298:       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2299:       mmdata->matBDescr = NULL;
2300:     }
2301:     if (!mmdata->matBDescr) {
2302:       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2303:       mmdata->Blda = blda;
2304:     }
2305:     if (mmdata->initialized && mmdata->Clda != clda) {
2306:       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2307:       mmdata->matCDescr = NULL;
2308:     }
2309:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2310:       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2311:       mmdata->Clda = clda;
2312:     }
2313:     if (!mat->matDescr) {
2314:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2315:                                             HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2316:     }
2317:     PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2318:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2319:       PetscCallHIP(hipFree(mmdata->mmBuffer));
2320:       PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
2321:       mmdata->mmBufferSize = mmBufferSize;
2322:     }
2323:     mmdata->initialized = PETSC_TRUE;
2324:   } else {
2325:     /* to be safe, always update pointers of the mats */
2326:     PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2327:     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2328:     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2329:   }

2331:   /* do hipsparseSpMM, which supports transpose on B */
2332:   PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

2334:   PetscCall(PetscLogGpuTimeEnd());
2335:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2336:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2337:   if (product->type == MATPRODUCT_RARt) {
2338:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2339:     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2340:   } else if (product->type == MATPRODUCT_PtAP) {
2341:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2342:     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2343:   } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2344:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2345:   if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2346:   PetscFunctionReturn(PETSC_SUCCESS);
2347: }

2349: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2350: {
2351:   Mat_Product         *product = C->product;
2352:   Mat                  A, B;
2353:   PetscInt             m, n;
2354:   PetscBool            cisdense, flg;
2355:   MatMatHipsparse     *mmdata;
2356:   Mat_SeqAIJHIPSPARSE *cusp;

2358:   PetscFunctionBegin;
2359:   MatCheckProduct(C, 1);
2360:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2361:   A = product->A;
2362:   B = product->B;
2363:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2364:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2365:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2366:   PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2367:   switch (product->type) {
2368:   case MATPRODUCT_AB:
2369:     m = A->rmap->n;
2370:     n = B->cmap->n;
2371:     break;
2372:   case MATPRODUCT_AtB:
2373:     m = A->cmap->n;
2374:     n = B->cmap->n;
2375:     break;
2376:   case MATPRODUCT_ABt:
2377:     m = A->rmap->n;
2378:     n = B->rmap->n;
2379:     break;
2380:   case MATPRODUCT_PtAP:
2381:     m = B->cmap->n;
2382:     n = B->cmap->n;
2383:     break;
2384:   case MATPRODUCT_RARt:
2385:     m = B->rmap->n;
2386:     n = B->rmap->n;
2387:     break;
2388:   default:
2389:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2390:   }
2391:   PetscCall(MatSetSizes(C, m, n, m, n));
2392:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2393:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2394:   PetscCall(MatSetType(C, MATSEQDENSEHIP));

2396:   /* product data */
2397:   PetscCall(PetscNew(&mmdata));
2398:   mmdata->cisdense = cisdense;
2399:   /* for these products we need intermediate storage */
2400:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2401:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2402:     PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2403:     /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2404:     if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2405:     else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2406:   }
2407:   C->product->data       = mmdata;
2408:   C->product->destroy    = MatDestroy_MatMatHipsparse;
2409:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2410:   PetscFunctionReturn(PETSC_SUCCESS);
2411: }

2413: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2414: {
2415:   Mat_Product                   *product = C->product;
2416:   Mat                            A, B;
2417:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2418:   Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
2419:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2420:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2421:   PetscBool                      flg;
2422:   MatProductType                 ptype;
2423:   MatMatHipsparse               *mmdata;
2424:   hipsparseSpMatDescr_t          BmatSpDescr;
2425:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

2427:   PetscFunctionBegin;
2428:   MatCheckProduct(C, 1);
2429:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2430:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
2431:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2432:   mmdata = (MatMatHipsparse *)C->product->data;
2433:   A      = product->A;
2434:   B      = product->B;
2435:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2436:     mmdata->reusesym = PETSC_FALSE;
2437:     Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2438:     PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2439:     Cmat = Ccusp->mat;
2440:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2441:     Ccsr = (CsrMatrix *)Cmat->mat;
2442:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2443:     goto finalize;
2444:   }
2445:   if (!c->nz) goto finalize;
2446:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2447:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2448:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2449:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2450:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2451:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2452:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2453:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2454:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2455:   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2456:   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2457:   PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2458:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2459:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

2461:   ptype = product->type;
2462:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2463:     ptype = MATPRODUCT_AB;
2464:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2465:   }
2466:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2467:     ptype = MATPRODUCT_AB;
2468:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2469:   }
2470:   switch (ptype) {
2471:   case MATPRODUCT_AB:
2472:     Amat = Acusp->mat;
2473:     Bmat = Bcusp->mat;
2474:     break;
2475:   case MATPRODUCT_AtB:
2476:     Amat = Acusp->matTranspose;
2477:     Bmat = Bcusp->mat;
2478:     break;
2479:   case MATPRODUCT_ABt:
2480:     Amat = Acusp->mat;
2481:     Bmat = Bcusp->matTranspose;
2482:     break;
2483:   default:
2484:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2485:   }
2486:   Cmat = Ccusp->mat;
2487:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2488:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2489:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2490:   Acsr = (CsrMatrix *)Amat->mat;
2491:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2492:   Ccsr = (CsrMatrix *)Cmat->mat;
2493:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2494:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2495:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2496:   PetscCall(PetscLogGpuTimeBegin());
2497: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2498:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2499:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2500:   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2501:   PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2502:   #else
2503:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2504:   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2505:   #endif
2506: #else
2507:   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2508:                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2509:                                           Ccsr->column_indices->data().get()));
2510: #endif
2511:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2512:   PetscCallHIP(WaitForHIP());
2513:   PetscCall(PetscLogGpuTimeEnd());
2514:   C->offloadmask = PETSC_OFFLOAD_GPU;
2515: finalize:
2516:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2517:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2518:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2519:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2520:   c->reallocs = 0;
2521:   C->info.mallocs += 0;
2522:   C->info.nz_unneeded = 0;
2523:   C->assembled = C->was_assembled = PETSC_TRUE;
2524:   C->num_ass++;
2525:   PetscFunctionReturn(PETSC_SUCCESS);
2526: }

2528: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2529: {
2530:   Mat_Product                   *product = C->product;
2531:   Mat                            A, B;
2532:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2533:   Mat_SeqAIJ                    *a, *b, *c;
2534:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2535:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2536:   PetscInt                       i, j, m, n, k;
2537:   PetscBool                      flg;
2538:   MatProductType                 ptype;
2539:   MatMatHipsparse               *mmdata;
2540:   PetscLogDouble                 flops;
2541:   PetscBool                      biscompressed, ciscompressed;
2542: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2543:   int64_t               C_num_rows1, C_num_cols1, C_nnz1;
2544:   hipsparseSpMatDescr_t BmatSpDescr;
2545: #else
2546:   int cnz;
2547: #endif
2548:   hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

2550:   PetscFunctionBegin;
2551:   MatCheckProduct(C, 1);
2552:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2553:   A = product->A;
2554:   B = product->B;
2555:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2556:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2557:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2558:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2559:   a = (Mat_SeqAIJ *)A->data;
2560:   b = (Mat_SeqAIJ *)B->data;
2561:   /* product data */
2562:   PetscCall(PetscNew(&mmdata));
2563:   C->product->data    = mmdata;
2564:   C->product->destroy = MatDestroy_MatMatHipsparse;

2566:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2567:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2568:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2569:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2570:   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2571:   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");

2573:   ptype = product->type;
2574:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2575:     ptype                                          = MATPRODUCT_AB;
2576:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2577:   }
2578:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2579:     ptype                                          = MATPRODUCT_AB;
2580:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2581:   }
2582:   biscompressed = PETSC_FALSE;
2583:   ciscompressed = PETSC_FALSE;
2584:   switch (ptype) {
2585:   case MATPRODUCT_AB:
2586:     m    = A->rmap->n;
2587:     n    = B->cmap->n;
2588:     k    = A->cmap->n;
2589:     Amat = Acusp->mat;
2590:     Bmat = Bcusp->mat;
2591:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2592:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2593:     break;
2594:   case MATPRODUCT_AtB:
2595:     m = A->cmap->n;
2596:     n = B->cmap->n;
2597:     k = A->rmap->n;
2598:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2599:     Amat = Acusp->matTranspose;
2600:     Bmat = Bcusp->mat;
2601:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2602:     break;
2603:   case MATPRODUCT_ABt:
2604:     m = A->rmap->n;
2605:     n = B->rmap->n;
2606:     k = A->cmap->n;
2607:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2608:     Amat = Acusp->mat;
2609:     Bmat = Bcusp->matTranspose;
2610:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2611:     break;
2612:   default:
2613:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2614:   }

2616:   /* create hipsparse matrix */
2617:   PetscCall(MatSetSizes(C, m, n, m, n));
2618:   PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2619:   c     = (Mat_SeqAIJ *)C->data;
2620:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2621:   Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
2622:   Ccsr  = new CsrMatrix;

2624:   c->compressedrow.use = ciscompressed;
2625:   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
2626:     c->compressedrow.nrows = a->compressedrow.nrows;
2627:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2628:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2629:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2630:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2631:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2632:   } else {
2633:     c->compressedrow.nrows  = 0;
2634:     c->compressedrow.i      = NULL;
2635:     c->compressedrow.rindex = NULL;
2636:     Ccusp->workVector       = NULL;
2637:     Cmat->cprowIndices      = NULL;
2638:   }
2639:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2640:   Ccusp->mat        = Cmat;
2641:   Ccusp->mat->mat   = Ccsr;
2642:   Ccsr->num_rows    = Ccusp->nrows;
2643:   Ccsr->num_cols    = n;
2644:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2645:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2646:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2647:   PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2648:   PetscCallHIP(hipMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
2649:   PetscCallHIP(hipMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
2650:   PetscCallHIP(hipMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
2651:   PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2652:   PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2653:   PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2654:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipSPARSE raises errors in different calls when matrices have zero rows/columns! */
2655:     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2656:     c->nz                = 0;
2657:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2658:     Ccsr->values         = new THRUSTARRAY(c->nz);
2659:     goto finalizesym;
2660:   }

2662:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2663:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2664:   Acsr = (CsrMatrix *)Amat->mat;
2665:   if (!biscompressed) {
2666:     Bcsr        = (CsrMatrix *)Bmat->mat;
2667:     BmatSpDescr = Bmat->matDescr;
2668:   } else { /* we need to use row offsets for the full matrix */
2669:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2670:     Bcsr                 = new CsrMatrix;
2671:     Bcsr->num_rows       = B->rmap->n;
2672:     Bcsr->num_cols       = cBcsr->num_cols;
2673:     Bcsr->num_entries    = cBcsr->num_entries;
2674:     Bcsr->column_indices = cBcsr->column_indices;
2675:     Bcsr->values         = cBcsr->values;
2676:     if (!Bcusp->rowoffsets_gpu) {
2677:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2678:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2679:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2680:     }
2681:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2682:     mmdata->Bcsr      = Bcsr;
2683:     if (Bcsr->num_rows && Bcsr->num_cols) {
2684:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2685:     }
2686:     BmatSpDescr = mmdata->matSpBDescr;
2687:   }
2688:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2689:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2690:   /* precompute flops count */
2691:   if (ptype == MATPRODUCT_AB) {
2692:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2693:       const PetscInt st = a->i[i];
2694:       const PetscInt en = a->i[i + 1];
2695:       for (j = st; j < en; j++) {
2696:         const PetscInt brow = a->j[j];
2697:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2698:       }
2699:     }
2700:   } else if (ptype == MATPRODUCT_AtB) {
2701:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2702:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2703:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2704:       flops += (2. * anzi) * bnzi;
2705:     }
2706:   } else flops = 0.; /* TODO */
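  /* Worked example of the MATPRODUCT_AB count above (illustrative only): if row i of A holds the column
     indices {0, 3} and rows 0 and 3 of B hold 4 and 2 nonzeros respectively, then row i contributes
     2*4 + 2*2 = 12 flops, i.e. one multiply and one add for every accessed entry of B. */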

2708:   mmdata->flops = flops;
2709:   PetscCall(PetscLogGpuTimeBegin());
2710: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2711:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2712:   PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2713:   PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2714:   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2715:   {
2716:     /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2717:        We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2718:     */
2719:     void *dBuffer1 = NULL;
2720:     void *dBuffer2 = NULL;
2721:     void *dBuffer3 = NULL;
2722:     /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2723:     size_t bufferSize1 = 0;
2724:     size_t bufferSize2 = 0;
2725:     size_t bufferSize3 = 0;
2726:     size_t bufferSize4 = 0;
2727:     size_t bufferSize5 = 0;

2729:     /* ask bufferSize1 bytes for external memory */
2730:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2731:     PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2732:     /* inspect the matrices A and B to understand the memory requirement for the next step */
2733:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));

2735:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2736:     PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2737:     PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2738:     PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2739:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2740:     PetscCallHIP(hipFree(dBuffer1));
2741:     PetscCallHIP(hipFree(dBuffer2));

2743:     /* get matrix C non-zero entries C_nnz1 */
2744:     PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2745:     c->nz = (PetscInt)C_nnz1;
2746:     /* allocate matrix C */
2747:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2748:     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2749:     Ccsr->values = new THRUSTARRAY(c->nz);
2750:     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2751:     /* update matC with the new pointers */
2752:     PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));

2754:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2755:     PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2756:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2757:     PetscCallHIP(hipFree(dBuffer3));
2758:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2759:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2760:   }
2761:   #else
2762:   size_t bufSize2;
2763:   /* ask bufferSize bytes for external memory */
2764:   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2765:   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2766:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2767:   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2768:   /* ask bufferSize again bytes for external memory */
2769:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2770:   /* Similar to CUSPARSE, we need both buffers to perform the operations properly.
2771:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2772:      it only appears in the workEstimation calls, but it seems to be needed in compute, so the address
2773:      is probably stored in the descriptor! What a messy API... */
2774:   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2775:   /* compute the intermediate product of A * B */
2776:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2777:   /* get matrix C non-zero entries C_nnz1 */
2778:   PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2779:   c->nz = (PetscInt)C_nnz1;
2780:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2781:                       mmdata->mmBufferSize / 1024));
2782:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2783:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2784:   Ccsr->values = new THRUSTARRAY(c->nz);
2785:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2786:   PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2787:   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2788:   #endif
2789: #else
2790:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2791:   PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2792:                                           Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2793:   c->nz = cnz;
2794:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2795:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2796:   Ccsr->values = new THRUSTARRAY(c->nz);
2797:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */

2799:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2800:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2801:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
2802:      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2803:   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2804:                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2805:                                           Ccsr->column_indices->data().get()));
2806: #endif
2807:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2808:   PetscCall(PetscLogGpuTimeEnd());
2809: finalizesym:
2810:   c->singlemalloc = PETSC_FALSE;
2811:   c->free_a       = PETSC_TRUE;
2812:   c->free_ij      = PETSC_TRUE;
2813:   PetscCall(PetscMalloc1(m + 1, &c->i));
2814:   PetscCall(PetscMalloc1(c->nz, &c->j));
2815:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2816:     PetscInt      *d_i = c->i;
2817:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2818:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2819:     ii = *Ccsr->row_offsets;
2820:     jj = *Ccsr->column_indices;
2821:     if (ciscompressed) d_i = c->compressedrow.i;
2822:     PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2823:     PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2824:   } else {
2825:     PetscInt *d_i = c->i;
2826:     if (ciscompressed) d_i = c->compressedrow.i;
2827:     PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2828:     PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2829:   }
2830:   if (ciscompressed) { /* need to expand host row offsets */
2831:     PetscInt r = 0;
2832:     c->i[0]    = 0;
2833:     for (k = 0; k < c->compressedrow.nrows; k++) {
2834:       const PetscInt next = c->compressedrow.rindex[k];
2835:       const PetscInt old  = c->compressedrow.i[k];
2836:       for (; r < next; r++) c->i[r + 1] = old;
2837:     }
2838:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2839:   }
2840:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2841:   PetscCall(PetscMalloc1(m, &c->ilen));
2842:   PetscCall(PetscMalloc1(m, &c->imax));
2843:   c->maxnz         = c->nz;
2844:   c->nonzerorowcnt = 0;
2845:   c->rmax          = 0;
2846:   for (k = 0; k < m; k++) {
2847:     const PetscInt nn = c->i[k + 1] - c->i[k];
2848:     c->ilen[k] = c->imax[k] = nn;
2849:     c->nonzerorowcnt += (PetscInt) !!nn;
2850:     c->rmax = PetscMax(c->rmax, nn);
2851:   }
2852:   PetscCall(MatMarkDiagonal_SeqAIJ(C));
2853:   PetscCall(PetscMalloc1(c->nz, &c->a));
2854:   Ccsr->num_entries = c->nz;

2856:   C->nonzerostate++;
2857:   PetscCall(PetscLayoutSetUp(C->rmap));
2858:   PetscCall(PetscLayoutSetUp(C->cmap));
2859:   Ccusp->nonzerostate = C->nonzerostate;
2860:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2861:   C->preallocated     = PETSC_TRUE;
2862:   C->assembled        = PETSC_FALSE;
2863:   C->was_assembled    = PETSC_FALSE;
2864:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2865:     mmdata->reusesym = PETSC_TRUE;
2866:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2867:   }
2868:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2869:   PetscFunctionReturn(PETSC_SUCCESS);
2870: }

2872: /* handles sparse or dense B */
2873: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2874: {
2875:   Mat_Product *product = mat->product;
2876:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

2878:   PetscFunctionBegin;
2879:   MatCheckProduct(mat, 1);
2880:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2881:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2882:   if (product->type == MATPRODUCT_ABC) {
2883:     Ciscusp = PETSC_FALSE;
2884:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2885:   }
2886:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2887:     PetscBool usecpu = PETSC_FALSE;
2888:     switch (product->type) {
2889:     case MATPRODUCT_AB:
2890:       if (product->api_user) {
2891:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2892:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2893:         PetscOptionsEnd();
2894:       } else {
2895:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2896:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2897:         PetscOptionsEnd();
2898:       }
2899:       break;
2900:     case MATPRODUCT_AtB:
2901:       if (product->api_user) {
2902:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2903:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2904:         PetscOptionsEnd();
2905:       } else {
2906:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2907:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2908:         PetscOptionsEnd();
2909:       }
2910:       break;
2911:     case MATPRODUCT_PtAP:
2912:       if (product->api_user) {
2913:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2914:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2915:         PetscOptionsEnd();
2916:       } else {
2917:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2918:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2919:         PetscOptionsEnd();
2920:       }
2921:       break;
2922:     case MATPRODUCT_RARt:
2923:       if (product->api_user) {
2924:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2925:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2926:         PetscOptionsEnd();
2927:       } else {
2928:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2929:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2930:         PetscOptionsEnd();
2931:       }
2932:       break;
2933:     case MATPRODUCT_ABC:
2934:       if (product->api_user) {
2935:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2936:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2937:         PetscOptionsEnd();
2938:       } else {
2939:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2940:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2941:         PetscOptionsEnd();
2942:       }
2943:       break;
2944:     default:
2945:       break;
2946:     }
2947:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2948:   }
2949:   /* dispatch */
2950:   if (isdense) {
2951:     switch (product->type) {
2952:     case MATPRODUCT_AB:
2953:     case MATPRODUCT_AtB:
2954:     case MATPRODUCT_ABt:
2955:     case MATPRODUCT_PtAP:
2956:     case MATPRODUCT_RARt:
2957:       if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2958:       else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2959:       break;
2960:     case MATPRODUCT_ABC:
2961:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2962:       break;
2963:     default:
2964:       break;
2965:     }
2966:   } else if (Biscusp && Ciscusp) {
2967:     switch (product->type) {
2968:     case MATPRODUCT_AB:
2969:     case MATPRODUCT_AtB:
2970:     case MATPRODUCT_ABt:
2971:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2972:       break;
2973:     case MATPRODUCT_PtAP:
2974:     case MATPRODUCT_RARt:
2975:     case MATPRODUCT_ABC:
2976:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2977:       break;
2978:     default:
2979:       break;
2980:     }
2981:   } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2982:   PetscFunctionReturn(PETSC_SUCCESS);
2983: }
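/* Usage sketch (runtime options only, matching the keys registered above; the option prefix of the product matrix,
   if any, must be prepended): a MatMatMult() issued through the user-level API falls back to the CPU with
     -matmatmult_backend_cpu
   while a product configured through the MatProduct API uses
     -mat_product_algorithm_backend_cpu
   Analogous keys exist for AtB, PtAP, RARt and ABC, as listed in the switch above. */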

2985: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2986: {
2987:   PetscFunctionBegin;
2988:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2989:   PetscFunctionReturn(PETSC_SUCCESS);
2990: }

2992: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2993: {
2994:   PetscFunctionBegin;
2995:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2996:   PetscFunctionReturn(PETSC_SUCCESS);
2997: }

2999: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3000: {
3001:   PetscFunctionBegin;
3002:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3003:   PetscFunctionReturn(PETSC_SUCCESS);
3004: }

3006: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3007: {
3008:   PetscFunctionBegin;
3009:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3010:   PetscFunctionReturn(PETSC_SUCCESS);
3011: }

3013: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3014: {
3015:   PetscFunctionBegin;
3016:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3017:   PetscFunctionReturn(PETSC_SUCCESS);
3018: }

3020: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3021: {
3022:   int i = blockIdx.x * blockDim.x + threadIdx.x;
3023:   if (i < n) y[idx[i]] += x[i];
3024: }
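/* A typical launch of the kernel above (as done further below in MatMultAddKernel_SeqAIJHIPSPARSE) covers n entries
   with 256-thread blocks, so that thread i performs y[idx[i]] += x[i] for i < n:
     hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, stream, n, idx, x, y);
   where stream, idx, x and y stand for the actual stream and device pointers. */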

3026: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3027: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3028: {
3029:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
3030:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3031:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
3032:   PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
3033:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
3034:   PetscBool                      compressed;
3035:   PetscInt                       nx, ny;

3037:   PetscFunctionBegin;
3038:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3039:   if (!a->nz) {
3040:     if (yy) PetscCall(VecSeq_HIP::copy(yy, zz));
3041:     else PetscCall(VecSeq_HIP::set(zz, 0));
3042:     PetscFunctionReturn(PETSC_SUCCESS);
3043:   }
3044:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3045:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3046:   if (!trans) {
3047:     matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3048:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
3049:   } else {
3050:     if (herm || !A->form_explicit_transpose) {
3051:       opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3052:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3053:     } else {
3054:       if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3055:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3056:     }
3057:   }
3058:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3059:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3060:   try {
3061:     PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
3062:     if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3063:     else PetscCall(VecHIPGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

3065:     PetscCall(PetscLogGpuTimeBegin());
3066:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3067:       /* z = A x + beta y.
3068:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3069:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3070:       */
3071:       xptr = xarray;
3072:       dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3073:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3074:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3075:           allocated to accommodate different uses. So we get the length info directly from mat.
3076:        */
3077:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3078:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3079:         nx             = mat->num_cols;
3080:         ny             = mat->num_rows;
3081:       }
3082:     } else {
3083:       /* z = A^T x + beta y
3084:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3085:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3086:        */
3087:       xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3088:       dptr = zarray;
3089:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3090:       if (compressed) { /* Scatter x to work vector */
3091:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3092:         thrust::for_each(
3093: #if PetscDefined(HAVE_THRUST_ASYNC)
3094:           thrust::hip::par.on(PetscDefaultHipStream),
3095: #endif
3096:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3097:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3098:       }
3099:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3100:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3101:         nx             = mat->num_rows;
3102:         ny             = mat->num_cols;
3103:       }
3104:     }
3105:     /* csr_spmv does y = alpha op(A) x + beta y */
3106:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3107: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
3108:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
3109:       if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3110:         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
3111:         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
3112:         PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3113:                                                     &matstruct->hipSpMV[opA].spmvBufferSize));
3114:         PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
3115:         matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3116:       } else {
3117:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3118:         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
3119:         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
3120:       }
3121:       PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3122:                                        matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3123: #else
3124:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3125:       PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3126: #endif
3127:     } else {
3128:       if (hipsparsestruct->nrows) {
3129:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3130:         PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3131:       }
3132:     }
3133:     PetscCall(PetscLogGpuTimeEnd());

3135:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3136:       if (yy) {                                     /* MatMultAdd: zz = A*xx + yy */
3137:         if (compressed) {                           /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3138:           PetscCall(VecSeq_HIP::copy(yy, zz));      /* zz = yy */
3139:         } else if (zz != yy) {                      /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3140:           PetscCall(VecSeq_HIP::axpy(zz, 1.0, yy)); /* zz += yy */
3141:         }
3142:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3143:         PetscCall(VecSeq_HIP::set(zz, 0));
3144:       }

3146:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3147:       if (compressed) {
3148:         PetscCall(PetscLogGpuTimeBegin());
3149:         /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
3150:            and in the destructor at the end of the scope it calls hipStreamSynchronize() on this stream. One would have to store all the events
3151:            to prevent that, so I just added a ScatterAdd kernel instead.
3152:          */
3153: #if 0
3154:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3155:         thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3156:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3157:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3158:                          VecHIPPlusEquals());
3159: #else
3160:         PetscInt n = matstruct->cprowIndices->size();
3161:         hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3162: #endif
3163:         PetscCall(PetscLogGpuTimeEnd());
3164:       }
3165:     } else {
3166:       if (yy && yy != zz) PetscCall(VecSeq_HIP::axpy(zz, 1.0, yy)); /* zz += yy */
3167:     }
3168:     PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
3169:     if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
3170:     else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
3171:   } catch (char *ex) {
3172:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3173:   }
3174:   if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3175:   else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3176:   PetscFunctionReturn(PETSC_SUCCESS);
3177: }
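/* Worked example of the compressed-row path in the kernel above, assuming A is 4x4 with nonzero rows {1,3}
   (so cprowIndices = [1,3] and the work vector holds 2 entries):
     MatMult (z = A*x):      SpMV writes the two row results into work[0..1]; z is zeroed; then ScatterAdd
                             performs z[1] += work[0] and z[3] += work[1].
     MatMultAdd (z = A*x+y): z is first set to y, then the same ScatterAdd is applied.
     Transpose products:     x is first gathered into the work vector (work[k] = x[cprowIndices[k]]) before the SpMV. */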

3179: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3180: {
3181:   PetscFunctionBegin;
3182:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3183:   PetscFunctionReturn(PETSC_SUCCESS);
3184: }

3186: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
3187: {
3188:   PetscObjectState     onnz = A->nonzerostate;
3189:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

3191:   PetscFunctionBegin;
3192:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3193:   if (onnz != A->nonzerostate && cusp->deviceMat) {
3194:     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
3195:     PetscCallHIP(hipFree(cusp->deviceMat));
3196:     cusp->deviceMat = NULL;
3197:   }
3198:   PetscFunctionReturn(PETSC_SUCCESS);
3199: }

3201: /*@
3202:    MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
3203:    This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3204:    For good matrix assembly performance the user should preallocate the matrix storage by setting
3205:    the parameter `nz` (or the array `nnz`).

3207:    Collective

3209:    Input Parameters:
3210: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3211: .  m - number of rows
3212: .  n - number of columns
3213: .  nz - number of nonzeros per row (same for all rows)
3214: -  nnz - array containing the number of nonzeros in the various rows
3215:          (possibly different for each row) or `NULL`

3217:    Output Parameter:
3218: .  A - the matrix

3220:    Level: intermediate

3222:    Notes:
3223:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3224:    `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
3225:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3227:    If `nnz` is given then `nz` is ignored

3229:    The AIJ format (compressed row storage) is fully compatible with standard Fortran
3230:    storage.  That is, the stored row and column indices can begin at
3231:    either one (as in Fortran) or zero.

3233:    Specify the preallocated storage with either `nz` or `nnz` (not both).
3234:    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3235:    allocation.

3237:    By default, this format uses inodes (identical nodes) when possible, to
3238:    improve numerical efficiency of matrix-vector products and solves. We
3239:    search for consecutive rows with the same nonzero structure, thereby
3240:    reusing matrix information to achieve increased efficiency.

3242: .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3243: @*/
3244: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3245: {
3246:   PetscFunctionBegin;
3247:   PetscCall(MatCreate(comm, A));
3248:   PetscCall(MatSetSizes(*A, m, n, m, n));
3249:   PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
3250:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3251:   PetscFunctionReturn(PETSC_SUCCESS);
3252: }
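/* A minimal usage sketch of the recommended paradigm mentioned in the manual page above
   (standard PETSc calls; the sizes and the preallocation value are illustrative):

     Mat A;

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 10, 10, 10, 10));
     PetscCall(MatSetType(A, MATSEQAIJHIPSPARSE));
     PetscCall(MatSeqAIJSetPreallocation(A, 3, NULL)); // about 3 nonzeros per row
     // ... MatSetValues() loop ...
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatDestroy(&A));
*/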

3254: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3255: {
3256:   PetscFunctionBegin;
3257:   if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy((Mat_SeqAIJHIPSPARSE **)&A->spptr));
3258:   else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
3259:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3260:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
3261:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
3262:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3263:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3264:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3265:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3266:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3267:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3268:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
3269:   PetscCall(MatDestroy_SeqAIJ(A));
3270:   PetscFunctionReturn(PETSC_SUCCESS);
3271: }

3273: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3274: {
3275:   PetscFunctionBegin;
3276:   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3277:   PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
3278:   PetscFunctionReturn(PETSC_SUCCESS);
3279: }

3281: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3282: {
3283:   Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3284:   Mat_SeqAIJHIPSPARSE *cy;
3285:   Mat_SeqAIJHIPSPARSE *cx;
3286:   PetscScalar         *ay;
3287:   const PetscScalar   *ax;
3288:   CsrMatrix           *csry, *csrx;

3290:   PetscFunctionBegin;
3291:   cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3292:   cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
3293:   if (X->ops->axpy != Y->ops->axpy) {
3294:     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3295:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3296:     PetscFunctionReturn(PETSC_SUCCESS);
3297:   }
3298:   /* if we are here, it means both matrices are bound to GPU */
3299:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
3300:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
3301:   PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3302:   PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3303:   csry = (CsrMatrix *)cy->mat->mat;
3304:   csrx = (CsrMatrix *)cx->mat->mat;
3305:   /* see if we can turn this into a hipblas axpy */
3306:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3307:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3308:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3309:     if (eq) str = SAME_NONZERO_PATTERN;
3310:   }
3311:   /* spgeam is buggy with one column */
3312:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3313:   if (str == SUBSET_NONZERO_PATTERN) {
3314:     PetscScalar b = 1.0;
3315: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3316:     size_t bufferSize;
3317:     void  *buffer;
3318: #endif

3320:     PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3321:     PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3322:     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
3323: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3324:     PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3325:                                                        csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3326:     PetscCallHIP(hipMalloc(&buffer, bufferSize));
3327:     PetscCall(PetscLogGpuTimeBegin());
3328:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3329:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3330:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3331:     PetscCall(PetscLogGpuTimeEnd());
3332:     PetscCallHIP(hipFree(buffer));
3333: #else
3334:     PetscCall(PetscLogGpuTimeBegin());
3335:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3336:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3337:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3338:     PetscCall(PetscLogGpuTimeEnd());
3339: #endif
3340:     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
3341:     PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3342:     PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3343:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3344:   } else if (str == SAME_NONZERO_PATTERN) {
3345:     hipblasHandle_t hipblasv2handle;
3346:     PetscBLASInt    one = 1, bnz = 1;

3348:     PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3349:     PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3350:     PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3351:     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3352:     PetscCall(PetscLogGpuTimeBegin());
3353:     PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
3354:     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3355:     PetscCall(PetscLogGpuTimeEnd());
3356:     PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3357:     PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3358:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3359:   } else {
3360:     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3361:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3362:   }
3363:   PetscFunctionReturn(PETSC_SUCCESS);
3364: }

3366: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3367: {
3368:   Mat_SeqAIJ     *y = (Mat_SeqAIJ *)Y->data;
3369:   PetscScalar    *ay;
3370:   hipblasHandle_t hipblasv2handle;
3371:   PetscBLASInt    one = 1, bnz = 1;

3373:   PetscFunctionBegin;
3374:   PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3375:   PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3376:   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3377:   PetscCall(PetscLogGpuTimeBegin());
3378:   PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3379:   PetscCall(PetscLogGpuFlops(bnz));
3380:   PetscCall(PetscLogGpuTimeEnd());
3381:   PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3382:   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3383:   PetscFunctionReturn(PETSC_SUCCESS);
3384: }

3386: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
3387: {
3388:   PetscBool   both = PETSC_FALSE;
3389:   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

3391:   PetscFunctionBegin;
3392:   if (A->factortype == MAT_FACTOR_NONE) {
3393:     Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3394:     if (spptr->mat) {
3395:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3396:       if (matrix->values) {
3397:         both = PETSC_TRUE;
3398:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3399:       }
3400:     }
3401:     if (spptr->matTranspose) {
3402:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3403:       if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); }
3404:     }
3405:   }
3406:   //PetscCall(MatZeroEntries_SeqAIJ(A));
3407:   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3408:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3409:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3410:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3411:   PetscFunctionReturn(PETSC_SUCCESS);
3412: }

3414: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3415: {
3416:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3418:   PetscFunctionBegin;
3419:   if (A->factortype != MAT_FACTOR_NONE) {
3420:     A->boundtocpu = flg;
3421:     PetscFunctionReturn(PETSC_SUCCESS);
3422:   }
3423:   if (flg) {
3424:     PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));

3426:     A->ops->scale                     = MatScale_SeqAIJ;
3427:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3428:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3429:     A->ops->mult                      = MatMult_SeqAIJ;
3430:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3431:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3432:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3433:     A->ops->multhermitiantranspose    = NULL;
3434:     A->ops->multhermitiantransposeadd = NULL;
3435:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3436:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3437:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3438:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3439:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3440:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3441:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3442:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3443:   } else {
3444:     A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
3445:     A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
3446:     A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
3447:     A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
3448:     A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
3449:     A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
3450:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3451:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3452:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3453:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3454:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3455:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3456:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3457:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3458:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3459:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3460:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3461:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
3462:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3463:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3464:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
3465:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
3466:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3467:   }
3468:   A->boundtocpu = flg;
3469:   if (flg && a->inode.size) a->inode.use = PETSC_TRUE;
3470:   else a->inode.use = PETSC_FALSE;

3472:   PetscFunctionReturn(PETSC_SUCCESS);
3473: }

3475: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3476: {
3477:   Mat B;

3479:   PetscFunctionBegin;
3480:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
3481:   if (reuse == MAT_INITIAL_MATRIX) {
3482:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3483:   } else if (reuse == MAT_REUSE_MATRIX) {
3484:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3485:   }
3486:   B = *newmat;
3487:   PetscCall(PetscFree(B->defaultvectype));
3488:   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
3489:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3490:     if (B->factortype == MAT_FACTOR_NONE) {
3491:       Mat_SeqAIJHIPSPARSE *spptr;
3492:       PetscCall(PetscNew(&spptr));
3493:       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3494:       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3495:       spptr->format = MAT_HIPSPARSE_CSR;
3496: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3497:       spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
3498: #else
3499:       spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3500: #endif
3501:       spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3502:       //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

3504:       B->spptr = spptr;
3505:     } else {
3506:       Mat_SeqAIJHIPSPARSETriFactors *spptr;

3508:       PetscCall(PetscNew(&spptr));
3509:       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3510:       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3511:       B->spptr = spptr;
3512:     }
3513:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3514:   }
3515:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJHIPSPARSE;
3516:   B->ops->destroy        = MatDestroy_SeqAIJHIPSPARSE;
3517:   B->ops->setoption      = MatSetOption_SeqAIJHIPSPARSE;
3518:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
3519:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJHIPSPARSE;
3520:   B->ops->duplicate      = MatDuplicate_SeqAIJHIPSPARSE;

3522:   PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
3523:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
3524:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
3525: #if defined(PETSC_HAVE_HYPRE)
3526:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
3527: #endif
3528:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
3529:   PetscFunctionReturn(PETSC_SUCCESS);
3530: }

3532: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
3533: {
3534:   PetscFunctionBegin;
3535:   PetscCall(MatCreate_SeqAIJ(B));
3536:   PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
3537:   PetscFunctionReturn(PETSC_SUCCESS);
3538: }

3540: /*
3541:    MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs

3543:    A matrix type whose data resides on AMD GPUs. These matrices can be in
3544:    CSR, ELL, or Hybrid format.
3545:    All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.

3547:    Options Database Keys:
3548: +  -mat_type aijhipsparse - sets the matrix type to "seqaijhipsparse" during a call to MatSetFromOptions()
3549: .  -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3550:                                        Other options include ell (ellpack) or hyb (hybrid).
3551: .  -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3552: -  -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU

3554:   Level: beginner

3556: .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3557: */
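/* Example (runtime configuration only, using the keys listed above): a program that calls MatSetFromOptions()
   on its matrices can select this type and its storage format from the command line, e.g.

     ./app -mat_type aijhipsparse -mat_hipsparse_mult_storage_format ell

   where ./app is an illustrative placeholder for any PETSc executable. */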
3558: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3559: {
3560:   PetscFunctionBegin;
3561:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse_band));
3562:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
3563:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
3564:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
3565:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));

3567:   PetscFunctionReturn(PETSC_SUCCESS);
3568: }

3570: static PetscErrorCode MatResetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat)
3571: {
3572:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)mat->spptr;

3574:   PetscFunctionBegin;
3575:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3576:   delete cusp->cooPerm;
3577:   delete cusp->cooPerm_a;
3578:   cusp->cooPerm   = NULL;
3579:   cusp->cooPerm_a = NULL;
3580:   if (cusp->use_extended_coo) {
3581:     PetscCallHIP(hipFree(cusp->jmap_d));
3582:     PetscCallHIP(hipFree(cusp->perm_d));
3583:   }
3584:   cusp->use_extended_coo = PETSC_FALSE;
3585:   PetscFunctionReturn(PETSC_SUCCESS);
3586: }

3588: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat_SeqAIJHIPSPARSE **hipsparsestruct)
3589: {
3590:   PetscFunctionBegin;
3591:   if (*hipsparsestruct) {
3592:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&(*hipsparsestruct)->mat, (*hipsparsestruct)->format));
3593:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&(*hipsparsestruct)->matTranspose, (*hipsparsestruct)->format));
3594:     delete (*hipsparsestruct)->workVector;
3595:     delete (*hipsparsestruct)->rowoffsets_gpu;
3596:     delete (*hipsparsestruct)->cooPerm;
3597:     delete (*hipsparsestruct)->cooPerm_a;
3598:     delete (*hipsparsestruct)->csr2csc_i;
3599:     if ((*hipsparsestruct)->handle) PetscCallHIPSPARSE(hipsparseDestroy((*hipsparsestruct)->handle));
3600:     if ((*hipsparsestruct)->jmap_d) PetscCallHIP(hipFree((*hipsparsestruct)->jmap_d));
3601:     if ((*hipsparsestruct)->perm_d) PetscCallHIP(hipFree((*hipsparsestruct)->perm_d));
3602:     PetscCall(PetscFree(*hipsparsestruct));
3603:   }
3604:   PetscFunctionReturn(PETSC_SUCCESS);
3605: }

3607: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3608: {
3609:   PetscFunctionBegin;
3610:   if (*mat) {
3611:     delete (*mat)->values;
3612:     delete (*mat)->column_indices;
3613:     delete (*mat)->row_offsets;
3614:     delete *mat;
3615:     *mat = 0;
3616:   }
3617:   PetscFunctionReturn(PETSC_SUCCESS);
3618: }

3620: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
3621: {
3622:   PetscFunctionBegin;
3623:   if (*trifactor) {
3624:     if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
3625:     if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3626:     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3627:     if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
3628:     if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h));
3629:     if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
3630:     PetscCall(PetscFree(*trifactor));
3631:   }
3632:   PetscFunctionReturn(PETSC_SUCCESS);
3633: }

3635: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
3636: {
3637:   CsrMatrix *mat;

3639:   PetscFunctionBegin;
3640:   if (*matstruct) {
3641:     if ((*matstruct)->mat) {
3642:       if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
3643:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
3644:         PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
3645:       } else {
3646:         mat = (CsrMatrix *)(*matstruct)->mat;
3647:         PetscCall(CsrMatrix_Destroy(&mat));
3648:       }
3649:     }
3650:     if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
3651:     delete (*matstruct)->cprowIndices;
3652:     if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
3653:     if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
3654:     if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));

3656:     Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
3657:     if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
3658:     for (int i = 0; i < 3; i++) {
3659:       if (mdata->hipSpMV[i].initialized) {
3660:         PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
3661:         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
3662:         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
3663:       }
3664:     }
3665:     delete *matstruct;
3666:     *matstruct = NULL;
3667:   }
3668:   PetscFunctionReturn(PETSC_SUCCESS);
3669: }

3671: PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
3672: {
3673:   Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

3675:   PetscFunctionBegin;
3676:   if (fs) {
3677:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3678:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3679:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3680:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3681:     delete fs->rpermIndices;
3682:     delete fs->cpermIndices;
3683:     delete fs->workVector;
3684:     fs->rpermIndices = NULL;
3685:     fs->cpermIndices = NULL;
3686:     fs->workVector   = NULL;
3687:     if (fs->a_band_d) PetscCallHIP(hipFree(fs->a_band_d));
3688:     if (fs->i_band_d) PetscCallHIP(hipFree(fs->i_band_d));
3689:     fs->init_dev_prop = PETSC_FALSE;
3690: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3691:     PetscCallHIP(hipFree(fs->csrRowPtr));
3692:     PetscCallHIP(hipFree(fs->csrColIdx));
3693:     PetscCallHIP(hipFree(fs->csrVal));
3694:     PetscCallHIP(hipFree(fs->X));
3695:     PetscCallHIP(hipFree(fs->Y));
3696:     // PetscCallHIP(hipFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares memory with one of spsvBuffer_L/U */
3697:     PetscCallHIP(hipFree(fs->spsvBuffer_L));
3698:     PetscCallHIP(hipFree(fs->spsvBuffer_U));
3699:     PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
3700:     PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
3701:     PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
3702:     if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
3703:     if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
3704:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
3705:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3706:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
3707:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3708:     if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
3709:     if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
3710:     PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
3711:     PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));

3713:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3714:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3715: #endif
3716:   }
3717:   PetscFunctionReturn(PETSC_SUCCESS);
3718: }

3720: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3721: {
3722:   hipsparseHandle_t handle;

3724:   PetscFunctionBegin;
3725:   if (*trifactors) {
3726:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3727:     if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3728:     PetscCall(PetscFree(*trifactors));
3729:   }
3730:   PetscFunctionReturn(PETSC_SUCCESS);
3731: }

3733: struct IJCompare {
3734:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3735:   {
3736:     if (t1.get<0>() < t2.get<0>()) return true;
3737:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3738:     return false;
3739:   }
3740: };

3742: struct IJEqual {
3743:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3744:   {
3745:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3746:     return true;
3747:   }
3748: };

3750: struct IJDiff {
3751:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3752: };

3754: struct IJSum {
3755:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3756: };

3758: PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3759: {
3760:   Mat_SeqAIJHIPSPARSE                  *cusp      = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3761:   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
3762:   THRUSTARRAY                          *cooPerm_v = NULL;
3763:   thrust::device_ptr<const PetscScalar> d_v;
3764:   CsrMatrix                            *matrix;
3765:   PetscInt                              n;

3767:   PetscFunctionBegin;
3768:   PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIPSPARSE struct");
3769:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIPSPARSE CsrMatrix");
3770:   if (!cusp->cooPerm) {
3771:     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
3772:     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
3773:     PetscFunctionReturn(PETSC_SUCCESS);
3774:   }
3775:   matrix = (CsrMatrix *)cusp->mat->mat;
3776:   PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3777:   if (!v) {
3778:     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3779:     goto finalize;
3780:   }
3781:   n = cusp->cooPerm->size();
3782:   if (isHipMem(v)) d_v = thrust::device_pointer_cast(v);
3783:   else {
3784:     cooPerm_v = new THRUSTARRAY(n);
3785:     cooPerm_v->assign(v, v + n);
3786:     d_v = cooPerm_v->data();
3787:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
3788:   }
3789:   PetscCall(PetscLogGpuTimeBegin());
3790:   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3791:     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them together */
3792:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3793:       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
3794:       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3795:         cooPerm_a = [0,0,1,2,3,4]. Its length is n, the number of nonzeros in d_v[].
3796:         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3797:       */
3798:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
3799:       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
3800:       delete cooPerm_w;
3801:     } else {
3802:       /* all nonzeros in d_v[] are unique entries */
3803:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
3804:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
3805:       thrust::for_each(zibit, zieit, VecHIPPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
3806:     }
3807:   } else {
3808:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3809:       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
3810:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
3811:     } else {
3812:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
3813:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
3814:       thrust::for_each(zibit, zieit, VecHIPEquals());
3815:     }
3816:   }
3817:   PetscCall(PetscLogGpuTimeEnd());
3818: finalize:
3819:   delete cooPerm_v;
3820:   A->offloadmask = PETSC_OFFLOAD_GPU;
3821:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
3822:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3823:   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
3824:   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
3825:   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
3826:   a->reallocs = 0;
3827:   A->info.mallocs += 0;
3828:   A->info.nz_unneeded = 0;
3829:   A->assembled = A->was_assembled = PETSC_TRUE;
3830:   A->num_ass++;
3831:   PetscFunctionReturn(PETSC_SUCCESS);
3832: }
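
/* Worked example (illustrative only: the value array d_v below is made up, while cooPerm and
   cooPerm_a are the ones produced by MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic() for the
   COO example documented in that routine):

     d_v       = [10,20,30,40,50,60]   user-provided COO values
     cooPerm   = [ 2, 4, 1, 0, 3, 5]   position in d_v[] of each entry in sorted order
     cooPerm_a = [ 0, 0, 1, 2, 3, 4]   unique-nonzero id of each sorted entry

   The permutation iterator visits d_v[] in sorted order: [30,50,20,10,40,60].
   reduce_by_key() with keys cooPerm_a sums runs of equal keys, giving [80,20,10,40,60],
   which is then added to (ADD_VALUES) or written into (INSERT_VALUES) matrix->values. */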

3834: PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3835: {
3836:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

3838:   PetscFunctionBegin;
3839:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3840:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3841:   if (destroy) {
3842:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3843:     delete cusp->csr2csc_i;
3844:     cusp->csr2csc_i = NULL;
3845:   }
3846:   A->transupdated = PETSC_FALSE;
3847:   PetscFunctionReturn(PETSC_SUCCESS);
3848: }

3850: PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
3851: {
3852:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3853:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
3854:   PetscInt             cooPerm_n, nzr = 0;

3856:   PetscFunctionBegin;
3857:   PetscCall(PetscLayoutSetUp(A->rmap));
3858:   PetscCall(PetscLayoutSetUp(A->cmap));
3859:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3860:   if (n != cooPerm_n) {
3861:     delete cusp->cooPerm;
3862:     delete cusp->cooPerm_a;
3863:     cusp->cooPerm   = NULL;
3864:     cusp->cooPerm_a = NULL;
3865:   }
3866:   if (n) {
3867:     thrust::device_ptr<PetscInt> d_i, d_j;
3868:     PetscInt                    *d_raw_i, *d_raw_j;
3869:     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
3870:     PetscMemType                 imtype, jmtype;

3872:     PetscCall(PetscGetMemType(coo_i, &imtype));
3873:     if (PetscMemTypeHost(imtype)) {
3874:       PetscCallHIP(hipMalloc(&d_raw_i, sizeof(PetscInt) * n));
3875:       PetscCallHIP(hipMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, hipMemcpyHostToDevice));
3876:       d_i        = thrust::device_pointer_cast(d_raw_i);
3877:       free_raw_i = PETSC_TRUE;
3878:       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
3879:     } else {
3880:       d_i = thrust::device_pointer_cast(coo_i);
3881:     }

3883:     PetscCall(PetscGetMemType(coo_j, &jmtype));
3884:     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic() passes device coo_i[] and host coo_j[]!
3885:       PetscCallHIP(hipMalloc(&d_raw_j, sizeof(PetscInt) * n));
3886:       PetscCallHIP(hipMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, hipMemcpyHostToDevice));
3887:       d_j        = thrust::device_pointer_cast(d_raw_j);
3888:       free_raw_j = PETSC_TRUE;
3889:       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
3890:     } else {
3891:       d_j = thrust::device_pointer_cast(coo_j);
3892:     }

3894:     THRUSTINTARRAY ii(A->rmap->n);

3896:     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
3897:     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
3898:     /* Ex.
3899:       n = 6
3900:       coo_i = [3,3,1,4,1,4]
3901:       coo_j = [3,2,2,5,2,6]
3902:     */
3903:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
3904:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

3906:     PetscCall(PetscLogGpuTimeBegin());
3907:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3908:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3909:     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
3910:     THRUSTINTARRAY w(d_j, d_j + n);
3911:     /*
3912:       d_i     = [1,1,3,3,4,4]
3913:       d_j     = [2,2,2,3,5,6]
3914:       cooPerm = [2,4,1,0,3,5]
3915:     */
3916:     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
3917:     /*
3918:       d_i     = [1,3,3,4,4,x]
3919:                             ^ekey
3920:       d_j     = [2,2,3,5,6,x]
3921:                            ^nekey
3922:     */
3923:     if (nekey == ekey) { /* all entries are unique */
3924:       delete cusp->cooPerm_a;
3925:       cusp->cooPerm_a = NULL;
3926:     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3927:       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3928:       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3929:       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3930:       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a hipMemcpy */
3931:       w[0]                  = 0;
3932:       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
3933:       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3934:     }
3935:     thrust::counting_iterator<PetscInt> search_begin(0);
3936:     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search the values 0,1,...,A->rmap->n-1 (here 0..5) in the ordered array d_i = [1,3,3,4,4] */
3937:                         search_begin, search_begin + A->rmap->n,  /* for each value, return in ii[] the last position in d_i[] where it could be inserted without violating the ordering */
3938:                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later (see the end-to-end summary after this routine) */
3939:     PetscCall(PetscLogGpuTimeEnd());

3941:     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
3942:     a->singlemalloc = PETSC_FALSE;
3943:     a->free_a       = PETSC_TRUE;
3944:     a->free_ij      = PETSC_TRUE;
3945:     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
3946:     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3947:     PetscCallHIP(hipMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3948:     a->nz = a->maxnz = a->i[A->rmap->n];
3949:     a->rmax          = 0;
3950:     PetscCall(PetscMalloc1(a->nz, &a->a));
3951:     PetscCall(PetscMalloc1(a->nz, &a->j));
3952:     PetscCallHIP(hipMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), hipMemcpyDeviceToHost));
3953:     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
3954:     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
3955:     for (PetscInt i = 0; i < A->rmap->n; i++) {
3956:       const PetscInt nnzr = a->i[i + 1] - a->i[i];
3957:       nzr += (PetscInt) !!(nnzr);
3958:       a->ilen[i] = a->imax[i] = nnzr;
3959:       a->rmax                 = PetscMax(a->rmax, nnzr);
3960:     }
3961:     a->nonzerorowcnt = nzr;
3962:     A->preallocated  = PETSC_TRUE;
3963:     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
3964:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
3965:     if (free_raw_i) PetscCallHIP(hipFree(d_raw_i));
3966:     if (free_raw_j) PetscCallHIP(hipFree(d_raw_j));
3967:   } else PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
3968:   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
3969:   /* We want to allocate the HIPSPARSE struct for matvec now.
3970:      The code paths are convoluted enough that it is simpler to copy zeros */
3971:   PetscCall(PetscArrayzero(a->a, a->nz));
3972:   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
3973:   A->offloadmask = PETSC_OFFLOAD_CPU;
3974:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3975:   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
3976:   PetscFunctionReturn(PETSC_SUCCESS);
3977: }
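
/* End-to-end summary of the example traced in the comments above
   (sort_by_key -> unique -> adjacent_difference/inclusive_scan -> upper_bound); illustrative only:

     input:  coo_i = [3,3,1,4,1,4], coo_j = [3,2,2,5,2,6]           (n = 6, A->rmap->n = 6)
     output: cooPerm   = [2,4,1,0,3,5]    position in the input of each sorted entry
             cooPerm_a = [0,0,1,2,3,4]    unique-nonzero id of each sorted entry
             a->i      = [0,0,1,1,3,5,5]  CSR row offsets (5 unique nonzeros)
             a->j      = [2,2,3,5,6]      CSR column indices

   The HIPSPARSE structure is then built from this (still zero-valued) CSR pattern; the values
   are filled later by MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(). */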

3979: PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3980: {
3981:   Mat_SeqAIJ          *seq;
3982:   Mat_SeqAIJHIPSPARSE *dev;
3983:   PetscBool            coo_basic = PETSC_TRUE;
3984:   PetscMemType         mtype     = PETSC_MEMTYPE_DEVICE;

3986:   PetscFunctionBegin;
3987:   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
3988:   PetscCall(MatResetPreallocationCOO_SeqAIJHIPSPARSE(mat));
3989:   if (coo_i) {
3990:     PetscCall(PetscGetMemType(coo_i, &mtype));
3991:     if (PetscMemTypeHost(mtype)) {
3992:       for (PetscCount k = 0; k < coo_n; k++) {
3993:         if (coo_i[k] < 0 || coo_j[k] < 0) {
3994:           coo_basic = PETSC_FALSE;
3995:           break;
3996:         }
3997:       }
3998:     }
3999:   }

4001:   if (coo_basic) { /* i,j are on device or do not contain negative indices */
4002:     PetscCall(MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(mat, coo_n, coo_i, coo_j));
4003:   } else {
4004:     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
4005:     mat->offloadmask = PETSC_OFFLOAD_CPU;
4006:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));
4007:     seq = static_cast<Mat_SeqAIJ *>(mat->data);
4008:     dev = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);
4009:     PetscCallHIP(hipMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
4010:     PetscCallHIP(hipMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
4011:     PetscCallHIP(hipMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
4012:     PetscCallHIP(hipMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));
4013:     dev->use_extended_coo = PETSC_TRUE;
4014:   }
4015:   PetscFunctionReturn(PETSC_SUCCESS);
4016: }

4018: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4019: {
4020:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4021:   const PetscCount grid_size = gridDim.x * blockDim.x;
4022:   for (; i < nnz; i += grid_size) {
4023:     PetscScalar sum = 0.0;
4024:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4025:     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4026:   }
4027: }
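
/* Hedged worked example for the kernel above (the arrays are made up, but follow the layout of
   seq->jmap and seq->perm built by MatSetPreallocationCOO_SeqAIJ()). With nnz = 3 and

     jmap = [0,2,3,5]     jmap[i]..jmap[i+1] delimits the COO input entries of nonzero i
     perm = [4,0,2,1,3]   perm[k] is the index of that entry in the user-provided kv[]

   the thread handling nonzero 0 computes sum = kv[4] + kv[0], nonzero 1 gets kv[2], and
   nonzero 2 gets kv[1] + kv[3]; with INSERT_VALUES the sum replaces a[i], with ADD_VALUES it
   is added to it.  The grid-stride loop lets a fixed-size launch cover any nnz. */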

4029: PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4030: {
4031:   Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
4032:   Mat_SeqAIJHIPSPARSE *dev  = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4033:   PetscCount           Annz = seq->nz;
4034:   PetscMemType         memtype;
4035:   const PetscScalar   *v1 = v;
4036:   PetscScalar         *Aa;

4038:   PetscFunctionBegin;
4039:   if (dev->use_extended_coo) {
4040:     PetscCall(PetscGetMemType(v, &memtype));
4041:     if (PetscMemTypeHost(memtype)) { /* if the user provided v[] on the host, copy it to the device */
4042:       PetscCallHIP(hipMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
4043:       PetscCallHIP(hipMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), hipMemcpyHostToDevice));
4044:     }

4046:     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
4047:     else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));

4049:     if (Annz) {
4050:       hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
4051:       PetscCallHIP(hipPeekAtLastError());
4052:     }

4054:     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
4055:     else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));

4057:     if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
4058:   } else {
4059:     PetscCall(MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(A, v, imode));
4060:   }
4061:   PetscFunctionReturn(PETSC_SUCCESS);
4062: }
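
/* Hedged user-level sketch (not part of this file): the public entry points
   MatSetPreallocationCOO() and MatSetValuesCOO() dispatch to the routines above for a
   MATSEQAIJHIPSPARSE matrix.  Sizes and values below are illustrative. */
#if 0
  Mat         A;
  PetscInt    coo_i[] = {3, 3, 1, 4, 1, 4};
  PetscInt    coo_j[] = {3, 2, 2, 5, 2, 6};
  PetscScalar v[]     = {10, 20, 30, 40, 50, 60};

  PetscCall(MatCreate(PETSC_COMM_SELF, &A));
  PetscCall(MatSetSizes(A, 6, 7, 6, 7));
  PetscCall(MatSetType(A, MATSEQAIJHIPSPARSE));
  PetscCall(MatSetPreallocationCOO(A, 6, coo_i, coo_j)); /* builds the nonzero pattern on the GPU */
  PetscCall(MatSetValuesCOO(A, v, ADD_VALUES));          /* the two (1,2) entries are summed */
  PetscCall(MatDestroy(&A));
#endif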

4064: /*@C
4065:     MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.

4067:     Not Collective

4069:     Input Parameters:
4070: +   A - the matrix
4071: -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should be returned in compressed form

4073:     Output Parameters:
4074: +   i - the CSR row pointers
4075: -   j - the CSR column indices

4077:     Level: developer

4079:     Note:
4080:       When compressed is true, the CSR structure does not contain empty rows

4082: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
4083: @*/
4084: PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4085: {
4086:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4087:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
4088:   CsrMatrix           *csr;

4090:   PetscFunctionBegin;
4092:   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4093:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4094:   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4095:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4096:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4097:   csr = (CsrMatrix *)cusp->mat->mat;
4098:   if (i) {
4099:     if (!compressed && a->compressedrow.use) { /* need full row offset */
4100:       if (!cusp->rowoffsets_gpu) {
4101:         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4102:         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4103:         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4104:       }
4105:       *i = cusp->rowoffsets_gpu->data().get();
4106:     } else *i = csr->row_offsets->data().get();
4107:   }
4108:   if (j) *j = csr->column_indices->data().get();
4109:   PetscFunctionReturn(PETSC_SUCCESS);
4110: }

4112: /*@C
4113:     MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`

4115:     Not Collective

4117:     Input Parameters:
4118: +   A - the matrix
4119: .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should be returned in compressed form
4120: .   i - the CSR row pointers
4121: -   j - the CSR column indices

4123:     Level: developer

4125: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
4126: @*/
4127: PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4128: {
4129:   PetscFunctionBegin;
4131:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4132:   if (i) *i = NULL;
4133:   if (j) *j = NULL;
4134:   PetscFunctionReturn(PETSC_SUCCESS);
4135: }
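
/* Hedged usage sketch for the Get/Restore pair above (A assumed to be an assembled
   MATSEQAIJHIPSPARSE matrix; i and j point to device memory and must not be freed): */
#if 0
  const int *i, *j;
  PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &i, &j)); /* uncompressed: i has A->rmap->n + 1 entries */
  /* ... read i[]/j[] from a kernel or a hipSPARSE call ... */
  PetscCall(MatSeqAIJHIPSPARSERestoreIJ(A, PETSC_FALSE, &i, &j));
#endif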

4137: /*@C
4138:    MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

4140:    Not Collective

4142:    Input Parameter:
4143: .   A - a `MATSEQAIJHIPSPARSE` matrix

4145:    Output Parameter:
4146: .   a - pointer to the device data

4148:    Level: developer

4150:    Note:
4151:    May trigger host-device copies if the up-to-date matrix data is on host

4153: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
4154: @*/
4155: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4156: {
4157:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4158:   CsrMatrix           *csr;

4160:   PetscFunctionBegin;
4163:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4164:   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4165:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4166:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4167:   csr = (CsrMatrix *)cusp->mat->mat;
4168:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
4169:   *a = csr->values->data().get();
4170:   PetscFunctionReturn(PETSC_SUCCESS);
4171: }

4173: /*@C
4174:    MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`

4176:    Not Collective

4178:    Input Parameters:
4179: +   A - a `MATSEQAIJHIPSPARSE` matrix
4180: -   a - pointer to the device data

4182:    Level: developer

4184: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
4185: @*/
4186: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4187: {
4188:   PetscFunctionBegin;
4191:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4192:   *a = NULL;
4193:   PetscFunctionReturn(PETSC_SUCCESS);
4194: }
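
/* Hedged usage sketch for the read-only accessor pair above (A assumed MATSEQAIJHIPSPARSE): */
#if 0
  const PetscScalar *av;
  PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
  /* av points to the device CSR values; hand it to a kernel, hipSPARSE, or Thrust, but do not modify it */
  PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
#endif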

4196: /*@C
4197:    MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

4199:    Not Collective

4201:    Input Parameter:
4202: .   A - a `MATSEQAIJHIPSPARSE` matrix

4204:    Output Parameter:
4205: .   a - pointer to the device data

4207:    Level: developer

4209:    Note:
4210:    May trigger host-device copies if up-to-date matrix data is on host

4212: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
4213: @*/
4214: PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar **a)
4215: {
4216:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4217:   CsrMatrix           *csr;

4219:   PetscFunctionBegin;
4222:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4223:   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4224:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4225:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4226:   csr = (CsrMatrix *)cusp->mat->mat;
4227:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
4228:   *a             = csr->values->data().get();
4229:   A->offloadmask = PETSC_OFFLOAD_GPU;
4230:   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
4231:   PetscFunctionReturn(PETSC_SUCCESS);
4232: }
4233: /*@C
4234:    MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`

4236:    Not Collective

4238:    Input Parameters:
4239: +   A - a `MATSEQAIJHIPSPARSE` matrix
4240: -   a - pointer to the device data

4242:    Level: developer

4244: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4245: @*/
4246: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar **a)
4247: {
4248:   PetscFunctionBegin;
4251:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4252:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4253:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4254:   *a = NULL;
4255:   PetscFunctionReturn(PETSC_SUCCESS);
4256: }

4258: /*@C
4259:    MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

4261:    Not Collective

4263:    Input Parameter:
4264: .   A - a `MATSEQAIJHIPSPARSE` matrix

4266:    Output Parameter:
4267: .   a - pointer to the device data

4269:    Level: developer

4271:    Note:
4272:    Does not trigger host-device copies and flags data validity on the GPU

4274: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4275: @*/
4276: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4277: {
4278:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4279:   CsrMatrix           *csr;

4281:   PetscFunctionBegin;
4284:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4285:   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4286:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4287:   csr = (CsrMatrix *)cusp->mat->mat;
4288:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
4289:   *a             = csr->values->data().get();
4290:   A->offloadmask = PETSC_OFFLOAD_GPU;
4291:   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
4292:   PetscFunctionReturn(PETSC_SUCCESS);
4293: }

4295: /*@C
4296:    MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`

4298:    Not Collective

4300:    Input Parameters:
4301: +   A - a `MATSEQAIJHIPSPARSE` matrix
4302: -   a - pointer to the device data

4304:    Level: developer

4306: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4307: @*/
4308: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4309: {
4310:   PetscFunctionBegin;
4313:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4314:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4315:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4316:   *a = NULL;
4317:   PetscFunctionReturn(PETSC_SUCCESS);
4318: }
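
/* Hedged sketch contrasting the three accessors above (A assumed MATSEQAIJHIPSPARSE; nnz is an
   illustrative variable holding the number of stored nonzeros):
     - GetArrayRead:  copies to the GPU if needed; the values must not be modified
     - GetArray:      copies to the GPU if needed; the values may be modified (transpose invalidated)
     - GetArrayWrite: no copy; every value is assumed to be overwritten (transpose invalidated) */
#if 0
  PetscScalar *va;
  PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &va));
  PetscCallHIP(hipMemset(va, 0, nnz * sizeof(PetscScalar))); /* overwrite all values on the device */
  PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &va));
#endif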

4320: struct IJCompare4 {
4321:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4322:   {
4323:     if (t1.get<0>() < t2.get<0>()) return true;
4324:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4325:     return false;
4326:   }
4327: };

4329: struct Shift {
4330:   int _shift;

4332:   Shift(int shift) : _shift(shift) { }
4333:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4334: };
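
/* Hedged sketch of how Shift is used below: a transform_iterator presents B's column indices
   shifted by A->cmap->n without materializing a shifted copy (names illustrative): */
#if 0
  thrust::device_vector<int> cols(3, 0); /* stand-in for B's column indices */
  auto shifted = thrust::make_transform_iterator(cols.begin(), Shift(10));
  /* *shifted == cols[0] + 10; nothing is written back to cols */
#endif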

4336: /* Merges two SeqAIJHIPSPARSE matrices A and B side by side, i.e. C = [A B]: each row of C is the corresponding row of A followed by that of B ([A';B']' in MATLAB notation) */
4337: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4338: {
4339:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4340:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4341:   Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4342:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
4343:   PetscInt                       Annz, Bnnz;
4344:   PetscInt                       i, m, n, zero = 0;

4346:   PetscFunctionBegin;
4350:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4351:   PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
4352:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4353:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4354:   PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4355:   PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4356:   if (reuse == MAT_INITIAL_MATRIX) {
4357:     m = A->rmap->n;
4358:     n = A->cmap->n + B->cmap->n;
4359:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4360:     PetscCall(MatSetSizes(*C, m, n, m, n));
4361:     PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
4362:     c                       = (Mat_SeqAIJ *)(*C)->data;
4363:     Ccusp                   = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4364:     Cmat                    = new Mat_SeqAIJHIPSPARSEMultStruct;
4365:     Ccsr                    = new CsrMatrix;
4366:     Cmat->cprowIndices      = NULL;
4367:     c->compressedrow.use    = PETSC_FALSE;
4368:     c->compressedrow.nrows  = 0;
4369:     c->compressedrow.i      = NULL;
4370:     c->compressedrow.rindex = NULL;
4371:     Ccusp->workVector       = NULL;
4372:     Ccusp->nrows            = m;
4373:     Ccusp->mat              = Cmat;
4374:     Ccusp->mat->mat         = Ccsr;
4375:     Ccsr->num_rows          = m;
4376:     Ccsr->num_cols          = n;
4377:     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
4378:     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
4379:     PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4380:     PetscCallHIP(hipMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
4381:     PetscCallHIP(hipMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
4382:     PetscCallHIP(hipMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4383:     PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4384:     PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4385:     PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4386:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4387:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4388:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4389:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");

4391:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4392:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4393:     Annz                 = (PetscInt)Acsr->column_indices->size();
4394:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4395:     c->nz                = Annz + Bnnz;
4396:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4397:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4398:     Ccsr->values         = new THRUSTARRAY(c->nz);
4399:     Ccsr->num_entries    = c->nz;
4400:     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4401:     if (c->nz) {
4402:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4403:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4404:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4405:       THRUSTINTARRAY32 *Aroff, *Broff;

4407:       if (a->compressedrow.use) { /* need full row offset */
4408:         if (!Acusp->rowoffsets_gpu) {
4409:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4410:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4411:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4412:         }
4413:         Aroff = Acusp->rowoffsets_gpu;
4414:       } else Aroff = Acsr->row_offsets;
4415:       if (b->compressedrow.use) { /* need full row offset */
4416:         if (!Bcusp->rowoffsets_gpu) {
4417:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4418:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4419:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4420:         }
4421:         Broff = Bcusp->rowoffsets_gpu;
4422:       } else Broff = Bcsr->row_offsets;
4423:       PetscCall(PetscLogGpuTimeBegin());
4424:       PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4425:       PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4426:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4427:       auto Aperm = thrust::make_constant_iterator(1);
4428:       auto Bperm = thrust::make_constant_iterator(0);
4429:       auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4430:       auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4431:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4432:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4433:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4434:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4435:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4436:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4437:       auto p1    = Ccusp->cooPerm->begin();
4438:       auto p2    = Ccusp->cooPerm->begin();
4439:       thrust::advance(p2, Annz);
4440:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4441:       auto cci = thrust::make_counting_iterator(zero);
4442:       auto cce = thrust::make_counting_iterator(c->nz);
4443: #if 0 //Errors on SUMMIT cuda 11.1.0
4444:       PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
4445: #else
4446:       auto pred = thrust::identity<int>();
4447:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4448:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4449: #endif
4450:       PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4451:       PetscCall(PetscLogGpuTimeEnd());
4452:       delete wPerm;
4453:       delete Acoo;
4454:       delete Bcoo;
4455:       delete Ccoo;
4456:       PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));

4458:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B both keep explicit transposes, generate the transpose of C too */
4459:         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4460:         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4461:         PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4462:         Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4463:         CsrMatrix                     *CcsrT = new CsrMatrix;
4464:         CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4465:         CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4467:         (*C)->form_explicit_transpose = PETSC_TRUE;
4468:         (*C)->transupdated            = PETSC_TRUE;
4469:         Ccusp->rowoffsets_gpu         = NULL;
4470:         CmatT->cprowIndices           = NULL;
4471:         CmatT->mat                    = CcsrT;
4472:         CcsrT->num_rows               = n;
4473:         CcsrT->num_cols               = m;
4474:         CcsrT->num_entries            = c->nz;
4475:         CcsrT->row_offsets            = new THRUSTINTARRAY32(n + 1);
4476:         CcsrT->column_indices         = new THRUSTINTARRAY32(c->nz);
4477:         CcsrT->values                 = new THRUSTARRAY(c->nz);

4479:         PetscCall(PetscLogGpuTimeBegin());
4480:         auto rT = CcsrT->row_offsets->begin();
4481:         if (AT) {
4482:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4483:           thrust::advance(rT, -1);
4484:         }
4485:         if (BT) {
4486:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4487:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4488:           thrust::copy(titb, tite, rT);
4489:         }
4490:         auto cT = CcsrT->column_indices->begin();
4491:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4492:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4493:         auto vT = CcsrT->values->begin();
4494:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4495:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4496:         PetscCall(PetscLogGpuTimeEnd());

4498:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4499:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4500:         PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4501:         PetscCallHIP(hipMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
4502:         PetscCallHIP(hipMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
4503:         PetscCallHIP(hipMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
4504:         PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4505:         PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4506:         PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

4508:         PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4509:         Ccusp->matTranspose = CmatT;
4510:       }
4511:     }

4513:     c->singlemalloc = PETSC_FALSE;
4514:     c->free_a       = PETSC_TRUE;
4515:     c->free_ij      = PETSC_TRUE;
4516:     PetscCall(PetscMalloc1(m + 1, &c->i));
4517:     PetscCall(PetscMalloc1(c->nz, &c->j));
4518:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4519:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4520:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4521:       ii = *Ccsr->row_offsets;
4522:       jj = *Ccsr->column_indices;
4523:       PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4524:       PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4525:     } else {
4526:       PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4527:       PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4528:     }
4529:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4530:     PetscCall(PetscMalloc1(m, &c->ilen));
4531:     PetscCall(PetscMalloc1(m, &c->imax));
4532:     c->maxnz         = c->nz;
4533:     c->nonzerorowcnt = 0;
4534:     c->rmax          = 0;
4535:     for (i = 0; i < m; i++) {
4536:       const PetscInt nn = c->i[i + 1] - c->i[i];
4537:       c->ilen[i] = c->imax[i] = nn;
4538:       c->nonzerorowcnt += (PetscInt) !!nn;
4539:       c->rmax = PetscMax(c->rmax, nn);
4540:     }
4541:     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4542:     PetscCall(PetscMalloc1(c->nz, &c->a));
4543:     (*C)->nonzerostate++;
4544:     PetscCall(PetscLayoutSetUp((*C)->rmap));
4545:     PetscCall(PetscLayoutSetUp((*C)->cmap));
4546:     Ccusp->nonzerostate = (*C)->nonzerostate;
4547:     (*C)->preallocated  = PETSC_TRUE;
4548:   } else {
4549:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4550:     c = (Mat_SeqAIJ *)(*C)->data;
4551:     if (c->nz) {
4552:       Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4553:       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4554:       PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4555:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4556:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4557:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4558:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4559:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4560:       Acsr = (CsrMatrix *)Acusp->mat->mat;
4561:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4562:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4563:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4564:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4565:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4566:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4567:       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4568:       auto pmid = Ccusp->cooPerm->begin();
4569:       thrust::advance(pmid, Acsr->num_entries);
4570:       PetscCall(PetscLogGpuTimeBegin());
4571:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
4572:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4573:       thrust::for_each(zibait, zieait, VecHIPEquals());
4574:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4575:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4576:       thrust::for_each(zibbit, ziebit, VecHIPEquals());
4577:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4578:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4579:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4580:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4581:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4582:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4583:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4584:         auto       vT    = CcsrT->values->begin();
4585:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4586:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4587:         (*C)->transupdated = PETSC_TRUE;
4588:       }
4589:       PetscCall(PetscLogGpuTimeEnd());
4590:     }
4591:   }
4592:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4593:   (*C)->assembled     = PETSC_TRUE;
4594:   (*C)->was_assembled = PETSC_FALSE;
4595:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4596:   PetscFunctionReturn(PETSC_SUCCESS);
4597: }
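
/* Hedged usage sketch for MatSeqAIJHIPSPARSEMergeMats() (A and B assumed MATSEQAIJHIPSPARSE with
   the same number of rows): */
#if 0
  Mat C;
  PetscCall(MatSeqAIJHIPSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); /* C = [A B] */
  /* ... after updating the numerical values (same pattern) of A and B ... */
  PetscCall(MatSeqAIJHIPSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));   /* refresh C's values */
  PetscCall(MatDestroy(&C));
#endif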

4599: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4600: {
4601:   bool               dmem;
4602:   const PetscScalar *av;

4604:   PetscFunctionBegin;
4605:   dmem = isHipMem(v);
4606:   PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4607:   if (n && idx) {
4608:     THRUSTINTARRAY widx(n);
4609:     widx.assign(idx, idx + n);
4610:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

4612:     THRUSTARRAY                    *w = NULL;
4613:     thrust::device_ptr<PetscScalar> dv;
4614:     if (dmem) dv = thrust::device_pointer_cast(v);
4615:     else {
4616:       w  = new THRUSTARRAY(n);
4617:       dv = w->data();
4618:     }
4619:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4621:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4622:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4623:     thrust::for_each(zibit, zieit, VecHIPEquals());
4624:     if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4625:     delete w;
4626:   } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));

4628:   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4629:   PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4630:   PetscFunctionReturn(PETSC_SUCCESS);
4631: }
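
/* Hedged sketch of what the routine above computes (idx[] is a host array selecting positions in
   the CSR value array; v[] may live in host or device memory; names illustrative): */
#if 0
  const PetscInt idx[3] = {0, 3, 4};
  PetscScalar    vals[3];
  PetscCall(MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(A, 3, idx, vals)); /* vals[k] = csr_values[idx[k]] */
#endif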