Actual source code: aijhipsparse.hip.cpp
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the HIPSPARSE library,
4: Portions of this code are under:
5: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6: */
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #include <thrust/iterator/transform_iterator.h>
16: #if PETSC_CPP_VERSION >= 14
17: #define PETSC_HAVE_THRUST_ASYNC 1
18: #include <thrust/async/for_each.h>
19: #endif
20: #include <thrust/iterator/constant_iterator.h>
21: #include <thrust/iterator/discard_iterator.h>
22: #include <thrust/binary_search.h>
23: #include <thrust/remove.h>
24: #include <thrust/sort.h>
25: #include <thrust/unique.h>
/* Option-name tables consumed by PetscOptionsEnum() in MatSetFromOptions_SeqAIJHIPSPARSE():
   the leading entries are the selectable values (their array positions define the PetscEnum
   numbering), followed by the enum type name, the option prefix, and a null terminator. */
const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
/* Positions mirror hipsparseSpMVAlg_t; MatSetFromOptions checks HIPSPARSE_CSRMV_ALG1 == 2 to detect if hipSPARSE renumbers the enum */
const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
/* Positions mirror hipsparseSpMMAlg_t; MatSetFromOptions checks HIPSPARSE_SPMM_CSR_ALG1 == 4 for the same reason */
const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
//const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
32: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
33: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
34: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
35: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
36: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
37: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
38: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
39: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
40: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
41: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
42: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
43: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
44: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
45: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
46: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
47: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
48: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
49: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
50: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
51: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
52: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
53: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
54: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
55: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
56: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat_SeqAIJHIPSPARSE **);
57: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
58: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
59: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
60: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
61: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
62: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
63: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);
65: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(Mat, Mat, Mat, PetscBool, PetscBool);
66: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
67: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
68: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse_band(Mat, MatFactorType, Mat *);
70: /*
71: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
72: {
73: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
75: PetscFunctionBegin;
76: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
77: hipsparsestruct->stream = stream;
78: PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
79: PetscFunctionReturn(PETSC_SUCCESS);
80: }
82: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
83: {
84: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
86: PetscFunctionBegin;
87: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
88: if (hipsparsestruct->handle != handle) {
89: if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
90: hipsparsestruct->handle = handle;
91: }
92: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
93: PetscFunctionReturn(PETSC_SUCCESS);
94: }
96: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
97: {
98: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
99: PetscBool flg;
101: PetscFunctionBegin;
102: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
103: if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
104: if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
105: PetscFunctionReturn(PETSC_SUCCESS);
106: }
107: */
109: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
110: {
111: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
113: PetscFunctionBegin;
114: switch (op) {
115: case MAT_HIPSPARSE_MULT:
116: hipsparsestruct->format = format;
117: break;
118: case MAT_HIPSPARSE_ALL:
119: hipsparsestruct->format = format;
120: break;
121: default:
122: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
123: }
124: PetscFunctionReturn(PETSC_SUCCESS);
125: }
127: /*@
128: MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
129: operation. Only the `MatMult()` operation can use different GPU storage formats
131: Not Collective
133: Input Parameters:
134: + A - Matrix of type `MATSEQAIJHIPSPARSE`
135: . op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
136: `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
137: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)
139: Level: intermediate
141: .seealso: [](chapter_matrices), `Mat`, `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
142: @*/
143: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
144: {
145: PetscFunctionBegin;
147: PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
148: PetscFunctionReturn(PETSC_SUCCESS);
149: }
151: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
152: {
153: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
155: PetscFunctionBegin;
156: hipsparsestruct->use_cpu_solve = use_cpu;
157: PetscFunctionReturn(PETSC_SUCCESS);
158: }
160: /*@
161: MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.
163: Input Parameters:
164: + A - Matrix of type `MATSEQAIJHIPSPARSE`
165: - use_cpu - set flag for using the built-in CPU `MatSolve()`
167: Level: intermediate
169: Notes:
170: The hipSparse LU solver currently computes the factors with the built-in CPU method
171: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies whether the solve is done on the CPU or GPU (GPU is the default).
174: .seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
175: @*/
176: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
177: {
178: PetscFunctionBegin;
180: PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
181: PetscFunctionReturn(PETSC_SUCCESS);
182: }
184: PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
185: {
186: PetscFunctionBegin;
187: switch (op) {
188: case MAT_FORM_EXPLICIT_TRANSPOSE:
189: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
190: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
191: A->form_explicit_transpose = flg;
192: break;
193: default:
194: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
195: break;
196: }
197: PetscFunctionReturn(PETSC_SUCCESS);
198: }
200: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
201: {
202: PetscBool row_identity, col_identity;
203: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
204: IS isrow = b->row, iscol = b->col;
205: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;
207: PetscFunctionBegin;
208: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
209: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
210: B->offloadmask = PETSC_OFFLOAD_CPU;
211: /* determine which version of MatSolve needs to be used. */
212: PetscCall(ISIdentity(isrow, &row_identity));
213: PetscCall(ISIdentity(iscol, &col_identity));
214: if (!hipsparsestruct->use_cpu_solve) {
215: if (row_identity && col_identity) {
216: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
217: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
218: } else {
219: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
220: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
221: }
222: }
223: B->ops->matsolve = NULL;
224: B->ops->matsolvetranspose = NULL;
226: /* get the triangular factors */
227: if (!hipsparsestruct->use_cpu_solve) { PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B)); }
228: PetscFunctionReturn(PETSC_SUCCESS);
229: }
/* Process the -mat_hipsparse_* runtime options: GPU storage format, CPU-vs-GPU triangular
   solve, and the hipSPARSE SpMV/SpMM algorithm choices. Only unfactored matrices expose
   these options. */
static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatHIPSPARSEStorageFormat format;
  PetscBool                 flg;
  Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
    /* format for all operations */
    PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
    PetscCall(
      PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
    PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
    PetscCall(
      PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
    /* same position-vs-enum consistency check as for SpMV above */
    PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
    /*
    PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
    */
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Build (or refresh) the unit-diagonal lower triangular factor L on the GPU from the host
  factored matrix A.  The strictly-lower entries stored in A's AIJ arrays are copied row by
  row with an explicit 1.0 inserted on each diagonal, the result is uploaded into a CsrMatrix,
  and the hipSPARSE triangular-solve analysis is performed once.  On subsequent calls (same
  sparsity pattern) only the numerical values are re-uploaded.
*/
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  PetscInt                            n                   = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  const PetscInt                     *ai = a->i, *aj = a->j, *vi;
  const MatScalar                    *aa = a->a, *v;
  PetscInt                           *AiLo, *AjLo;
  PetscInt                            i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): assumes ai indexes the strictly-lower rows of PETSc's factored-AIJ layout (row 0 has none): n unit diagonals plus the entries of rows 1..n-1 */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) { /* first build: stage on the host, upload, and run the solve analysis */
        PetscScalar *AALo;

        /* pinned host buffers so the thrust assign() uploads below are fast */
        PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 is just the unit diagonal */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v       = aa;
        vi      = aj;
        offset  = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strict-lower entries of row i, then append its unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;
          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description: lower fill mode with a unit diagonal */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        /* host -> device uploads */
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for later value-only refreshes; index staging buffers are done */
        loTriFactor->AA_h = AALo;
        PetscCallHIP(hipHostFree(AiLo));
        PetscCallHIP(hipHostFree(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix: same layout as the first build, values only */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Build (or refresh) the upper triangular factor U on the GPU from the host factored
  matrix A.  Rows are walked backwards using a->diag, since in PETSc's factored-AIJ
  layout each row's U entries sit between adiag[i+1]+1 and adiag[i].  The assembled CSR
  is uploaded and the hipSPARSE triangular-solve analysis is performed once.  On
  subsequent calls (same sparsity pattern) only the numerical values are re-uploaded.
*/
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  PetscInt                            n                   = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  const PetscInt                     *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                    *aa = a->a, *v;
  PetscInt                           *AiUp, *AjUp;
  PetscInt                            i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* NOTE(review): adiag[] decreases with i in the factored layout, so this difference spans all U entries */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first build: stage on the host, upload, and run the solve analysis */
        PetscScalar *AAUp;

        /* pinned host buffers so the thrust assign() uploads below are fast */
        PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* row i of U lives between adiag[i+1]+1 and adiag[i] in the factored storage */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* reciprocal of the stored diagonal term for the NON_UNIT factor */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description: upper fill mode with a non-unit diagonal */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        /* host -> device uploads */
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for later value-only refreshes; index staging buffers are done */
        upTriFactor->AA_h = AAUp;
        PetscCallHIP(hipHostFree(AiUp));
        PetscCallHIP(hipHostFree(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix: same layout as the first build, values only */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
496: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
497: {
498: PetscBool row_identity, col_identity;
499: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
500: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
501: IS isrow = a->row, iscol = a->icol;
502: PetscInt n = A->rmap->n;
504: PetscFunctionBegin;
505: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
506: PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
507: PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));
509: if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
510: hipsparseTriFactors->nnz = a->nz;
512: A->offloadmask = PETSC_OFFLOAD_BOTH;
513: /* lower triangular indices */
514: PetscCall(ISIdentity(isrow, &row_identity));
515: if (!row_identity && !hipsparseTriFactors->rpermIndices) {
516: const PetscInt *r;
518: PetscCall(ISGetIndices(isrow, &r));
519: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
520: hipsparseTriFactors->rpermIndices->assign(r, r + n);
521: PetscCall(ISRestoreIndices(isrow, &r));
522: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
523: }
524: /* upper triangular indices */
525: PetscCall(ISIdentity(iscol, &col_identity));
526: if (!col_identity && !hipsparseTriFactors->cpermIndices) {
527: const PetscInt *c;
529: PetscCall(ISGetIndices(iscol, &c));
530: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
531: hipsparseTriFactors->cpermIndices->assign(c, c + n);
532: PetscCall(ISRestoreIndices(iscol, &c));
533: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
534: }
535: PetscFunctionReturn(PETSC_SUCCESS);
536: }
538: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
539: {
540: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
541: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
542: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
543: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
544: PetscInt *AiUp, *AjUp;
545: PetscScalar *AAUp;
546: PetscScalar *AALo;
547: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
548: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
549: const PetscInt *ai = b->i, *aj = b->j, *vj;
550: const MatScalar *aa = b->a, *v;
552: PetscFunctionBegin;
553: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
554: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
555: try {
556: PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
557: PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
558: if (!upTriFactor && !loTriFactor) {
559: /* Allocate Space for the upper triangular matrix */
560: PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
561: PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
563: /* Fill the upper triangular matrix */
564: AiUp[0] = (PetscInt)0;
565: AiUp[n] = nzUpper;
566: offset = 0;
567: for (i = 0; i < n; i++) {
568: /* set the pointers */
569: v = aa + ai[i];
570: vj = aj + ai[i];
571: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
573: /* first, set the diagonal elements */
574: AjUp[offset] = (PetscInt)i;
575: AAUp[offset] = (MatScalar)1.0 / v[nz];
576: AiUp[i] = offset;
577: AALo[offset] = (MatScalar)1.0 / v[nz];
579: offset += 1;
580: if (nz > 0) {
581: PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
582: PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
583: for (j = offset; j < offset + nz; j++) {
584: AAUp[j] = -AAUp[j];
585: AALo[j] = AAUp[j] / v[nz];
586: }
587: offset += nz;
588: }
589: }
591: /* allocate space for the triangular factor information */
592: PetscCall(PetscNew(&upTriFactor));
593: upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
595: /* Create the matrix description */
596: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
597: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
598: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
599: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
600: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
602: /* set the matrix */
603: upTriFactor->csrMat = new CsrMatrix;
604: upTriFactor->csrMat->num_rows = A->rmap->n;
605: upTriFactor->csrMat->num_cols = A->cmap->n;
606: upTriFactor->csrMat->num_entries = a->nz;
607: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
608: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
609: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
610: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
611: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
612: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
614: /* set the operation */
615: upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
617: /* Create the solve analysis information */
618: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
619: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
620: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
621: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
622: PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
624: /* perform the solve analysis */
625: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
626: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
628: PetscCallHIP(WaitForHIP());
629: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
631: /* assign the pointer */
632: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
634: /* allocate space for the triangular factor information */
635: PetscCall(PetscNew(&loTriFactor));
636: loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
638: /* Create the matrix description */
639: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
640: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
641: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
642: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
643: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
645: /* set the operation */
646: loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;
648: /* set the matrix */
649: loTriFactor->csrMat = new CsrMatrix;
650: loTriFactor->csrMat->num_rows = A->rmap->n;
651: loTriFactor->csrMat->num_cols = A->cmap->n;
652: loTriFactor->csrMat->num_entries = a->nz;
653: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
654: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
655: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
656: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
657: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
658: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
660: /* Create the solve analysis information */
661: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
662: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
663: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
664: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
665: PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
667: /* perform the solve analysis */
668: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
669: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
671: PetscCallHIP(WaitForHIP());
672: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
674: /* assign the pointer */
675: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
677: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
678: PetscCallHIP(hipHostFree(AiUp));
679: PetscCallHIP(hipHostFree(AjUp));
680: } else {
681: /* Fill the upper triangular matrix */
682: offset = 0;
683: for (i = 0; i < n; i++) {
684: /* set the pointers */
685: v = aa + ai[i];
686: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
688: /* first, set the diagonal elements */
689: AAUp[offset] = 1.0 / v[nz];
690: AALo[offset] = 1.0 / v[nz];
692: offset += 1;
693: if (nz > 0) {
694: PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
695: for (j = offset; j < offset + nz; j++) {
696: AAUp[j] = -AAUp[j];
697: AALo[j] = AAUp[j] / v[nz];
698: }
699: offset += nz;
700: }
701: }
702: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
703: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
704: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
705: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
706: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
707: }
708: PetscCallHIP(hipHostFree(AAUp));
709: PetscCallHIP(hipHostFree(AALo));
710: } catch (char *ex) {
711: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
712: }
713: }
714: PetscFunctionReturn(PETSC_SUCCESS);
715: }
717: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
718: {
719: PetscBool perm_identity;
720: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
721: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
722: IS ip = a->row;
723: PetscInt n = A->rmap->n;
725: PetscFunctionBegin;
726: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
727: PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
728: if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
729: hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
731: A->offloadmask = PETSC_OFFLOAD_BOTH;
732: /* lower triangular indices */
733: PetscCall(ISIdentity(ip, &perm_identity));
734: if (!perm_identity) {
735: IS iip;
736: const PetscInt *irip, *rip;
738: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
739: PetscCall(ISGetIndices(iip, &irip));
740: PetscCall(ISGetIndices(ip, &rip));
741: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
742: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
743: hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
744: hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
745: PetscCall(ISRestoreIndices(iip, &irip));
746: PetscCall(ISDestroy(&iip));
747: PetscCall(ISRestoreIndices(ip, &rip));
748: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
749: }
750: PetscFunctionReturn(PETSC_SUCCESS);
751: }
753: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
754: {
755: PetscBool perm_identity;
756: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
757: IS ip = b->row;
759: PetscFunctionBegin;
760: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
761: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
762: B->offloadmask = PETSC_OFFLOAD_CPU;
763: /* determine which version of MatSolve needs to be used. */
764: PetscCall(ISIdentity(ip, &perm_identity));
765: if (perm_identity) {
766: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
767: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
768: B->ops->matsolve = NULL;
769: B->ops->matsolvetranspose = NULL;
770: } else {
771: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
772: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
773: B->ops->matsolve = NULL;
774: B->ops->matsolvetranspose = NULL;
775: }
777: /* get the triangular factors */
778: PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
779: PetscFunctionReturn(PETSC_SUCCESS);
780: }
782: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
783: {
784: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
785: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
786: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
787: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
788: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
789: hipsparseIndexBase_t indexBase;
790: hipsparseMatrixType_t matrixType;
791: hipsparseFillMode_t fillMode;
792: hipsparseDiagType_t diagType;
794: PetscFunctionBegin;
795: /* allocate space for the transpose of the lower triangular factor */
796: PetscCall(PetscNew(&loTriFactorT));
797: loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
799: /* set the matrix descriptors of the lower triangular factor */
800: matrixType = hipsparseGetMatType(loTriFactor->descr);
801: indexBase = hipsparseGetMatIndexBase(loTriFactor->descr);
802: fillMode = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
803: diagType = hipsparseGetMatDiagType(loTriFactor->descr);
805: /* Create the matrix description */
806: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
807: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
808: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
809: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
810: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));
812: /* set the operation */
813: loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
815: /* allocate GPU space for the CSC of the lower triangular factor*/
816: loTriFactorT->csrMat = new CsrMatrix;
817: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
818: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
819: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
820: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
821: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
822: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
824: /* compute the transpose of the lower triangular factor, i.e. the CSC */
825: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
826: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
827: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
828: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
829: loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
830: PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
831: #endif
832: */
833: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
835: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows,
836: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
837: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
838: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
839: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
840: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
841: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
842: #else
843: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
844: #endif
846: PetscCallHIP(WaitForHIP());
847: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
849: /* Create the solve analysis information */
850: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
851: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
852: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp,
853: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
854: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
855: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
856: &loTriFactorT->solveBufferSize));
857: PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
859: /* perform the solve analysis */
860: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp,
861: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
862: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
863: loTriFactorT->csrMat->column_indices->data().get(),
864: loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
866: PetscCallHIP(WaitForHIP());
867: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
869: /* assign the pointer */
870: ((Mat_SeqAIJHIPSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
872: /*********************************************/
873: /* Now the Transpose of the Upper Tri Factor */
874: /*********************************************/
876: /* allocate space for the transpose of the upper triangular factor */
877: PetscCall(PetscNew(&upTriFactorT));
878: upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
880: /* set the matrix descriptors of the upper triangular factor */
881: matrixType = hipsparseGetMatType(upTriFactor->descr);
882: indexBase = hipsparseGetMatIndexBase(upTriFactor->descr);
883: fillMode = hipsparseGetMatFillMode(upTriFactor->descr)== HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
884: diagType = hipsparseGetMatDiagType(upTriFactor->descr);
886: /* Create the matrix description */
887: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
888: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
889: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
890: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
891: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));
893: /* set the operation */
894: upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
896: /* allocate GPU space for the CSC of the upper triangular factor*/
897: upTriFactorT->csrMat = new CsrMatrix;
898: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
899: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
900: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
901: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
902: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
903: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
905: /* compute the transpose of the upper triangular factor, i.e. the CSC */
906: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
907: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
908: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
909: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
910: upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
911: PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
912: #endif
913: */
914: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
915: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows,
916: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
917: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
918: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
919: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
920: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
921: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
922: #else
923: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
924: #endif
926: PetscCallHIP(WaitForHIP());
927: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
929: /* Create the solve analysis information */
930: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
931: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
932: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp,
933: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
934: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
935: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
936: &upTriFactorT->solveBufferSize));
937: PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
939: /* perform the solve analysis */
940: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp,
941: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
942: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
943: upTriFactorT->csrMat->column_indices->data().get(),
944: upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
946: PetscCallHIP(WaitForHIP());
947: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
949: /* assign the pointer */
950: ((Mat_SeqAIJHIPSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
951: PetscFunctionReturn(PETSC_SUCCESS);
952: }
/* Thrust functor converting a PetscScalar to a PetscInt by truncating its real
   part; used below to turn scalar-encoded positions (a 0,1,2,... sequence run
   through csr2csc) back into integer permutation indices. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};
/* Form (or refresh) the explicit transpose of A on the GPU, stored in
   hipsparsestruct->matTranspose. For the CSR format the transpose structure is
   built once and subsequent calls only permute the values through a cached
   csr2csc index map; for ELL/HYB the matrix is converted HYB->CSR->CSC->HYB.
   No-op when A->transupdated is already set. */
static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
  hipsparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* transpose already current: nothing to do */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR storage cannot be value-updated in place, rebuild from scratch */
  if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
    matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
    PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
    indexBase = hipsparseGetMatIndexBase(matstruct->descr);
    PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-side scalar constants used by SpMV/SpMM) */
    PetscCallHIP(hipMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
      /* allocate the transpose with swapped row/column dimensions; values are filled below */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* cache A's row offsets on the GPU for the csr2csc below */
      if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

      PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                                            indexBase, hipsparse_scalartype));
    } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                                           tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

      /* Last, convert CSC to HYB */
      hipsparseHybMat_t hybMat;
      PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
      hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
      PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
    }
  }
  if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!hipsparsestruct->csr2csc_i) {
      /* Build the CSR->CSC value permutation once: run csr2csc on the
         sequence 0,1,2,... stored as scalars, then read the permuted
         positions back as integers (see PetscScalarToPetscInt). */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = hipsparseGetMatIndexBase(matstruct->descr);
      if (matrix->num_entries) {
        /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
           Need to verify this for ROCm.
        */
        PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
      } else {
        /* empty matrix: row offsets are all equal to the index base */
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
    }
    /* update transpose values by gathering A's values through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
/* Transpose solve with row/column permutations: permute b, solve with the
   explicitly transposed upper then lower factors, and permute the result back.
   NOTE(review): rpermIndices/cpermIndices are dereferenced without a NULL
   check — this routine appears to be installed only for non-identity orderings
   (see MatCholeskyFactorNumeric_SeqAIJHIPSPARSE); confirm before reusing. */
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first transpose solve only) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x = b[rperm] */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U: x -> tempGPU */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: tempGPU -> x */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering - solve A^T x = b with the triangular factors of A,
  assuming natural (identity) ordering so no row/column permutations of b or x are needed.

  Since A = L*U, A^T = U^T * L^T: first solve U^T y = b into the shared work vector, then L^T x = y.
  The transposed factor structures are built lazily on first use.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJHIPSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                  *barray;
  PetscScalar                        *xarray;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector; /* intermediate y in U^T y = b */

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; re-read the pointers afterwards */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U^T y = b (y stored in tempGPU) */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L^T x = y */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two flops per stored nonzero, minus one per row for the unit diagonal of L */
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolve_SeqAIJHIPSPARSE - solve A x = b with the LU triangular factors of A, applying the
  row and column permutations stored with the factors (general, non-natural ordering case).

  Pipeline: b -> row-permute into work vector -> solve L -> solve U -> column-permute into x.
  The work vector doubles as the intermediate between the two triangular solves.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJHIPSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L (result goes to the x array, used here as scratch) */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                           loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U (result back into the work vector) */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                           upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two flops per stored nonzero, minus one per row for the unit diagonal of L */
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolve_SeqAIJHIPSPARSE_NaturalOrdering - solve A x = b with the LU triangular factors of A,
  assuming natural (identity) ordering so no permutations of b or x are needed.

  Solves L y = b into the shared work vector, then U x = y.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJHIPSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                  *barray;
  PetscScalar                        *xarray;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector; /* intermediate y in L y = b */

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L y = b */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                           loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U x = y */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                           upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two flops per stored nonzero, minus one per row for the unit diagonal of L */
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1261: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1262: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
/*
  MatSolve_SeqAIJHIPSPARSE_ILU0 - solve fact x = b using the ILU(0) factor computed by
  MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0, via the generic hipsparseSpSV API.

  Solves L y = b into the preallocated device work array fs->Y, then U x = y.
  The dense-vector descriptors fs->dnVecDescr_X/Y are rebound to the current arrays
  with hipsparseDnVecSetValues() before each solve.

  Input Parameters:
+ fact - the ILU(0)-factored matrix; fact->spptr holds the Mat_SeqAIJHIPSPARSETriFactors
- b    - the right-hand side vector

  Output Parameter:
. x    - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); /* const cast: X is only read by the L-solve */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!

  /* Solve U*x = y (rebind descriptor X to the output array) */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* two flops per stored nonzero, minus one per row for the unit diagonal of L */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolveTranspose_SeqAIJHIPSPARSE_ILU0 - solve fact^T x = b using the ILU(0) factor.

  Since fact = L*U, fact^T = U^T * L^T: solve U^T y = b into fs->Y, then L^T x = y.
  The same L/U matrix descriptors are reused with HIPSPARSE_OPERATION_TRANSPOSE; the
  transpose-specific SpSV descriptors/buffers are created on first call and the (numeric)
  SpSV analysis is redone whenever the factor values have changed (flag
  fs->updatedTransposeSpSVAnalysis is cleared by the numeric factorization).

  Input Parameters:
+ fact - the ILU(0)-factored matrix; fact->spptr holds the Mat_SeqAIJHIPSPARSETriFactors
- b    - the right-hand side vector

  Output Parameter:
. x    - the solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                                fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* SpSV analysis reads the current factor values, so it must be redone after each numeric factorization */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); /* const cast: X is only read by the Ut-solve */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));

  /* Solve Lt*x = y (rebind descriptor X to the output array) */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two flops per stored nonzero, minus one per row for the unit diagonal of L */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0 - numeric phase of the zero-fill ILU factorization.

  Copies A's values into the factor's device CSR value array (structure was set up in the
  symbolic phase), factors it in place with hipsparseXcsrilu02(), then runs the (numeric)
  SpSV analysis for the L and U solves. On success, installs the ILU0 solve routines.

  Input Parameters:
+ A    - the matrix to factor (must be MATSEQAIJHIPSPARSE with the same pattern as in the symbolic phase)
- info - factorization options (unused here)

  Output Parameter:
. fact - the factor; holds L and U in a single in-place CSR (fs->csrVal)
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix                     *Acsr;
  PetscInt                       m, nz;
  PetscBool                      flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
                                          fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int               numerical_zero;
    hipsparseStatus_t status;
    status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU; /* factor values live only on the device */
  fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1393: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1394: {
1395: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1396: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1397: PetscInt m, nz;
1399: PetscFunctionBegin;
1400: if (PetscDefined(USE_DEBUG)) {
1401: PetscInt i;
1402: PetscBool flg, missing;
1404: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1405: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1406: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1407: PetscCall(MatMissingDiagonal(A, &missing, &i));
1408: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1409: }
1411: /* Free the old stale stuff */
1412: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1414: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1415: but they will not be used. Allocate them just for easy debugging.
1416: */
1417: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1419: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1420: fact->factortype = MAT_FACTOR_ILU;
1421: fact->info.factor_mallocs = 0;
1422: fact->info.fill_ratio_given = info->fill;
1423: fact->info.fill_ratio_needed = 1.0;
1425: aij->row = NULL;
1426: aij->col = NULL;
1428: /* ====================================================================== */
1429: /* Copy A's i, j to fact and also allocate the value array of fact. */
1430: /* We'll do in-place factorization on fact */
1431: /* ====================================================================== */
1432: const int *Ai, *Aj;
1434: m = fact->rmap->n;
1435: nz = aij->nz;
1437: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1438: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1439: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1440: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1441: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1442: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1444: /* ====================================================================== */
1445: /* Create descriptors for M, L, U */
1446: /* ====================================================================== */
1447: hipsparseFillMode_t fillMode;
1448: hipsparseDiagType_t diagType;
1450: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1451: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1452: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1454: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1455: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1456: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1457: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1458: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1459: */
1460: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1461: diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1462: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1463: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1464: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1466: fillMode = HIPSPARSE_FILL_MODE_UPPER;
1467: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1468: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1469: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1470: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1472: /* ========================================================================= */
1473: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1474: /* ========================================================================= */
1475: PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1476: if (m)
1477: PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1478: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1480: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1481: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1483: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1484: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1486: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1487: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1489: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1490: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1492: /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1493: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1494: */
1495: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1496: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1497: fs->spsvBuffer_L = fs->factBuffer_M;
1498: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1499: } else {
1500: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1501: fs->spsvBuffer_U = fs->factBuffer_M;
1502: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1503: }
1505: /* ========================================================================== */
1506: /* Perform analysis of ilu0 on M, SpSv on L and U */
1507: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1508: /* ========================================================================== */
1509: int structural_zero;
1511: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1512: if (m)
1513: PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1514: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1515: if (PetscDefined(USE_DEBUG)) {
1516: /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1517: hipsparseStatus_t status;
1518: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1519: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1520: }
1522: /* Estimate FLOPs of the numeric factorization */
1523: {
1524: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1525: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1526: PetscLogDouble flops = 0.0;
1528: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1529: Ai = Aseq->i;
1530: Adiag = Aseq->diag;
1531: for (PetscInt i = 0; i < m; i++) {
1532: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1533: nzRow = Ai[i + 1] - Ai[i];
1534: nzLeft = Adiag[i] - Ai[i];
1535: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1536: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1537: */
1538: nzLeft = (nzRow - 1) / 2;
1539: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1540: }
1541: }
1542: fs->numericFactFlops = flops;
1543: }
1544: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1545: PetscFunctionReturn(PETSC_SUCCESS);
1546: }
/*
  MatSolve_SeqAIJHIPSPARSE_ICC0 - solve fact x = b with the IC(0) (incomplete Cholesky) factor.

  The factor stores only L (fact = L*L^T): solve L y = b into the device work array fs->Y,
  then L^T x = y. Used for both MatSolve and MatSolveTranspose since the factor is symmetric.

  Input Parameters:
+ fact - the IC(0)-factored matrix; fact->spptr holds the Mat_SeqAIJHIPSPARSETriFactors
- b    - the right-hand side vector

  Output Parameter:
. x    - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); /* const cast: X is only read by the L-solve */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Solve Lt*x = y (rebind descriptor X to the output array) */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* two flops per stored nonzero, minus one per row (triangular-solve estimate) */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0 - numeric phase of the zero-fill incomplete
  Cholesky (IC(0)) factorization.

  Copies A's values into the factor's device CSR array, factors in place with
  hipsparseXcsric02() (which reads only the lower triangular part), then runs the numeric
  SpSV analysis for both the L and L^T solves. Installs MatSolve_SeqAIJHIPSPARSE_ICC0 for
  both solve and solvetranspose since the factorization is symmetric.

  Input Parameters:
+ A    - the matrix to factor (must be MATSEQAIJHIPSPARSE with the same pattern as in the symbolic phase)
- info - factorization options (unused here)

  Output Parameter:
. fact - the factor, holding L in fs->csrVal
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix                     *Acsr;
  PetscInt                       m, nz;
  PetscBool                      flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* Factorize fact inplace */
  /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int               numerical_zero;
    hipsparseStatus_t status;
    status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (reads factor values), so it must follow hipsparseXcsric02() */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
   ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
   */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU; /* factor values live only on the device */
  fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0; /* symmetric factor: same routine serves the transpose solve */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1631: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1632: {
1633: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1634: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1635: PetscInt m, nz;
1637: PetscFunctionBegin;
1638: if (PetscDefined(USE_DEBUG)) {
1639: PetscInt i;
1640: PetscBool flg, missing;
1642: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1643: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1644: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1645: PetscCall(MatMissingDiagonal(A, &missing, &i));
1646: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1647: }
1649: /* Free the old stale stuff */
1650: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1652: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1653: but they will not be used. Allocate them just for easy debugging.
1654: */
1655: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1657: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1658: fact->factortype = MAT_FACTOR_ICC;
1659: fact->info.factor_mallocs = 0;
1660: fact->info.fill_ratio_given = info->fill;
1661: fact->info.fill_ratio_needed = 1.0;
1663: aij->row = NULL;
1664: aij->col = NULL;
1666: /* ====================================================================== */
1667: /* Copy A's i, j to fact and also allocate the value array of fact. */
1668: /* We'll do in-place factorization on fact */
1669: /* ====================================================================== */
1670: const int *Ai, *Aj;
1672: m = fact->rmap->n;
1673: nz = aij->nz;
1675: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1676: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1677: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1678: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1679: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1680: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1682: /* ====================================================================== */
1683: /* Create mat descriptors for M, L */
1684: /* ====================================================================== */
1685: hipsparseFillMode_t fillMode;
1686: hipsparseDiagType_t diagType;
1688: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1689: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1690: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1692: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1693: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1694: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1695: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1696: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1697: */
1698: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1699: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1700: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1701: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1702: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1704: /* ========================================================================= */
1705: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1706: /* ========================================================================= */
1707: PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1708: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1710: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1711: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1713: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1714: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1716: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1717: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1719: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1720: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1722: /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1723: See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1724: */
1725: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1726: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1727: fs->spsvBuffer_L = fs->factBuffer_M;
1728: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1729: } else {
1730: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1731: fs->spsvBuffer_Lt = fs->factBuffer_M;
1732: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1733: }
1735: /* ========================================================================== */
1736: /* Perform analysis of ic0 on M */
1737: /* The lower triangular part of M has the same sparsity pattern as L */
1738: /* ========================================================================== */
1739: int structural_zero;
1741: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1742: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1743: if (PetscDefined(USE_DEBUG)) {
1744: hipsparseStatus_t status;
1745: /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1746: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1747: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1748: }
1750: /* Estimate FLOPs of the numeric factorization */
1751: {
1752: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1753: PetscInt *Ai, nzRow, nzLeft;
1754: PetscLogDouble flops = 0.0;
1756: Ai = Aseq->i;
1757: for (PetscInt i = 0; i < m; i++) {
1758: nzRow = Ai[i + 1] - Ai[i];
1759: if (nzRow > 1) {
1760: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1761: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1762: */
1763: nzLeft = (nzRow - 1) / 2;
1764: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1765: }
1766: }
1767: fs->numericFactFlops = flops;
1768: }
1769: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1770: PetscFunctionReturn(PETSC_SUCCESS);
1771: }
1772: #endif
1774: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1775: {
1776: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1778: PetscFunctionBegin;
1779: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1780: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1781: if (hipsparseTriFactors->factorizeOnDevice) {
1782: PetscCall(ISIdentity(isrow, &row_identity));
1783: PetscCall(ISIdentity(iscol, &col_identity));
1784: }
1785: if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1786: else
1787: #endif
1788: {
1789: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1790: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1791: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1792: }
1793: PetscFunctionReturn(PETSC_SUCCESS);
1794: }
1796: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1797: {
1798: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1800: PetscFunctionBegin;
1801: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1802: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1803: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1804: PetscFunctionReturn(PETSC_SUCCESS);
1805: }
1807: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1808: {
1809: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1811: PetscFunctionBegin;
1812: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1813: PetscBool perm_identity = PETSC_FALSE;
1814: if (hipsparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1815: if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1816: else
1817: #endif
1818: {
1819: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1820: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1821: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1822: }
1823: PetscFunctionReturn(PETSC_SUCCESS);
1824: }
1826: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1827: {
1828: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1830: PetscFunctionBegin;
1831: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1832: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1833: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1834: PetscFunctionReturn(PETSC_SUCCESS);
1835: }
1837: PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1838: {
1839: PetscFunctionBegin;
1840: *type = MATSOLVERHIPSPARSE;
1841: PetscFunctionReturn(PETSC_SUCCESS);
1842: }
1844: /*MC
1845: MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
1846: on a single GPU of type, `MATSEQAIJHIPSPARSE`. Currently supported
   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1849: HipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1850: algorithms are not recommended. This class does NOT support direct solver operations.
1852: Level: beginner
1854: .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1855: M*/
/*
   Factory for MATSOLVERHIPSPARSE factor matrices of a MATSEQAIJHIPSPARSE matrix.

   Creates *B as an n x n MATSEQAIJHIPSPARSE matrix matching A, reads the
   -mat_factor_bind_factorization option to decide whether the numeric
   factorization should run on the "host" or the "device" (default), and installs
   the symbolic-factorization function pointers appropriate for the requested
   factor type (LU/ILU/ILUDT or Cholesky/ICC). Unsupported factor types raise
   PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char *prefix;
  char factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));

  /* Query where to run the factorization; accepted (case-insensitive) values
     are "host" and "device" -- anything else is an error */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJHIPSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  /* Matrices bound to the CPU get the plain SeqAIJ symbolic routines below */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
      (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Copy the matrix numerical values from the device back to the host CSR array a->a.

   Only values are transferred (a->nz scalars); the sparsity pattern on the host is
   not touched. For an unfactored matrix the values come from the assembled device
   CSR matrix; for a factored matrix (HIP >= 4.5 builds) they come from the
   in-place factor storage fs->csrVal. On success the offload mask becomes
   PETSC_OFFLOAD_BOTH. A no-op unless the GPU holds the only up-to-date copy.
*/
static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
  /* Only meaningful when A is a factored matrix; spptr then holds the tri-factor struct */
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
    }
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1940: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1941: {
1942: PetscFunctionBegin;
1943: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1944: *array = ((Mat_SeqAIJ *)A->data)->a;
1945: PetscFunctionReturn(PETSC_SUCCESS);
1946: }
1948: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1949: {
1950: PetscFunctionBegin;
1951: A->offloadmask = PETSC_OFFLOAD_CPU;
1952: *array = NULL;
1953: PetscFunctionReturn(PETSC_SUCCESS);
1954: }
1956: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1957: {
1958: PetscFunctionBegin;
1959: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1960: *array = ((Mat_SeqAIJ *)A->data)->a;
1961: PetscFunctionReturn(PETSC_SUCCESS);
1962: }
1964: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1965: {
1966: PetscFunctionBegin;
1967: *array = NULL;
1968: PetscFunctionReturn(PETSC_SUCCESS);
1969: }
1971: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1972: {
1973: PetscFunctionBegin;
1974: *array = ((Mat_SeqAIJ *)A->data)->a;
1975: PetscFunctionReturn(PETSC_SUCCESS);
1976: }
1978: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1979: {
1980: PetscFunctionBegin;
1981: A->offloadmask = PETSC_OFFLOAD_CPU;
1982: *array = NULL;
1983: PetscFunctionReturn(PETSC_SUCCESS);
1984: }
1986: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1987: {
1988: Mat_SeqAIJHIPSPARSE *cusp;
1989: CsrMatrix *matrix;
1991: PetscFunctionBegin;
1992: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1993: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1994: cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1995: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1996: matrix = (CsrMatrix *)cusp->mat->mat;
1998: if (i) {
1999: #if !defined(PETSC_USE_64BIT_INDICES)
2000: *i = matrix->row_offsets->data().get();
2001: #else
2002: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2003: #endif
2004: }
2005: if (j) {
2006: #if !defined(PETSC_USE_64BIT_INDICES)
2007: *j = matrix->column_indices->data().get();
2008: #else
2009: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2010: #endif
2011: }
2012: if (a) *a = matrix->values->data().get();
2013: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2014: PetscFunctionReturn(PETSC_SUCCESS);
2015: }
/*
   Copy (or rebuild) the device-side hipSPARSE representation of the matrix.

   Two paths are taken when the host holds the freshest data:
   - Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the
     storage format is CSR, only the numerical values are uploaded into the
     existing device arrays.
   - Rebuild path: otherwise all device structures are destroyed and rebuilt
     from the host CSR data (using the compressed-row representation when
     enabled), and converted to ELL/HYB storage if that format was requested.

   On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host has
   no values array (a->a == NULL), in which case only the structure was uploaded
   and the mask is left unchanged.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct = hipsparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscBool both = PETSC_TRUE; /* cleared when host values are missing, see above */
  PetscInt m = A->rmap->n, *ii, *ridx, tmp;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallHIP(WaitForHIP());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so a previously formed explicit transpose is stale (structure kept) */
      PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      /* tear down the old device matrix, transpose, and auxiliary vectors before rebuilding */
      PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
      PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete hipsparsestruct->workVector;
      delete hipsparsestruct->rowoffsets_gpu;
      hipsparsestruct->workVector = NULL;
      hipsparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row structure: compressed (only nonempty rows) or full */
        if (a->compressedrow.use) {
          m = a->compressedrow.nrows;
          ii = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m = A->rmap->n;
          ii = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* structure-only upload: take nnz from the row offsets and do not mark values valid */
          nnz = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create hipsparse matrix */
        hipsparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJHIPSPARSEMultStruct;
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1) used with HIPSPARSE_POINTER_MODE_DEVICE */
        PetscCallHIP(hipMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->values = new THRUSTARRAY(nnz);
          mat->row_offsets->assign(ii, ii + m + 1);
          mat->column_indices->assign(a->j, a->j + nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
          if (mat->num_rows) { /* hipsparse errors on empty matrices! */
            PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                                  HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
          }
        } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
          /* build a temporary device CSR matrix, convert it to HYB/ELL, then discard it */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->values = new THRUSTARRAY(nnz);
          mat->row_offsets->assign(ii, ii + m + 1);
          mat->column_indices->assign(a->j, a->j + nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          hipsparseHybMat_t hybMat;
          PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
          hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
          PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          hipsparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          hipsparsestruct->workVector = NULL;
          matstruct->cprowIndices = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        hipsparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
      }
      PetscCallHIP(WaitForHIP());
      PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      hipsparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2154: struct VecHIPPlusEquals {
2155: template <typename Tuple>
2156: __host__ __device__ void operator()(Tuple t)
2157: {
2158: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2159: }
2160: };
2162: struct VecHIPEquals {
2163: template <typename Tuple>
2164: __host__ __device__ void operator()(Tuple t)
2165: {
2166: thrust::get<1>(t) = thrust::get<0>(t);
2167: }
2168: };
2170: struct VecHIPEqualsReverse {
2171: template <typename Tuple>
2172: __host__ __device__ void operator()(Tuple t)
2173: {
2174: thrust::get<0>(t) = thrust::get<1>(t);
2175: }
2176: };
/* Per-product state attached to a Mat_Product when the sparse operand is
   MATSEQAIJHIPSPARSE; released by MatDestroy_MatMatHipsparse(). */
struct MatMatHipsparse {
  PetscBool cisdense; /* was C of host type MATSEQDENSE when the product was set up? if so, it is converted back after the numeric phase */
  PetscScalar *Bt;    /* device buffer (freed with hipFree) -- presumably an explicit transpose of B; confirm against the code that fills it */
  Mat X;              /* intermediate dense matrix created for MATPRODUCT_PtAP/RARt */
  PetscBool reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix *Bcsr;
  hipsparseSpMatDescr_t matSpBDescr;
  PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
  hipsparseDnMatDescr_t matBDescr; /* dense descriptor for B */
  hipsparseDnMatDescr_t matCDescr; /* dense descriptor for C (or for X in the PtAP/RARt case) */
  PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  void *dBuffer4, *dBuffer5; /* extra device work buffers (HIP >= 5.1 builds only) */
#endif
  size_t mmBufferSize; /* current capacity of mmBuffer; the buffer is only ever grown */
  void *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  hipsparseSpGEMMDescr_t spgemmDesc;
};
2198: static PetscErrorCode MatDestroy_MatMatHipsparse(void *data)
2199: {
2200: MatMatHipsparse *mmdata = (MatMatHipsparse *)data;
2202: PetscFunctionBegin;
2203: PetscCallHIP(hipFree(mmdata->Bt));
2204: delete mmdata->Bcsr;
2205: if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2206: if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2207: if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2208: if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2209: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2210: if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2211: if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2212: #endif
2213: if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2214: if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2215: PetscCall(MatDestroy(&mmdata->X));
2216: PetscCall(PetscFree(data));
2217: PetscFunctionReturn(PETSC_SUCCESS);
2218: }
/*
   Numeric phase for products of a MATSEQAIJHIPSPARSE matrix A with a dense
   matrix B (AB, AtB, ABt, PtAP, RARt).

   The sparse-times-dense part is computed on the device with hipsparseSpMM().
   For PtAP/RARt the SpMM result goes into the intermediate dense matrix
   mmdata->X and is then multiplied with B (transposed or not) by a dense-dense
   device kernel. Dense descriptors and the SpMM work buffer are cached in
   mmdata and rebuilt only when the leading dimensions of B or C change.
*/
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
{
  Mat_Product *product = C->product;
  Mat A, B;
  PetscInt m, n, blda, clda;
  PetscBool flg, biship;
  Mat_SeqAIJHIPSPARSE *cusp;
  hipsparseOperation_t opA;
  const PetscScalar *barray;
  PetscScalar *carray;
  MatMatHipsparse *mmdata;
  Mat_SeqAIJHIPSPARSEMultStruct *mat;
  CsrMatrix *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatHipsparse *)product->data;
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  /* choose the sparse operand (A or its explicit transpose), the SpMM operation
     on A, and the dimensions of the SpMM output */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = HIPSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the stored explicit transpose so SpMM can run in non-transpose mode */
      PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
  if (!biship) { PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B)); }
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* SpMM output goes into the intermediate X; the final dense-dense product follows below */
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
  hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }
    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }
    if (!mat->matDescr) {
      PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                            HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
    }
    PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
    /* the work buffer is only ever grown, never shrunk */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallHIP(hipFree(mmdata->mmBuffer));
      PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do hipsparseSpMM, which supports transpose on B */
  PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  /* convert back to host dense types if that is what the caller originally supplied */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2349: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2350: {
2351: Mat_Product *product = C->product;
2352: Mat A, B;
2353: PetscInt m, n;
2354: PetscBool cisdense, flg;
2355: MatMatHipsparse *mmdata;
2356: Mat_SeqAIJHIPSPARSE *cusp;
2358: PetscFunctionBegin;
2359: MatCheckProduct(C, 1);
2360: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2361: A = product->A;
2362: B = product->B;
2363: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2364: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2365: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2366: PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2367: switch (product->type) {
2368: case MATPRODUCT_AB:
2369: m = A->rmap->n;
2370: n = B->cmap->n;
2371: break;
2372: case MATPRODUCT_AtB:
2373: m = A->cmap->n;
2374: n = B->cmap->n;
2375: break;
2376: case MATPRODUCT_ABt:
2377: m = A->rmap->n;
2378: n = B->rmap->n;
2379: break;
2380: case MATPRODUCT_PtAP:
2381: m = B->cmap->n;
2382: n = B->cmap->n;
2383: break;
2384: case MATPRODUCT_RARt:
2385: m = B->rmap->n;
2386: n = B->rmap->n;
2387: break;
2388: default:
2389: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2390: }
2391: PetscCall(MatSetSizes(C, m, n, m, n));
2392: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2393: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2394: PetscCall(MatSetType(C, MATSEQDENSEHIP));
2396: /* product data */
2397: PetscCall(PetscNew(&mmdata));
2398: mmdata->cisdense = cisdense;
2399: /* for these products we need intermediate storage */
2400: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2401: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2402: PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2403: /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2404: if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2405: else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2406: }
2407: C->product->data = mmdata;
2408: C->product->destroy = MatDestroy_MatMatHipsparse;
2409: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2410: PetscFunctionReturn(PETSC_SUCCESS);
2411: }
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) for SEQAIJHIPSPARSE
   operands. The sparsity pattern of C was fixed by the symbolic phase; this
   routine (re)computes only the numerical values via hipSPARSE SpGEMM, then
   performs the host-side assembly bookkeeping that MatAssemblyEnd would do. */
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
{
  Mat_Product *product = C->product;
  Mat A, B;
  Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix *Acsr, *Bcsr, *Ccsr;
  PetscBool flg;
  MatProductType ptype;
  MatMatHipsparse *mmdata;
  hipsparseSpMatDescr_t BmatSpDescr;
  hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatHipsparse *)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* empty C: nothing to compute, but still run the assembly bookkeeping below */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

  /* if the symbolic phase exploited symmetry of A (or B) to avoid a transpose,
     replay the same substitution here so operands match the symbolic setup */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* select the mult structs; explicit transposes were formed during the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  /* SpGEMMreuse path: descriptors and buffers were prepared in the symbolic phase */
  PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
#else
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
  PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
#endif
#else
  /* pre-ROCm-5 legacy csrgemm interface */
  PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
                                          Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
                                          Ccsr->column_indices->data().get()));
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallHIP(WaitForHIP());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Symbolic phase of the sparse-sparse product C = op(A)*op(B) for SEQAIJHIPSPARSE
   operands. Computes the sparsity pattern of C on the GPU with hipSPARSE SpGEMM,
   allocates the device CSR structures, mirrors the pattern to the host Mat_SeqAIJ,
   and stashes all SpGEMM descriptors/buffers in MatMatHipsparse for reuse by the
   numeric phase. Handles compressed-row operands and three hipSPARSE API
   generations (SpGEMMreuse for ROCm >= 5.1, SpGEMM for 5.0, legacy csrgemm before). */
static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
{
  Mat_Product *product = C->product;
  Mat A, B;
  Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ *a, *b, *c;
  Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix *Acsr, *Bcsr, *Ccsr;
  PetscInt i, j, m, n, k;
  PetscBool flg;
  MatProductType ptype;
  MatMatHipsparse *mmdata;
  PetscLogDouble flops;
  PetscBool biscompressed, ciscompressed;
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  int64_t C_num_rows1, C_num_cols1, C_nnz1;
  hipsparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data = mmdata;
  C->product->destroy = MatDestroy_MatMatHipsparse;

  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");

  /* exploit symmetry of A (or B) to avoid forming an explicit transpose; record
     the decision so the numeric phase can replay it */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick result dimensions, operand mult structs, and detect compressed-row storage */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create hipsparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
  c = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
  Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
  Ccsr = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows = 0;
    c->compressedrow.i = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector = NULL;
    Cmat->cprowIndices = NULL;
  }
  Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows = Ccusp->nrows;
  Ccsr->num_cols = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
  PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
  PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with HIPSPARSE_POINTER_MODE_DEVICE */
  PetscCallHIP(hipMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
  PetscCallHIP(hipMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
  PetscCallHIP(hipMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
    BmatSpDescr = Bmat->matDescr;
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
    /* build a shallow CSR view of B with uncompressed row offsets; column
       indices and values are shared with the compressed representation */
    Bcsr = new CsrMatrix;
    Bcsr->num_rows = B->rmap->n;
    Bcsr->num_cols = cBcsr->num_cols;
    Bcsr->num_entries = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
    if (Bcsr->num_rows && Bcsr->num_cols) {
      PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
    }
    BmatSpDescr = mmdata->matSpBDescr;
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else flops = 0.; /* TODO */

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
  /* create the C descriptor with nnz = 0; the real arrays are attached below via hipsparseCsrSetPointers */
  PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
  PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  {
    /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
  */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
    PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));

    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
    PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
    PetscCallHIP(hipFree(dBuffer1));
    PetscCallHIP(hipFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));

    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
    PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
    PetscCallHIP(hipFree(dBuffer3));
    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
  PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
  /* ask bufferSize again bytes for external memory */
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
  /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
  PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
  PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
#endif
#else
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
  PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                                          Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */

  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
                                          Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
                                          Ccsr->column_indices->data().get()));
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR pattern into the host Mat_SeqAIJ so the matrix can be
     used by CPU code paths as well */
  c->singlemalloc = PETSC_FALSE;
  c->free_a = PETSC_TRUE;
  c->free_ij = PETSC_TRUE;
  PetscCall(PetscMalloc1(m + 1, &c->i));
  PetscCall(PetscMalloc1(c->nz, &c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* fill per-row length/statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt) !!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated = PETSC_TRUE;
  C->assembled = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2872: /* handles sparse or dense B */
/* Choose the productsymbolic implementation for mat = product(A,B[,C]):
   - dense B (and A on GPU): route to the AIJHIPSPARSE x DENSEHIP kernel;
   - all-sparse GPU operands: route to the HIPSPARSE SpGEMM kernel (AB/AtB/ABt)
     or the generic ABC composition (PtAP/RARt/ABC);
   - otherwise: fall back to the plain SeqAIJ (CPU) dispatch.
   Per-product command-line flags (e.g. -matmatmult_backend_cpu) can force the CPU path. */
2873: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2874: {
2875: Mat_Product *product = mat->product;
2876: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; /* Ciscusp only matters for ABC; default TRUE so non-ABC products ignore it */
2878: PetscFunctionBegin;
2879: MatCheckProduct(mat, 1);
2880: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2881: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2882: if (product->type == MATPRODUCT_ABC) { /* only ABC has a C operand to inspect */
2883: Ciscusp = PETSC_FALSE;
2884: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2885: }
2886: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2887: PetscBool usecpu = PETSC_FALSE;
/* each product type exposes two option names: the legacy API-user one (e.g. -matmatmult_backend_cpu)
   and the MatProduct one (-mat_product_algorithm_backend_cpu) */
2888: switch (product->type) {
2889: case MATPRODUCT_AB:
2890: if (product->api_user) {
2891: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2892: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2893: PetscOptionsEnd();
2894: } else {
2895: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2896: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2897: PetscOptionsEnd();
2898: }
2899: break;
2900: case MATPRODUCT_AtB:
2901: if (product->api_user) {
2902: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2903: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2904: PetscOptionsEnd();
2905: } else {
2906: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2907: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2908: PetscOptionsEnd();
2909: }
2910: break;
2911: case MATPRODUCT_PtAP:
2912: if (product->api_user) {
2913: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2914: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2915: PetscOptionsEnd();
2916: } else {
2917: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2918: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2919: PetscOptionsEnd();
2920: }
2921: break;
2922: case MATPRODUCT_RARt:
2923: if (product->api_user) {
2924: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2925: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2926: PetscOptionsEnd();
2927: } else {
2928: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2929: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2930: PetscOptionsEnd();
2931: }
2932: break;
2933: case MATPRODUCT_ABC:
2934: if (product->api_user) {
2935: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2936: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2937: PetscOptionsEnd();
2938: } else {
2939: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2940: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2941: PetscOptionsEnd();
2942: }
2943: break;
2944: default:
2945: break;
2946: }
2947: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* pretend operands are not on GPU so the fallback below is taken */
2948: }
2949: /* dispatch */
2950: if (isdense) {
2951: switch (product->type) {
2952: case MATPRODUCT_AB:
2953: case MATPRODUCT_AtB:
2954: case MATPRODUCT_ABt:
2955: case MATPRODUCT_PtAP:
2956: case MATPRODUCT_RARt:
2957: if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2958: else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2959: break;
2960: case MATPRODUCT_ABC:
2961: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2962: break;
2963: default:
2964: break;
2965: }
2966: } else if (Biscusp && Ciscusp) {
2967: switch (product->type) {
2968: case MATPRODUCT_AB:
2969: case MATPRODUCT_AtB:
2970: case MATPRODUCT_ABt:
2971: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2972: break;
2973: case MATPRODUCT_PtAP:
2974: case MATPRODUCT_RARt:
2975: case MATPRODUCT_ABC:
2976: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; /* composed from pairwise GPU products */
2977: break;
2978: default:
2979: break;
2980: }
2981: } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2982: PetscFunctionReturn(PETSC_SUCCESS);
2983: }
/* yy = A*xx on the GPU: thin wrapper over the shared kernel (no add, no transpose, no Hermitian) */
2985: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2986: {
2987: PetscFunctionBegin;
2988: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2989: PetscFunctionReturn(PETSC_SUCCESS);
2990: }
/* zz = A*xx + yy on the GPU: wrapper over the shared kernel (add, no transpose, no Hermitian) */
2992: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2993: {
2994: PetscFunctionBegin;
2995: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2996: PetscFunctionReturn(PETSC_SUCCESS);
2997: }
/* yy = A^H*xx on the GPU: wrapper over the shared kernel (trans=TRUE, herm=TRUE) */
2999: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3000: {
3001: PetscFunctionBegin;
3002: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3003: PetscFunctionReturn(PETSC_SUCCESS);
3004: }
/* zz = A^H*xx + yy on the GPU: wrapper over the shared kernel (trans=TRUE, herm=TRUE) */
3006: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3007: {
3008: PetscFunctionBegin;
3009: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3010: PetscFunctionReturn(PETSC_SUCCESS);
3011: }
/* yy = A^T*xx on the GPU: wrapper over the shared kernel (trans=TRUE, herm=FALSE) */
3013: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3014: {
3015: PetscFunctionBegin;
3016: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3017: PetscFunctionReturn(PETSC_SUCCESS);
3018: }
/* Device kernel: y[idx[i]] += x[i] for i in [0,n). Used to scatter-add the compressed
   work vector back into the full result vector (see MatMultAddKernel_SeqAIJHIPSPARSE).
   idx and x are read-only, hence const. The global index is computed in PetscInt
   (with the block product cast first) so it cannot overflow a 32-bit int when
   PetscInt is 64 bits and n exceeds INT_MAX. */
3020: __global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3021: {
3022: const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
3023: if (i < n) y[idx[i]] += x[i];
3024: }
3026: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared implementation behind MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add).
   yy may be NULL (pure multiply) or equal zz (in-place add). When the matrix uses compressed rows
   (zero rows dropped), a work vector holds the short product/operand and is scattered/gathered
   against the full-length vectors via cprowIndices. */
3027: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3028: {
3029: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3030: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3031: Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
3032: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3033: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
3034: PetscBool compressed;
3035: PetscInt nx, ny; /* SpMV operand lengths; set and consumed only on the MAT_HIPSPARSE_CSR paths below */
3037: PetscFunctionBegin;
3038: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3039: if (!a->nz) { /* empty matrix: result is just y (or zero) */
3040: if (yy) PetscCall(VecSeq_HIP::copy(yy, zz));
3041: else PetscCall(VecSeq_HIP::set(zz, 0));
3042: PetscFunctionReturn(PETSC_SUCCESS);
3043: }
3044: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3045: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3046: if (!trans) {
3047: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3048: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
3049: } else {
/* transpose path: either let hipSPARSE apply op(A)=A^T/A^H on the plain matrix, or use the
   explicitly stored transpose when the user requested one (A->form_explicit_transpose) */
3050: if (herm || !A->form_explicit_transpose) {
3051: opA = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3052: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3053: } else {
3054: if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3055: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3056: }
3057: }
3058: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3059: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3060: try {
3061: PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
3062: if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3063: else PetscCall(VecHIPGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3065: PetscCall(PetscLogGpuTimeBegin());
3066: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3067: /* z = A x + beta y.
3068: If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3069: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3070: */
3071: xptr = xarray;
3072: dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3073: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3074: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3075: allocated to accommodate different uses. So we get the length info directly from mat.
3076: */
3077: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3078: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3079: nx = mat->num_cols;
3080: ny = mat->num_rows;
3081: }
3082: } else {
3083: /* z = A^T x + beta y
3084: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3085: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3086: */
3087: xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3088: dptr = zarray;
3089: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3090: if (compressed) { /* Scatter x to work vector */
3091: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3092: thrust::for_each(
3093: #if PetscDefined(HAVE_THRUST_ASYNC)
3094: thrust::hip::par.on(PetscDefaultHipStream),
3095: #endif
3096: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3097: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3098: }
3099: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3100: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3101: nx = mat->num_rows;
3102: ny = mat->num_cols;
3103: }
3104: }
3105: /* csr_spmv does y = alpha op(A) x + beta y */
3106: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3107: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
3108: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
3109: if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3110: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
3111: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
3112: PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3113: &matstruct->hipSpMV[opA].spmvBufferSize));
3114: PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
3115: matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3116: } else {
3117: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3118: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
3119: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
3120: }
3121: PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3122: matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3123: #else
3124: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3125: PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3126: #endif
3127: } else { /* non-CSR (hybrid) storage */
3128: if (hipsparsestruct->nrows) {
3129: hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3130: PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3131: }
3132: }
3133: PetscCall(PetscLogGpuTimeEnd());
3135: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3136: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3137: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3138: PetscCall(VecSeq_HIP::copy(yy, zz)); /* zz = yy */
3139: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3140: PetscCall(VecSeq_HIP::axpy(zz, 1.0, yy)); /* zz += yy */
3141: }
3142: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3143: PetscCall(VecSeq_HIP::set(zz, 0));
3144: }
3146: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3147: if (compressed) {
3148: PetscCall(PetscLogGpuTimeBegin());
3149: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3150: and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
3151: prevent that. So I just add a ScatterAdd kernel.
3152: */
3153: #if 0
3154: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3155: thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3156: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3157: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3158: VecHIPPlusEquals());
3159: #else
3160: PetscInt n = matstruct->cprowIndices->size();
3161: hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3162: #endif
3163: PetscCall(PetscLogGpuTimeEnd());
3164: }
3165: } else { /* transpose path: product is already full length in zz */
3166: if (yy && yy != zz) PetscCall(VecSeq_HIP::axpy(zz, 1.0, yy)); /* zz += yy */
3167: }
3168: PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
3169: if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
3170: else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
3171: } catch (char *ex) {
3172: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3173: }
3174: if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3175: else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3176: PetscFunctionReturn(PETSC_SUCCESS);
3177: }
/* zz = A^T*xx + yy on the GPU: wrapper over the shared kernel (trans=TRUE, herm=FALSE) */
3179: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3180: {
3181: PetscFunctionBegin;
3182: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3183: PetscFunctionReturn(PETSC_SUCCESS);
3184: }
/* Finish assembly via the SeqAIJ implementation; if the nonzero pattern changed during assembly,
   the cached device-side matrix copy is stale, so free it (it will be rebuilt on demand). */
3186: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
3187: {
3188: PetscObjectState onnz = A->nonzerostate; /* remember state before CPU-side assembly */
3189: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3191: PetscFunctionBegin;
3192: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3193: if (onnz != A->nonzerostate && cusp->deviceMat) {
3194: PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
3195: PetscCallHIP(hipFree(cusp->deviceMat));
3196: cusp->deviceMat = NULL;
3197: }
3198: PetscFunctionReturn(PETSC_SUCCESS);
3199: }
3201: /*@
3202: MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
3203: This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3204: For good matrix assembly performance the user should preallocate the matrix storage by setting
3205: the parameter `nz` (or the array `nnz`).
3207: Collective
3209: Input Parameters:
3210: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3211: . m - number of rows
3212: . n - number of columns
3213: . nz - number of nonzeros per row (same for all rows)
3214: - nnz - array containing the number of nonzeros in the various rows
3215: (possibly different for each row) or `NULL`
3217: Output Parameter:
3218: . A - the matrix
3220: Level: intermediate
3222: Notes:
3223: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3224: `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
3225: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
3227: If `nnz` is given then `nz` is ignored
3229: The AIJ format (compressed row storage), is fully compatible with standard Fortran
3230: storage. That is, the stored row and column indices can begin at
3231: either one (as in Fortran) or zero.
3233: Specify the preallocated storage with either `nz` or `nnz` (not both).
3234: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3235: allocation.
3237: By default, this format uses inodes (identical nodes) when possible, to
3238: improve numerical efficiency of matrix-vector products and solves. We
3239: search for consecutive rows with the same nonzero structure, thereby
3240: reusing matrix information to achieve increased efficiency.
3242: .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3243: @*/
3244: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3245: {
3246: PetscFunctionBegin;
3247: PetscCall(MatCreate(comm, A));
3248: PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
3249: PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
3250: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); /* call the SeqAIJ backend directly; type is already set above */
3251: PetscFunctionReturn(PETSC_SUCCESS);
3252: }
/* Destroy the GPU-side data (plain matrix or triangular factors, depending on factortype),
   clear all composed function slots installed at convert/bind time, then destroy the base SeqAIJ. */
3254: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3255: {
3256: PetscFunctionBegin;
3257: if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy((Mat_SeqAIJHIPSPARSE **)&A->spptr));
3258: else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
3259: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3260: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
3261: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
3262: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3263: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3264: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3265: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3266: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3267: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3268: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
3269: PetscCall(MatDestroy_SeqAIJ(A));
3270: PetscFunctionReturn(PETSC_SUCCESS);
3271: }
/* Duplicate as plain SeqAIJ, then convert the copy in place back to SEQAIJHIPSPARSE */
3273: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3274: {
3275: PetscFunctionBegin;
3276: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3277: PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
3278: PetscFunctionReturn(PETSC_SUCCESS);
3279: }
/* Y = a*X + Y on the GPU. Three paths:
   - SAME_NONZERO_PATTERN: one hipBLAS axpy over the value arrays;
   - SUBSET_NONZERO_PATTERN: hipSPARSE csrgeam (sparse matrix addition) reusing Y's pattern;
   - otherwise: fall back to the CPU MatAXPY_SeqAIJ (after invalidating the cached transpose). */
3281: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3282: {
3283: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3284: Mat_SeqAIJHIPSPARSE *cy;
3285: Mat_SeqAIJHIPSPARSE *cx;
3286: PetscScalar *ay;
3287: const PetscScalar *ax;
3288: CsrMatrix *csry, *csrx;
3290: PetscFunctionBegin;
3291: cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3292: cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
3293: if (X->ops->axpy != Y->ops->axpy) { /* mixed CPU/GPU pair: do it on the CPU */
3294: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3295: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3296: PetscFunctionReturn(PETSC_SUCCESS);
3297: }
3298: /* if we are here, it means both matrices are bound to GPU */
3299: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
3300: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
3301: PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3302: PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3303: csry = (CsrMatrix *)cy->mat->mat;
3304: csrx = (CsrMatrix *)cx->mat->mat;
3305: /* see if we can turn this into a hipblas axpy */
3306: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3307: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3308: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3309: if (eq) str = SAME_NONZERO_PATTERN; /* patterns proved identical on device; upgrade to the fast path */
3310: }
3311: /* spgeam is buggy with one column */
3312: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3313: if (str == SUBSET_NONZERO_PATTERN) {
3314: PetscScalar b = 1.0; /* geam computes a*X + b*Y with b = 1 */
3315: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3316: size_t bufferSize;
3317: void *buffer;
3318: #endif
3320: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3321: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3322: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST)); /* a, b live on the host here */
3323: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3324: PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3325: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3326: PetscCallHIP(hipMalloc(&buffer, bufferSize));
3327: PetscCall(PetscLogGpuTimeBegin());
3328: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3329: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3330: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3331: PetscCall(PetscLogGpuTimeEnd());
3332: PetscCallHIP(hipFree(buffer));
3333: #else
3334: PetscCall(PetscLogGpuTimeBegin());
3335: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3336: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3337: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3338: PetscCall(PetscLogGpuTimeEnd());
3339: #endif
3340: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE)); /* restore PETSc's default pointer mode */
3341: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3342: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3343: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3344: } else if (str == SAME_NONZERO_PATTERN) {
3345: hipblasHandle_t hipblasv2handle;
3346: PetscBLASInt one = 1, bnz = 1;
3348: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3349: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3350: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3351: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3352: PetscCall(PetscLogGpuTimeBegin());
3353: PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one)); /* identical patterns: dense axpy over value arrays */
3354: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3355: PetscCall(PetscLogGpuTimeEnd());
3356: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3357: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3358: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3359: } else { /* DIFFERENT_NONZERO_PATTERN: pattern of Y may change, do it on the CPU */
3360: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3361: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3362: }
3363: PetscFunctionReturn(PETSC_SUCCESS);
3364: }
/* Y = a*Y: scale the nonzero value array on the GPU with a single hipBLAS scal call */
3366: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3367: {
3368: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3369: PetscScalar *ay;
3370: hipblasHandle_t hipblasv2handle;
3371: PetscBLASInt one = 1, bnz = 1;
3373: PetscFunctionBegin;
3374: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3375: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3376: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3377: PetscCall(PetscLogGpuTimeBegin());
3378: PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3379: PetscCall(PetscLogGpuFlops(bnz));
3380: PetscCall(PetscLogGpuTimeEnd());
3381: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3382: PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal is stale after scaling */
3383: PetscFunctionReturn(PETSC_SUCCESS);
3384: }
/* Zero all stored values: fill the device-side CSR value arrays (matrix and its cached
   transpose, if present) and the host value array, keeping the nonzero pattern intact.
   Offload mask ends up BOTH when device values were zeroed too, otherwise CPU. */
3386: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
3387: {
3388: PetscBool both = PETSC_FALSE; /* set when the device copy was zeroed as well */
3389: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3391: PetscFunctionBegin;
3392: if (A->factortype == MAT_FACTOR_NONE) {
3393: Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3394: if (spptr->mat) {
3395: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3396: if (matrix->values) {
3397: both = PETSC_TRUE;
3398: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3399: }
3400: }
3401: if (spptr->matTranspose) { /* keep the cached explicit transpose consistent */
3402: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3403: if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); }
3404: }
3405: }
3406: //PetscCall(MatZeroEntries_SeqAIJ(A));
3407: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); /* zero the host value array directly */
3408: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3409: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3410: else A->offloadmask = PETSC_OFFLOAD_CPU;
3411: PetscFunctionReturn(PETSC_SUCCESS);
3412: }
/* Switch the matrix between CPU (SeqAIJ) and GPU (HIPSPARSE) implementations.
   flg=TRUE: copy data back from the GPU, install the plain SeqAIJ ops and clear the
   GPU-only composed functions. flg=FALSE: install the HIPSPARSE ops and compose the
   GPU-specific functions (COO assembly, array access, product dispatch). */
3414: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3415: {
3416: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3418: PetscFunctionBegin;
3419: if (A->factortype != MAT_FACTOR_NONE) { /* factored matrices only record the flag */
3420: A->boundtocpu = flg;
3421: PetscFunctionReturn(PETSC_SUCCESS);
3422: }
3423: if (flg) {
3424: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
3426: A->ops->scale = MatScale_SeqAIJ;
3427: A->ops->axpy = MatAXPY_SeqAIJ;
3428: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3429: A->ops->mult = MatMult_SeqAIJ;
3430: A->ops->multadd = MatMultAdd_SeqAIJ;
3431: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3432: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3433: A->ops->multhermitiantranspose = NULL;
3434: A->ops->multhermitiantransposeadd = NULL;
3435: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3436: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); /* drop all GPU array-access hooks at once */
3437: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3438: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3439: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3440: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3441: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3442: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3443: } else {
3444: A->ops->scale = MatScale_SeqAIJHIPSPARSE;
3445: A->ops->axpy = MatAXPY_SeqAIJHIPSPARSE;
3446: A->ops->zeroentries = MatZeroEntries_SeqAIJHIPSPARSE;
3447: A->ops->mult = MatMult_SeqAIJHIPSPARSE;
3448: A->ops->multadd = MatMultAdd_SeqAIJHIPSPARSE;
3449: A->ops->multtranspose = MatMultTranspose_SeqAIJHIPSPARSE;
3450: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3451: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3452: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3453: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3454: a->ops->getarray = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3455: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3456: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3457: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3458: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3459: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3460: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3461: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
3462: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3463: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3464: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
3465: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
3466: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3467: }
3468: A->boundtocpu = flg;
3469: if (flg && a->inode.size) a->inode.use = PETSC_TRUE; /* inodes are a CPU-only optimization */
3470: else a->inode.use = PETSC_FALSE;
3472: PetscFunctionReturn(PETSC_SUCCESS);
3473: }
/* Convert a SeqAIJ matrix to SEQAIJHIPSPARSE. Allocates the hipSPARSE context (handle,
   storage format, SpMV/SpMM algorithm choices) on first conversion, switches the default
   vector type to VECHIP, installs the HIPSPARSE ops table, and composes type-specific
   functions. Supports MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX and MAT_INPLACE_MATRIX. */
3475: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3476: {
3477: Mat B;
3479: PetscFunctionBegin;
3480: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
3481: if (reuse == MAT_INITIAL_MATRIX) {
3482: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3483: } else if (reuse == MAT_REUSE_MATRIX) {
3484: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3485: }
3486: B = *newmat;
3487: PetscCall(PetscFree(B->defaultvectype));
3488: PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype)); /* GPU matrices pair with GPU vectors */
3489: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3490: if (B->factortype == MAT_FACTOR_NONE) {
3491: Mat_SeqAIJHIPSPARSE *spptr;
3492: PetscCall(PetscNew(&spptr));
3493: PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3494: PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3495: spptr->format = MAT_HIPSPARSE_CSR;
3496: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3497: spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
3498: #else
3499: spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3500: #endif
3501: spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3502: //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;
3504: B->spptr = spptr;
3505: } else { /* factored matrix: only the handle/stream are needed for triangular solves */
3506: Mat_SeqAIJHIPSPARSETriFactors *spptr;
3508: PetscCall(PetscNew(&spptr));
3509: PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3510: PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3511: B->spptr = spptr;
3512: }
3513: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3514: }
3515: B->ops->assemblyend = MatAssemblyEnd_SeqAIJHIPSPARSE;
3516: B->ops->destroy = MatDestroy_SeqAIJHIPSPARSE;
3517: B->ops->setoption = MatSetOption_SeqAIJHIPSPARSE;
3518: B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
3519: B->ops->bindtocpu = MatBindToCPU_SeqAIJHIPSPARSE;
3520: B->ops->duplicate = MatDuplicate_SeqAIJHIPSPARSE;
3522: PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE)); /* installs the GPU ops table and composed functions */
3523: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
3524: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
3525: #if defined(PETSC_HAVE_HYPRE)
3526: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
3527: #endif
3528: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
3529: PetscFunctionReturn(PETSC_SUCCESS);
3530: }
/* Type constructor registered for MATSEQAIJHIPSPARSE: build a plain SeqAIJ matrix,
   then convert it in place to the HIPSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3540: /*
3541: MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
   A matrix type whose data resides on AMD GPUs. These matrices can be in either
3544: CSR, ELL, or Hybrid format.
3545: All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.
3547: Options Database Keys:
3548: + -mat_type aijhipsparse - sets the matrix type to "seqaijhipsparse" during a call to MatSetFromOptions()
3549: . -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3550: Other options include ell (ellpack) or hyb (hybrid).
3551: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
- -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
3554: Level: beginner
3556: .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3557: */
/* Register the HIPSPARSE-based direct solvers with PETSc's MatSolverType registry,
   e.g. selectable with -pc_factor_mat_solver_type hipsparse. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
{
  PetscFunctionBegin;
  /* the banded LU variant is registered for plain MATSEQAIJ */
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse_band));
  /* LU/Cholesky/ILU/ICC all share one factory routine for MATSEQAIJHIPSPARSE */
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));

  PetscFunctionReturn(PETSC_SUCCESS);
}
3570: static PetscErrorCode MatResetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat)
3571: {
3572: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)mat->spptr;
3574: PetscFunctionBegin;
3575: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3576: delete cusp->cooPerm;
3577: delete cusp->cooPerm_a;
3578: cusp->cooPerm = NULL;
3579: cusp->cooPerm_a = NULL;
3580: if (cusp->use_extended_coo) {
3581: PetscCallHIP(hipFree(cusp->jmap_d));
3582: PetscCallHIP(hipFree(cusp->perm_d));
3583: }
3584: cusp->use_extended_coo = PETSC_FALSE;
3585: PetscFunctionReturn(PETSC_SUCCESS);
3586: }
/* Free the entire Mat_SeqAIJHIPSPARSE device struct: both mult structs (A and A^T),
   cached thrust work arrays, COO bookkeeping, and the hipSPARSE handle.
   PetscFree() zeroes *hipsparsestruct at the end. */
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat_SeqAIJHIPSPARSE **hipsparsestruct)
{
  PetscFunctionBegin;
  if (*hipsparsestruct) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&(*hipsparsestruct)->mat, (*hipsparsestruct)->format));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&(*hipsparsestruct)->matTranspose, (*hipsparsestruct)->format));
    /* thrust vectors: delete of NULL is a no-op */
    delete (*hipsparsestruct)->workVector;
    delete (*hipsparsestruct)->rowoffsets_gpu;
    delete (*hipsparsestruct)->cooPerm;
    delete (*hipsparsestruct)->cooPerm_a;
    delete (*hipsparsestruct)->csr2csc_i;
    if ((*hipsparsestruct)->handle) PetscCallHIPSPARSE(hipsparseDestroy((*hipsparsestruct)->handle));
    /* raw device arrays from the extended COO path */
    if ((*hipsparsestruct)->jmap_d) PetscCallHIP(hipFree((*hipsparsestruct)->jmap_d));
    if ((*hipsparsestruct)->perm_d) PetscCallHIP(hipFree((*hipsparsestruct)->perm_d));
    PetscCall(PetscFree(*hipsparsestruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3607: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3608: {
3609: PetscFunctionBegin;
3610: if (*mat) {
3611: delete (*mat)->values;
3612: delete (*mat)->column_indices;
3613: delete (*mat)->row_offsets;
3614: delete *mat;
3615: *mat = 0;
3616: }
3617: PetscFunctionReturn(PETSC_SUCCESS);
3618: }
/* Free one triangular-factor struct: its hipSPARSE descriptors, CSR storage,
   solve/transpose scratch buffers, and pinned host copy of the values.
   PetscFree() zeroes *trifactor at the end. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h)); /* pinned host memory */
    if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Free one SpMV/SpMM mult struct. The storage pointed to by ->mat is either a
   hipsparseHybMat_t (ELL/HYB formats) or a CsrMatrix, selected by `format`.
   Also frees the device-resident scalar constants and any cached SpMV descriptors. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
        PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalars stored in device memory for hipSPARSE calls */
    if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));

    Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
    /* up to 3 cached SpMV setups (buffer + dense-vector descriptors each) */
    for (int i = 0; i < 3; i++) {
      if (mdata->hipSpMV[i].initialized) {
        PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
        PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
        PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
      }
    }
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Release all factorization state held by a TriFactors struct, but keep the struct
   (and its hipSPARSE handle) alive so it can be refilled by a new factorization.
   The newer (HIP >= 4.5) SpSV-based path stores raw device arrays and descriptors
   directly on the struct; hipFree(NULL) is a documented no-op, so unconditional
   frees of possibly-NULL members are safe. */
PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* legacy path: four triangular factor structs (L/U and their transposes) */
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector = NULL;
    /* band factorization arrays */
    if (fs->a_band_d) PetscCallHIP(hipFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallHIP(hipFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    /* SpSV-based path: CSR copy of the factor, X/Y work vectors, solve buffers */
    PetscCallHIP(hipFree(fs->csrRowPtr));
    PetscCallHIP(hipFree(fs->csrColIdx));
    PetscCallHIP(hipFree(fs->csrVal));
    PetscCallHIP(hipFree(fs->X));
    PetscCallHIP(hipFree(fs->Y));
    // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallHIP(hipFree(fs->spsvBuffer_L));
    PetscCallHIP(hipFree(fs->spsvBuffer_U));
    PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
    PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
    /* hipSPARSE descriptors for the factored matrix and the four SpSV solves */
    PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
    if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
    if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
    if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));

    /* NOTE(review): freed raw pointers above are not reset to NULL here; this assumes
       a subsequent factorization always re-populates them before the next Reset/use */
    fs->createdTransposeSpSVDescr = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3720: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3721: {
3722: hipsparseHandle_t handle;
3724: PetscFunctionBegin;
3725: if (*trifactors) {
3726: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3727: if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3728: PetscCall(PetscFree(*trifactors));
3729: }
3730: PetscFunctionReturn(PETSC_SUCCESS);
3731: }
3733: struct IJCompare {
3734: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3735: {
3736: if (t1.get<0>() < t2.get<0>()) return true;
3737: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3738: return false;
3739: }
3740: };
3742: struct IJEqual {
3743: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3744: {
3745: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3746: return true;
3747: }
3748: };
3750: struct IJDiff {
3751: __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3752: };
3754: struct IJSum {
3755: __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3756: };
/* Insert/add COO values v[] (in the user's original COO ordering) into the device CSR
   values, using the permutation cusp->cooPerm computed at preallocation time.
   cusp->cooPerm_a, when present, maps each sorted COO entry to its unique nonzero so
   repeated (i,j) entries can be summed. v may live on host or device; v == NULL with
   INSERT_VALUES zeroes the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix *matrix;
  PetscInt n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIPSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIPSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done: fall back to a normal assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  /* make v[] accessible from the device, copying it over if it lives on the host */
  if (isHipMem(v)) d_v = thrust::device_pointer_cast(v);
  else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up before adding */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecHIPPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecHIPEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy is now the authoritative one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3834: PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3835: {
3836: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3838: PetscFunctionBegin;
3839: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3840: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3841: if (destroy) {
3842: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3843: delete cusp->csr2csc_i;
3844: cusp->csr2csc_i = NULL;
3845: }
3846: A->transupdated = PETSC_FALSE;
3847: PetscFunctionReturn(PETSC_SUCCESS);
3848: }
/* Build the CSR sparsity pattern of A from n COO pairs (coo_i, coo_j), which may live
   on host or device, and cache on cusp:
     - cooPerm:   permutation from the user's COO order to sorted CSR order
     - cooPerm_a: for each sorted entry, the index of the unique nonzero it maps to
                  (deleted when all entries are unique)
   The host-side CSR arrays (a->i, a->j) are also (re)built, then copied to the GPU.
   NOTE(review): when coo_i/coo_j are already device arrays they are sorted/uniqued
   in place, i.e. the caller's arrays are modified -- confirm callers expect this. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscInt cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* entry count changed: previously cached permutations are unusable */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt *d_raw_i, *d_raw_j;
    PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType imtype, jmtype;

    /* stage coo_i on the device if it was given as a host array */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallHIP(hipMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallHIP(hipMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, hipMemcpyHostToDevice));
      d_i = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJHIPSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallHIP(hipMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallHIP(hipMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, hipMemcpyHostToDevice));
      d_j = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
    /* Ex.
       n = 6
       coo_i = [3,3,1,4,1,4]
       coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n); /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);
    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a hipMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host-side CSR arrays of the Mat_SeqAIJ from the device result */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a = PETSC_TRUE;
    a->free_ij = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallHIP(hipMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), hipMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallHIP(hipMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), hipMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    /* per-row lengths, max row length, and count of nonempty rows */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallHIP(hipFree(d_raw_i));
    if (free_raw_j) PetscCallHIP(hipFree(d_raw_j));
  } else PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
  /* We want to allocate the HIPSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* COO preallocation entry point. Chooses between two implementations:
   - "basic": i/j are on the device, or on the host with no negative indices
     (negative indices mean "ignore this entry" and need the extended machinery);
   - "extended": build the pattern on the CPU via MatSetPreallocationCOO_SeqAIJ,
     then mirror its jmap/perm arrays to the device for the COO kernel. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ *seq;
  Mat_SeqAIJHIPSPARSE *dev;
  PetscBool coo_basic = PETSC_TRUE;
  PetscMemType mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJHIPSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* host indices can be scanned cheaply for negative (to-be-ignored) entries */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJHIPSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);
    /* device mirrors of the CPU-computed COO maps used by MatAddCOOValues */
    PetscCallHIP(hipMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallHIP(hipMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
    PetscCallHIP(hipMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallHIP(hipMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4018: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4019: {
4020: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4021: const PetscCount grid_size = gridDim.x * blockDim.x;
4022: for (; i < nnz; i += grid_size) {
4023: PetscScalar sum = 0.0;
4024: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4025: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4026: }
4027: }
/* COO value-insertion entry point. The extended path (negative indices supported)
   launches the MatAddCOOValues kernel with the device-resident jmap/perm maps;
   otherwise it defers to the thrust-based Basic implementation. */
PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *dev = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  PetscCount Annz = seq->nz;
  PetscMemType memtype;
  const PetscScalar *v1 = v;
  PetscScalar *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallHIP(hipMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallHIP(hipMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), hipMemcpyHostToDevice));
    }

    /* write access suffices for INSERT (values fully overwritten); ADD must read old values */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));

    if (Annz) {
      /* one thread per nonzero, 256-thread blocks, grid-stride inside the kernel */
      hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallHIP(hipPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1)); /* free the staged device copy */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJHIPSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4064: /*@C
4065: MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
4067: Not Collective
4069: Input Parameters:
4070: + A - the matrix
4071: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4073: Output Parameters:
4074: + ia - the CSR row pointers
4075: - ja - the CSR column indices
4077: Level: developer
4079: Note:
4080: When compressed is true, the CSR structure does not contain empty rows
4082: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
4083: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  CsrMatrix *csr;

  PetscFunctionBegin;
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both outputs must be requested */
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* may trigger a host-to-device copy */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR is compressed: build (once) and cache the full row offsets */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4112: /*@C
4113: MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
4115: Not Collective
4117: Input Parameters:
4118: + A - the matrix
4119: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4120: . ia - the CSR row pointers
4121: - ja - the CSR column indices
4123: Level: developer
4125: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
4126: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* no device work required; just invalidate the caller's borrowed pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4137: /*@C
4138: MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4140: Not Collective
4142: Input Parameter:
4143: . A - a `MATSEQAIJHIPSPARSE` matrix
4145: Output Parameter:
4146: . a - pointer to the device data
4148: Level: developer
4150: Note:
4151: May trigger host-device copies if the up-to-date matrix data is on host
4153: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
4154: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* ensure the device values are current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  /* read-only: offload mask and object state are left untouched */
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4173: /*@C
4174: MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
4176: Not Collective
4178: Input Parameters:
4179: + A - a `MATSEQAIJHIPSPARSE` matrix
4180: - a - pointer to the device data
4182: Level: developer
4184: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
4185: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* read access: no state/offload changes needed, just drop the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4196: /*@C
4197: MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4199: Not Collective
4201: Input Parameter:
4202: . A - a `MATSEQAIJHIPSPARSE` matrix
4204: Output Parameter:
4205: . a - pointer to the device data
4207: Level: developer
4209: Note:
4210: May trigger host-device copies if up-to-date matrix data is on host
4212: .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
4213: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* read-write: device values must be current first */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  /* caller may modify the values: GPU copy becomes authoritative, transpose goes stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJHIPSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* the values may have changed: drop cached diagonal info and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJHIPSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* only the CSR storage format exposes one contiguous device value array */
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* unlike MatSeqAIJHIPSPARSEGetArray(), no CopyToGPU here: the caller promises to overwrite
     the data, so the current host values need not be transferred */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  /* after the write the device copy is the only valid one; cached transpose becomes stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJHIPSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* the values were (presumably) overwritten: drop cached diagonal info and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4320: struct IJCompare4 {
4321: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4322: {
4323: if (t1.get<0>() < t2.get<0>()) return true;
4324: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4325: return false;
4326: }
4327: };
4329: struct Shift {
4330: int _shift;
4332: Shift(int shift) : _shift(shift) { }
4333: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4334: };
/* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation,
   i.e. C = [A B]: C has the same row count as A and B, and A->cmap->n + B->cmap->n columns.
   With MAT_INITIAL_MATRIX the structure and values of C are built entirely on the GPU and the
   host CSR metadata is mirrored afterwards; with MAT_REUSE_MATRIX only the values are rescattered
   through the cached cooPerm permutation. */
PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscInt                       Annz, Bnnz;
  PetscInt                       i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  /* only plain CSR storage is supported */
  PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and hand-build its hipSPARSE multiply structure and CSR storage */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
    c     = (Mat_SeqAIJ *)(*C)->data;
    Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
    Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; /* C always stores full rows, never compressed-row form */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
    PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
    PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by the hipSPARSE multiply routines */
    PetscCallHIP(hipMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");

    Acsr = (CsrMatrix *)Acusp->mat->mat;
    Bcsr = (CsrMatrix *)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records where each entry of A (first Annz slots) and of B (remaining slots)
       lands inside C; it is what makes the MAT_REUSE_MATRIX path a pure value scatter */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into COO row indices */
      PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
      PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1); /* tag 1: entry originates from A */
      auto Bperm = thrust::make_constant_iterator(0); /* tag 0: entry originates from B */
      /* shift B's column indices by A's column count so they land in C's column space */
      auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz); /* cooPerm: first Annz slots for A's entries, the rest for B's */
      /* both inputs are (row, col)-sorted, so merging yields C sorted by (row, col);
         wPerm receives the origin tag of every merged entry */
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
#else
      /* split the positions 0..nz-1 by origin tag: tag 1 (A) goes to p1, tag 0 (B) to p2 */
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress C's COO row indices back to CSR row offsets */
      PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
      PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));

      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C^T = [A^T; B^T] stacked vertically, so its CSR arrays are A^T's followed by B^T's */
        PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
        PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
        CsrMatrix                     *CcsrT = new CsrMatrix;
        CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;
        CcsrT->row_offsets            = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices         = new THRUSTINTARRAY32(c->nz);
        CcsrT->values                 = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* step back so B^T's first (shifted) offset overwrites A^T's duplicated final offset */
        }
        if (BT) {
          /* B^T's offsets are shifted by a->nz since its entries come after A^T's in the value array */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIP(hipMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

        PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the final sparsity pattern on the host so C behaves like a regular SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
      PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
    } else {
      PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
      PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* derive per-row lengths and row statistics from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a)); /* host value array is allocated but the values live only on the GPU (see offloadmask below) */
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the sparsity pattern is unchanged; just rescatter the values via cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      /* sanity checks: sizes must still be mutually consistent before scattering */
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C: C->values[cooPerm[k]] = A->values[k] for the first Annz slots */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecHIPEquals());
      /* then scatter B's values through the remaining cooPerm slots */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecHIPEquals());
      PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
        PetscBool  AT    = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        /* C^T values are simply A^T values followed by B^T values (vertical stacking) */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* only the device copy of the values is valid */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4599: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4600: {
4601: bool dmem;
4602: const PetscScalar *av;
4604: PetscFunctionBegin;
4605: dmem = isHipMem(v);
4606: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4607: if (n && idx) {
4608: THRUSTINTARRAY widx(n);
4609: widx.assign(idx, idx + n);
4610: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4612: THRUSTARRAY *w = NULL;
4613: thrust::device_ptr<PetscScalar> dv;
4614: if (dmem) dv = thrust::device_pointer_cast(v);
4615: else {
4616: w = new THRUSTARRAY(n);
4617: dv = w->data();
4618: }
4619: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4621: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4622: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4623: thrust::for_each(zibit, zieit, VecHIPEquals());
4624: if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4625: delete w;
4626: } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
4628: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4629: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4630: PetscFunctionReturn(PETSC_SUCCESS);
4631: }