Actual source code: mpiaijcusp.cu
petsc-dev 2014-02-02
1: #include <petscconf.h>
2: PETSC_CUDA_EXTERN_C_BEGIN
3: #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
4: PETSC_CUDA_EXTERN_C_END
5: #include <mpicuspmatimpl.h>
9: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSP(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
10: {
11: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
12: Mat_MPIAIJCUSP * cuspStruct = (Mat_MPIAIJCUSP*)b->spptr;
14: PetscInt i;
17: PetscLayoutSetUp(B->rmap);
18: PetscLayoutSetUp(B->cmap);
19: if (d_nnz) {
20: for (i=0; i<B->rmap->n; i++) {
21: if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
22: }
23: }
24: if (o_nnz) {
25: for (i=0; i<B->rmap->n; i++) {
26: if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
27: }
28: }
29: if (!B->preallocated) {
30: /* Explicitly create 2 MATSEQAIJCUSP matrices. */
31: MatCreate(PETSC_COMM_SELF,&b->A);
32: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
33: MatSetType(b->A,MATSEQAIJCUSP);
34: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
35: MatCreate(PETSC_COMM_SELF,&b->B);
36: MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
37: MatSetType(b->B,MATSEQAIJCUSP);
38: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
39: }
40: MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
41: MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
42: MatCUSPSetFormat(b->A,MAT_CUSP_MULT,cuspStruct->diagGPUMatFormat);
43: MatCUSPSetFormat(b->B,MAT_CUSP_MULT,cuspStruct->offdiagGPUMatFormat);
44: MatCUSPSetStream(b->A,cuspStruct->stream);
45: MatCUSPSetStream(b->B,cuspStruct->stream);
46: B->preallocated = PETSC_TRUE;
47: return(0);
48: }
52: PetscErrorCode MatGetVecs_MPIAIJCUSP(Mat mat,Vec *right,Vec *left)
53: {
57: if (right) {
58: VecCreate(PetscObjectComm((PetscObject)mat),right);
59: VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
60: VecSetBlockSize(*right,mat->rmap->bs);
61: VecSetType(*right,VECCUSP);
62: VecSetLayout(*right,mat->cmap);
63: }
64: if (left) {
65: VecCreate(PetscObjectComm((PetscObject)mat),left);
66: VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
67: VecSetBlockSize(*left,mat->rmap->bs);
68: VecSetType(*left,VECCUSP);
69: VecSetLayout(*left,mat->rmap);
70: }
71: return(0);
72: }
77: PetscErrorCode MatMult_MPIAIJCUSP(Mat A,Vec xx,Vec yy)
78: {
79: /* This multiplication sequence is different sequence
80: than the CPU version. In particular, the diagonal block
81: multiplication kernel is launched in one stream. Then,
82: in a separate stream, the data transfers from DeviceToHost
83: (with MPI messaging in between), then HostToDevice are
84: launched. Once the data transfer stream is synchronized,
85: to ensure messaging is complete, the MatMultAdd kernel
86: is launched in the original (MatMult) stream to protect
87: against race conditions.
89: This sequence should only be called for GPU computation. */
90: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
92: PetscInt nt;
95: VecGetLocalSize(xx,&nt);
96: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
97: VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);
98: (*a->A->ops->mult)(a->A,xx,yy);
99: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
100: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
101: (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
102: VecScatterFinalizeForGPU(a->Mvctx);
103: return(0);
104: }
106: PetscErrorCode MatSetValuesBatch_MPIAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats);
110: PetscErrorCode MatCUSPSetFormat_MPIAIJCUSP(Mat A,MatCUSPFormatOperation op,MatCUSPStorageFormat format)
111: {
112: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
113: Mat_MPIAIJCUSP * cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
116: switch (op) {
117: case MAT_CUSP_MULT_DIAG:
118: cuspStruct->diagGPUMatFormat = format;
119: break;
120: case MAT_CUSP_MULT_OFFDIAG:
121: cuspStruct->offdiagGPUMatFormat = format;
122: break;
123: case MAT_CUSP_ALL:
124: cuspStruct->diagGPUMatFormat = format;
125: cuspStruct->offdiagGPUMatFormat = format;
126: break;
127: default:
128: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPFormatOperation. Only MAT_CUSP_MULT_DIAG, MAT_CUSP_MULT_DIAG, and MAT_CUSP_MULT_ALL are currently supported.",op);
129: }
130: return(0);
131: }
135: PetscErrorCode MatSetFromOptions_MPIAIJCUSP(Mat A)
136: {
137: MatCUSPStorageFormat format;
138: PetscErrorCode ierr;
139: PetscBool flg;
142: PetscOptionsHead("MPIAIJCUSP options");
143: PetscObjectOptionsBegin((PetscObject)A);
144: if (A->factortype==MAT_FACTOR_NONE) {
145: PetscOptionsEnum("-mat_cusp_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusp gpu matrices for SpMV",
146: "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)MAT_CUSP_CSR,(PetscEnum*)&format,&flg);
147: if (flg) {
148: MatCUSPSetFormat(A,MAT_CUSP_MULT_DIAG,format);
149: }
150: PetscOptionsEnum("-mat_cusp_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusp gpu matrices for SpMV",
151: "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)MAT_CUSP_CSR,(PetscEnum*)&format,&flg);
152: if (flg) {
153: MatCUSPSetFormat(A,MAT_CUSP_MULT_OFFDIAG,format);
154: }
155: PetscOptionsEnum("-mat_cusp_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusp gpu matrices for SpMV",
156: "MatCUSPSetFormat",MatCUSPStorageFormats,(PetscEnum)MAT_CUSP_CSR,(PetscEnum*)&format,&flg);
157: if (flg) {
158: MatCUSPSetFormat(A,MAT_CUSP_ALL,format);
159: }
160: }
161: PetscOptionsEnd();
162: return(0);
163: }
167: PetscErrorCode MatDestroy_MPIAIJCUSP(Mat A)
168: {
170: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
171: Mat_MPIAIJCUSP *cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
172: cudaError_t err=cudaSuccess;
175: try {
176: err = cudaStreamDestroy(cuspStruct->stream);
177: if (err!=cudaSuccess)
178: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", cudaGetErrorString(err));
179: delete cuspStruct;
180: } catch(char *ex) {
181: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", ex);
182: }
183: cuspStruct = 0;
184: MatDestroy_MPIAIJ(A);
185: return(0);
186: }
190: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSP(Mat A)
191: {
193: Mat_MPIAIJ *a;
194: Mat_MPIAIJCUSP * cuspStruct;
195: cudaError_t err=cudaSuccess;
198: MatCreate_MPIAIJ(A);
199: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSP);
200: A->ops->getvecs = MatGetVecs_MPIAIJCUSP;
201: A->ops->setvaluesbatch = MatSetValuesBatch_MPIAIJCUSP;
203: a = (Mat_MPIAIJ*)A->data;
204: a->spptr = new Mat_MPIAIJCUSP;
205: cuspStruct = (Mat_MPIAIJCUSP*)a->spptr;
207: cuspStruct->diagGPUMatFormat = MAT_CUSP_CSR;
208: cuspStruct->offdiagGPUMatFormat = MAT_CUSP_CSR;
209: err = cudaStreamCreate(&(cuspStruct->stream));
210: if (err!=cudaSuccess)
211: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSP error: %s", cudaGetErrorString(err));
213: A->ops->mult = MatMult_MPIAIJCUSP;
214: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSP;
215: A->ops->destroy = MatDestroy_MPIAIJCUSP;
217: PetscObjectComposeFunction((PetscObject)A,"MatCUSPSetFormat_C", MatCUSPSetFormat_MPIAIJCUSP);
218: PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSP);
219: return(0);
220: }
223: /*@
224: MatCreateAIJCUSP - Creates a sparse matrix in AIJ (compressed row) format
225: (the default parallel PETSc format). This matrix will ultimately pushed down
226: to NVidia GPUs and use the CUSP library for calculations. For good matrix
227: assembly performance the user should preallocate the matrix storage by setting
228: the parameter nz (or the array nnz). By setting these parameters accurately,
229: performance during matrix assembly can be increased by more than a factor of 50.
232: Collective on MPI_Comm
234: Input Parameters:
235: + comm - MPI communicator, set to PETSC_COMM_SELF
236: . m - number of rows
237: . n - number of columns
238: . nz - number of nonzeros per row (same for all rows)
239: - nnz - array containing the number of nonzeros in the various rows
240: (possibly different for each row) or NULL
242: Output Parameter:
243: . A - the matrix
245: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
246: MatXXXXSetPreallocation() paradigm instead of this routine directly.
247: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
249: Notes:
250: If nnz is given then nz is ignored
252: The AIJ format (also called the Yale sparse matrix format or
253: compressed row storage), is fully compatible with standard Fortran 77
254: storage. That is, the stored row and column indices can begin at
255: either one (as in Fortran) or zero. See the users' manual for details.
257: Specify the preallocated storage with either nz or nnz (not both).
258: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
259: allocation. For large problems you MUST preallocate memory or you
260: will get TERRIBLE performance, see the users' manual chapter on matrices.
262: By default, this format uses inodes (identical nodes) when possible, to
263: improve numerical efficiency of matrix-vector products and solves. We
264: search for consecutive rows with the same nonzero structure, thereby
265: reusing matrix information to achieve increased efficiency.
267: Level: intermediate
269: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSP, MATAIJCUSP
270: @*/
273: PetscErrorCode MatCreateAIJCUSP(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
274: {
276: PetscMPIInt size;
279: MatCreate(comm,A);
280: MatSetSizes(*A,m,n,M,N);
281: MPI_Comm_size(comm,&size);
282: if (size > 1) {
283: MatSetType(*A,MATMPIAIJCUSP);
284: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
285: } else {
286: MatSetType(*A,MATSEQAIJCUSP);
287: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
288: }
289: return(0);
290: }
292: /*M
293: MATAIJCUSP - MATMPIAIJCUSP= "aijcusp" = "mpiaijcusp" - A matrix type to be used for sparse matrices.
295: A matrix type type whose data resides on Nvidia GPUs. These matrices can be CSR format.
296: All matrix calculations are performed using the CUSP library. DIA and ELL
297: formats are also available
299: This matrix type is identical to MATSEQAIJCUSP when constructed with a single process communicator,
300: and MATMPIAIJCUSP otherwise. As a result, for single process communicators,
301: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
302: for communicators controlling multiple processes. It is recommended that you call both of
303: the above preallocation routines for simplicity.
305: Options Database Keys:
306: + -mat_type mpiaijcusp - sets the matrix type to "mpiaijcusp" during a call to MatSetFromOptions()
307: . -mat_cusp_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
308: . -mat_cusp_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
309: - -mat_cusp_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other storage formats include dia (diagonal) or ell (ellpack).
311: Level: beginner
313: .seealso: MatCreateAIJCUSP(), MATSEQAIJCUSP, MatCreateSeqAIJCUSP(), MatCUSPSetFormat(), MatCUSPStorageFormat, MatCUSPFormatOperation
314: M*/