Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3: #include <petscconf.h>
4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
5: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7: #include <thrust/advance.h>
8: #include <thrust/partition.h>
9: #include <thrust/sort.h>
10: #include <thrust/unique.h>
11: #include <petscsf.h>
13: struct VecCUDAEquals {
14: template <typename Tuple>
15: __host__ __device__ void operator()(Tuple t)
16: {
17: thrust::get<1>(t) = thrust::get<0>(t);
18: }
19: };
21: static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
22: {
23: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
24: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
26: PetscFunctionBegin;
27: if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
28: if (cusparseStruct->use_extended_coo) {
29: PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
30: PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
31: PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
32: PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
33: PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
34: PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
35: PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
36: PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
37: PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
38: PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
39: PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
40: PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
41: PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
42: }
43: cusparseStruct->use_extended_coo = PETSC_FALSE;
44: delete cusparseStruct->coo_p;
45: delete cusparseStruct->coo_pw;
46: cusparseStruct->coo_p = NULL;
47: cusparseStruct->coo_pw = NULL;
48: PetscFunctionReturn(PETSC_SUCCESS);
49: }
51: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
52: {
53: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
54: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
55: PetscInt n = cusp->coo_nd + cusp->coo_no;
57: PetscFunctionBegin;
58: if (cusp->coo_p && v) {
59: thrust::device_ptr<const PetscScalar> d_v;
60: THRUSTARRAY *w = NULL;
62: if (isCudaMem(v)) {
63: d_v = thrust::device_pointer_cast(v);
64: } else {
65: w = new THRUSTARRAY(n);
66: w->assign(v, v + n);
67: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
68: d_v = w->data();
69: }
71: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
72: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
73: PetscCall(PetscLogGpuTimeBegin());
74: thrust::for_each(zibit, zieit, VecCUDAEquals());
75: PetscCall(PetscLogGpuTimeEnd());
76: delete w;
77: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
78: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
79: } else {
80: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
81: PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
82: }
83: PetscFunctionReturn(PETSC_SUCCESS);
84: }
86: template <typename Tuple>
87: struct IsNotOffDiagT {
88: PetscInt _cstart, _cend;
90: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
91: __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
92: };
94: struct IsOffDiag {
95: PetscInt _cstart, _cend;
97: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
98: __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
99: };
101: struct GlobToLoc {
102: PetscInt _start;
104: GlobToLoc(PetscInt start) : _start(start) { }
105: __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
106: };
108: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
109: {
110: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
111: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
112: PetscInt N, *jj;
113: size_t noff = 0;
114: THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
115: THRUSTINTARRAY d_j(n);
116: ISLocalToGlobalMapping l2g;
118: PetscFunctionBegin;
119: PetscCall(MatDestroy(&b->A));
120: PetscCall(MatDestroy(&b->B));
122: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
123: d_i.assign(coo_i, coo_i + n);
124: d_j.assign(coo_j, coo_j + n);
125: PetscCall(PetscLogGpuTimeBegin());
126: auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
127: auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
128: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
129: cusp->coo_p = new THRUSTINTARRAY(n);
130: cusp->coo_pw = new THRUSTARRAY(n);
131: thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
132: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
133: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
134: auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
135: firstoffd = mzipp.get_iterator_tuple().get<1>();
136: }
137: cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
138: cusp->coo_no = thrust::distance(firstoffd, d_j.end());
140: /* from global to local */
141: thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
142: thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
143: PetscCall(PetscLogGpuTimeEnd());
145: /* copy offdiag column indices to map on the CPU */
146: PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
147: PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
148: auto o_j = d_j.begin();
149: PetscCall(PetscLogGpuTimeBegin());
150: thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
151: thrust::sort(thrust::device, o_j, d_j.end());
152: auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
153: PetscCall(PetscLogGpuTimeEnd());
154: noff = thrust::distance(o_j, wit);
155: /* allocate the garray, the columns of B will not be mapped in the multiply setup */
156: PetscCall(PetscMalloc1(noff, &b->garray));
157: PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
158: PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
159: PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
160: PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
161: PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
162: PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
163: PetscCall(ISLocalToGlobalMappingDestroy(&l2g));
165: PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
166: PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
167: PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
168: PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
169: PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
170: PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
172: /* GPU memory, cusparse specific call handles it internally */
173: PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
174: PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
175: PetscCall(PetscFree(jj));
177: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
178: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));
180: PetscCall(MatBindToCPU(b->A, B->boundtocpu));
181: PetscCall(MatBindToCPU(b->B, B->boundtocpu));
182: PetscCall(MatSetUpMultiply_MPIAIJ(B));
183: PetscFunctionReturn(PETSC_SUCCESS);
184: }
186: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
187: {
188: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
189: Mat_MPIAIJCUSPARSE *mpidev;
190: PetscBool coo_basic = PETSC_TRUE;
191: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
192: PetscInt rstart, rend;
194: PetscFunctionBegin;
195: PetscCall(PetscFree(mpiaij->garray));
196: PetscCall(VecDestroy(&mpiaij->lvec));
197: #if defined(PETSC_USE_CTABLE)
198: PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
199: #else
200: PetscCall(PetscFree(mpiaij->colmap));
201: #endif
202: PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
203: mat->assembled = PETSC_FALSE;
204: mat->was_assembled = PETSC_FALSE;
205: PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
206: PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
207: if (coo_i) {
208: PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
209: PetscCall(PetscGetMemType(coo_i, &mtype));
210: if (PetscMemTypeHost(mtype)) {
211: for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
212: if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
213: coo_basic = PETSC_FALSE;
214: break;
215: }
216: }
217: }
218: }
219: /* All ranks must agree on the value of coo_basic */
220: PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
221: if (coo_basic) {
222: PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
223: } else {
224: PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
225: mat->offloadmask = PETSC_OFFLOAD_CPU;
226: /* creates the GPU memory */
227: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
228: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
229: mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
230: mpidev->use_extended_coo = PETSC_TRUE;
232: PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
233: PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));
235: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
236: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));
238: PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
239: PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
240: PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));
242: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
243: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
244: PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));
246: PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
247: PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
248: PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));
250: PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
251: PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
253: PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
254: PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
256: PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
257: PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
258: PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
260: PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
261: PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
262: PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
264: PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
265: }
266: PetscFunctionReturn(PETSC_SUCCESS);
267: }
269: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
270: {
271: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
272: const PetscCount grid_size = gridDim.x * blockDim.x;
273: for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
274: }
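/* Note on the data layout used by the kernels below (derived from how they index the arrays):
   for the i-th nonzero of the diagonal block A, the range Ajmap1[i]..Ajmap1[i+1] delimits the
   entries of Aperm1[] that index into the local COO value array and get summed into Aa[i]
   (Bjmap1/Bperm1 play the same role for the off-diagonal block B). For values received from
   other ranks, Aimap2[i] gives the position in Aa[] into which the i-th remote contribution is
   accumulated, with Ajmap2/Aperm2 delimiting the contributing entries of the receive buffer. */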
276: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
277: {
278: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
279: const PetscCount grid_size = gridDim.x * blockDim.x;
280: for (; i < Annz + Bnnz; i += grid_size) {
281: PetscScalar sum = 0.0;
282: if (i < Annz) {
283: for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
284: Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
285: } else {
286: i -= Annz;
287: for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
288: Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
289: }
290: }
291: }
293: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
294: {
295: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
296: const PetscCount grid_size = gridDim.x * blockDim.x;
297: for (; i < Annz2 + Bnnz2; i += grid_size) {
298: if (i < Annz2) {
299: for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
300: } else {
301: i -= Annz2;
302: for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
303: }
304: }
305: }
307: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
308: {
309: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
310: Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
311: Mat A = mpiaij->A, B = mpiaij->B;
312: PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
313: PetscScalar *Aa, *Ba = NULL;
314: PetscScalar *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
315: const PetscScalar *v1 = v;
316: const PetscCount *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
317: const PetscCount *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
318: const PetscCount *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
319: const PetscCount *Cperm1 = mpidev->Cperm1_d;
320: PetscMemType memtype;
322: PetscFunctionBegin;
323: if (mpidev->use_extended_coo) {
324: PetscMPIInt size;
326: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
327: PetscCall(PetscGetMemType(v, &memtype));
328: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] in host memory, we need to copy it to the device */
329: PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
330: PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
331: }
333: if (imode == INSERT_VALUES) {
334: PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
335: PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
336: } else {
337: PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
338: PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
339: }
341: /* Pack entries to be sent to remote ranks */
342: if (mpiaij->sendlen) {
343: MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
344: PetscCallCUDA(cudaPeekAtLastError());
345: }
347: /* Send remote entries to their owner and overlap the communication with local computation */
348: PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
349: /* Add local entries to A and B */
350: if (Annz + Bnnz > 0) {
351: MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
352: PetscCallCUDA(cudaPeekAtLastError());
353: }
354: PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));
356: /* Add received remote entries to A and B */
357: if (Annz2 + Bnnz2 > 0) {
358: MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
359: PetscCallCUDA(cudaPeekAtLastError());
360: }
362: if (imode == INSERT_VALUES) {
363: PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
364: PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
365: } else {
366: PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
367: PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
368: }
369: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
370: } else {
371: PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
372: }
373: mat->offloadmask = PETSC_OFFLOAD_GPU;
374: PetscFunctionReturn(PETSC_SUCCESS);
375: }
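/* For reference, the two COO routines above are reached through the generic interface
   MatSetPreallocationCOO()/MatSetValuesCOO(). A minimal usage sketch (illustrative only,
   not taken from this file; A is assumed to have been created with MatCreate()/MatSetSizes()
   and error handling is trimmed):

     PetscInt    coo_i[] = {0, 0, 1};         // global row indices of the COO entries
     PetscInt    coo_j[] = {0, 1, 1};         // global column indices of the COO entries
     PetscScalar coo_v[] = {2.0, -1.0, 3.0};  // one value per (i,j) pair

     PetscCall(MatSetType(A, MATMPIAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // dispatches to MatSetPreallocationCOO_MPIAIJCUSPARSE()
     PetscCall(MatSetValuesCOO(A, coo_v, ADD_VALUES));      // dispatches to MatSetValuesCOO_MPIAIJCUSPARSE()
*/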
377: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
378: {
379: Mat Ad, Ao;
380: const PetscInt *cmap;
382: PetscFunctionBegin;
383: PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
384: PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
385: if (glob) {
386: PetscInt cst, i, dn, on, *gidx;
388: PetscCall(MatGetLocalSize(Ad, NULL, &dn));
389: PetscCall(MatGetLocalSize(Ao, NULL, &on));
390: PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
391: PetscCall(PetscMalloc1(dn + on, &gidx));
392: for (i = 0; i < dn; i++) gidx[i] = cst + i;
393: for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
394: PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
395: }
396: PetscFunctionReturn(PETSC_SUCCESS);
397: }
399: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
400: {
401: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
402: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
403: PetscInt i;
405: PetscFunctionBegin;
406: PetscCall(PetscLayoutSetUp(B->rmap));
407: PetscCall(PetscLayoutSetUp(B->cmap));
408: if (PetscDefined(USE_DEBUG) && d_nnz) {
409: for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
410: }
411: if (PetscDefined(USE_DEBUG) && o_nnz) {
412: for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
413: }
414: #if defined(PETSC_USE_CTABLE)
415: PetscCall(PetscHMapIDestroy(&b->colmap));
416: #else
417: PetscCall(PetscFree(b->colmap));
418: #endif
419: PetscCall(PetscFree(b->garray));
420: PetscCall(VecDestroy(&b->lvec));
421: PetscCall(VecScatterDestroy(&b->Mvctx));
422: /* Because b->B may have been resized we simply destroy it and create a new one each time */
423: PetscCall(MatDestroy(&b->B));
424: if (!b->A) {
425: PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
426: PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
427: }
428: if (!b->B) {
429: PetscMPIInt size;
430: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
431: PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
432: PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
433: }
434: PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
435: PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
436: PetscCall(MatBindToCPU(b->A, B->boundtocpu));
437: PetscCall(MatBindToCPU(b->B, B->boundtocpu));
438: PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
439: PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
440: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
441: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
442: B->preallocated = PETSC_TRUE;
443: PetscFunctionReturn(PETSC_SUCCESS);
444: }
446: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
447: {
448: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
450: PetscFunctionBegin;
451: PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
452: PetscCall((*a->A->ops->mult)(a->A, xx, yy));
453: PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
454: PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
455: PetscFunctionReturn(PETSC_SUCCESS);
456: }
458: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
459: {
460: Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
462: PetscFunctionBegin;
463: PetscCall(MatZeroEntries(l->A));
464: PetscCall(MatZeroEntries(l->B));
465: PetscFunctionReturn(PETSC_SUCCESS);
466: }
468: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
469: {
470: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
472: PetscFunctionBegin;
473: PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
474: PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
475: PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
476: PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
477: PetscFunctionReturn(PETSC_SUCCESS);
478: }
480: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
481: {
482: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
484: PetscFunctionBegin;
485: PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
486: PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
487: PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
488: PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
489: PetscFunctionReturn(PETSC_SUCCESS);
490: }
492: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
493: {
494: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
495: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
497: PetscFunctionBegin;
498: switch (op) {
499: case MAT_CUSPARSE_MULT_DIAG:
500: cusparseStruct->diagGPUMatFormat = format;
501: break;
502: case MAT_CUSPARSE_MULT_OFFDIAG:
503: cusparseStruct->offdiagGPUMatFormat = format;
504: break;
505: case MAT_CUSPARSE_ALL:
506: cusparseStruct->diagGPUMatFormat = format;
507: cusparseStruct->offdiagGPUMatFormat = format;
508: break;
509: default:
510: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
511: }
512: PetscFunctionReturn(PETSC_SUCCESS);
513: }
515: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
516: {
517: MatCUSPARSEStorageFormat format;
518: PetscBool flg;
519: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
520: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
522: PetscFunctionBegin;
523: PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
524: if (A->factortype == MAT_FACTOR_NONE) {
525: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
526: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
527: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
528: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
529: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
530: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
531: }
532: PetscOptionsHeadEnd();
533: PetscFunctionReturn(PETSC_SUCCESS);
534: }
536: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
537: {
538: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
539: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
540: PetscObjectState onnz = A->nonzerostate;
542: PetscFunctionBegin;
543: PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
544: if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
545: if (onnz != A->nonzerostate && cusp->deviceMat) {
546: PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
548: PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
549: PetscCall(PetscNew(&h_mat));
550: PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
551: PetscCallCUDA(cudaFree(h_mat->colmap));
552: if (h_mat->allocated_indices) {
553: PetscCallCUDA(cudaFree(h_mat->diag.i));
554: PetscCallCUDA(cudaFree(h_mat->diag.j));
555: if (h_mat->offdiag.j) {
556: PetscCallCUDA(cudaFree(h_mat->offdiag.i));
557: PetscCallCUDA(cudaFree(h_mat->offdiag.j));
558: }
559: }
560: PetscCallCUDA(cudaFree(d_mat));
561: PetscCall(PetscFree(h_mat));
562: cusp->deviceMat = NULL;
563: }
564: PetscFunctionReturn(PETSC_SUCCESS);
565: }
567: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
568: {
569: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
570: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
572: PetscFunctionBegin;
573: PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
574: if (cusparseStruct->deviceMat) {
575: PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
577: PetscCall(PetscInfo(A, "Have device matrix\n"));
578: PetscCall(PetscNew(&h_mat));
579: PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
580: PetscCallCUDA(cudaFree(h_mat->colmap));
581: if (h_mat->allocated_indices) {
582: PetscCallCUDA(cudaFree(h_mat->diag.i));
583: PetscCallCUDA(cudaFree(h_mat->diag.j));
584: if (h_mat->offdiag.j) {
585: PetscCallCUDA(cudaFree(h_mat->offdiag.i));
586: PetscCallCUDA(cudaFree(h_mat->offdiag.j));
587: }
588: }
589: PetscCallCUDA(cudaFree(d_mat));
590: PetscCall(PetscFree(h_mat));
591: }
592: /* Free COO */
593: PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
594: PetscCallCXX(delete cusparseStruct);
595: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
596: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
597: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
598: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
599: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
600: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
601: PetscCall(MatDestroy_MPIAIJ(A));
602: PetscFunctionReturn(PETSC_SUCCESS);
603: }
605: /* defines MatSetValues_MPICUSPARSE_Hash() */
606: #define TYPE AIJ
607: #define TYPE_AIJ
608: #define SUB_TYPE_CUSPARSE
609: #include "../src/mat/impls/aij/mpi/mpihashmat.h"
610: #undef TYPE
611: #undef TYPE_AIJ
612: #undef SUB_TYPE_CUSPARSE
614: static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
615: {
616: Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data;
617: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
619: PetscFunctionBegin;
620: PetscCall(MatSetUp_MPI_Hash(A));
621: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
622: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
623: A->preallocated = PETSC_TRUE;
624: PetscFunctionReturn(PETSC_SUCCESS);
625: }
627: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
628: {
629: Mat_MPIAIJ *a;
630: Mat A;
632: PetscFunctionBegin;
633: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
634: if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
635: else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
636: A = *newmat;
637: A->boundtocpu = PETSC_FALSE;
638: PetscCall(PetscFree(A->defaultvectype));
639: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
641: a = (Mat_MPIAIJ *)A->data;
642: if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
643: if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
644: if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));
646: if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
648: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
649: A->ops->mult = MatMult_MPIAIJCUSPARSE;
650: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
651: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
652: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
653: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
654: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
655: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
656: A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE;
658: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
659: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
660: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
661: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
662: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
663: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
664: #if defined(PETSC_HAVE_HYPRE)
665: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
666: #endif
667: PetscFunctionReturn(PETSC_SUCCESS);
668: }
670: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
671: {
672: PetscFunctionBegin;
673: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
674: PetscCall(MatCreate_MPIAIJ(A));
675: PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
676: PetscFunctionReturn(PETSC_SUCCESS);
677: }
679: /*@
680: MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
681: (the default parallel PETSc format). This matrix will ultimately be pushed down
682: to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
683: assembly performance the user should preallocate the matrix storage by setting
684: the parameters `d_nz` and `o_nz` (or the arrays `d_nnz` and `o_nnz`).
686: Collective
688: Input Parameters:
689: + comm - MPI communicator
690: . m - number of local rows (or `PETSC_DECIDE`)
691: . n - number of local columns (or `PETSC_DECIDE`)
692: . M, N - number of global rows and columns (or `PETSC_DETERMINE`)
693: . d_nz, o_nz - number of nonzeros per row in the diagonal and off-diagonal portions of the local submatrix (same for all local rows)
694: - d_nnz, o_nnz - arrays containing the number of nonzeros per row in the diagonal and off-diagonal portions (possibly different for each row), or `NULL`
696: Output Parameter:
697: . A - the matrix
699: Level: intermediate
701: Notes:
702: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
703: MatXXXXSetPreallocation() paradigm instead of this routine directly.
704: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
706: If `d_nnz` (`o_nnz`) is given then `d_nz` (`o_nz`) is ignored
708: The AIJ format (also called
709: compressed row storage) is fully compatible with standard Fortran
710: storage. That is, the stored row and column indices can begin at
711: either one (as in Fortran) or zero.
713: Specify the preallocated storage with either `d_nz`/`o_nz` or `d_nnz`/`o_nnz` (not both).
714: Set `d_nz` = `o_nz` = `PETSC_DEFAULT` and `d_nnz` = `o_nnz` = `NULL` for PETSc to control dynamic memory
715: allocation.
717: By default, this format uses inodes (identical nodes) when possible, to
718: improve numerical efficiency of matrix-vector products and solves. We
719: search for consecutive rows with the same nonzero structure, thereby
720: reusing matrix information to achieve increased efficiency.
722: .seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
723: @*/
724: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
725: {
726: PetscMPIInt size;
728: PetscFunctionBegin;
729: PetscCall(MatCreate(comm, A));
730: PetscCall(MatSetSizes(*A, m, n, M, N));
731: PetscCallMPI(MPI_Comm_size(comm, &size));
732: if (size > 1) {
733: PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
734: PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
735: } else {
736: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
737: PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
738: }
739: PetscFunctionReturn(PETSC_SUCCESS);
740: }
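/* A minimal sketch of the MatCreate()/MatSetType() path recommended in the notes above
   (illustrative only; the global sizes M, N and the preallocation counts are placeholders):

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, M, N));
     PetscCall(MatSetType(A, MATAIJCUSPARSE)); // becomes MATMPIAIJCUSPARSE on more than one rank, MATSEQAIJCUSPARSE otherwise
     PetscCall(MatSetFromOptions(A));
     PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL)); // only the call matching the actual type takes effect,
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));          // so it is safe to call both (see the man page below)
*/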
742: /*MC
743: MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJCUSPARSE`.
745: A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either
746: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
747: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
749: This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
750: and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
751: `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
752: for communicators controlling multiple processes. It is recommended that you call both of
753: the above preallocation routines for simplicity.
755: Options Database Keys:
756: + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
757: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
758: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
759: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
761: Level: beginner
763: .seealso: [](chapter_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
764: M*/
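/* For example, the storage format can be chosen either from the options database or
   programmatically (illustrative sketch; "ex1" and the ell choice are placeholders, and A
   is assumed to already be of type MATMPIAIJCUSPARSE):

     ./ex1 -mat_type mpiaijcusparse -mat_cusparse_storage_format ell

   or, equivalently in code,

     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, MAT_CUSPARSE_ELL));
*/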
766: /*MC
767: MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
769: Level: beginner
771: .seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
772: M*/
774: // Get GPU pointers to a stripped-down Mat; works for both seq and MPI Mat.
775: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
776: {
777: PetscSplitCSRDataStructure d_mat;
778: PetscMPIInt size;
779: int *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
780: PetscScalar *aa = NULL, *ba = NULL;
781: Mat_SeqAIJ *jaca = NULL, *jacb = NULL;
782: Mat_SeqAIJCUSPARSE *cusparsestructA = NULL;
783: CsrMatrix *matrixA = NULL, *matrixB = NULL;
785: PetscFunctionBegin;
786: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
787: if (A->factortype != MAT_FACTOR_NONE) {
788: *B = NULL;
789: PetscFunctionReturn(PETSC_SUCCESS);
790: }
791: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
792: // get jaca
793: if (size == 1) {
794: PetscBool isseqaij;
796: PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
797: if (isseqaij) {
798: jaca = (Mat_SeqAIJ *)A->data;
799: PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
800: cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
801: d_mat = cusparsestructA->deviceMat;
802: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
803: } else {
804: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
805: PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
806: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
807: jaca = (Mat_SeqAIJ *)aij->A->data;
808: cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
809: d_mat = spptr->deviceMat;
810: PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
811: }
812: if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
813: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
814: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
815: matrixA = (CsrMatrix *)matstruct->mat;
816: bi = NULL;
817: bj = NULL;
818: ba = NULL;
819: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
820: } else {
821: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
822: PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
823: jaca = (Mat_SeqAIJ *)aij->A->data;
824: jacb = (Mat_SeqAIJ *)aij->B->data;
825: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
827: PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
828: cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
829: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
830: PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
831: if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
832: PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
833: PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
834: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
835: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
836: PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
837: PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
838: matrixA = (CsrMatrix *)matstructA->mat;
839: matrixB = (CsrMatrix *)matstructB->mat;
840: if (jacb->compressedrow.use) {
841: if (!cusparsestructB->rowoffsets_gpu) {
842: cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
843: cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
844: }
845: bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
846: } else {
847: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
848: }
849: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
850: ba = thrust::raw_pointer_cast(matrixB->values->data());
851: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
852: d_mat = spptr->deviceMat;
853: }
854: if (jaca->compressedrow.use) {
855: if (!cusparsestructA->rowoffsets_gpu) {
856: cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
857: cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
858: }
859: ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
860: } else {
861: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
862: }
863: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
864: aa = thrust::raw_pointer_cast(matrixA->values->data());
866: if (!d_mat) {
867: PetscSplitCSRDataStructure h_mat;
869: // create and populate the struct on the host, then copy it to the device
870: PetscCall(PetscInfo(A, "Create device matrix\n"));
871: PetscCall(PetscNew(&h_mat));
872: PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
873: if (size > 1) { /* need the colmap array */
874: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
875: PetscInt *colmap;
876: PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N;
878: PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
880: PetscCall(PetscCalloc1(N + 1, &colmap));
881: for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
882: #if defined(PETSC_USE_64BIT_INDICES)
883: { // have to make 64-bit copies of these 32-bit device arrays
884: int *h_bi32, *h_bj32;
885: PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
886: PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
887: PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
888: for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
889: PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
890: for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];
892: PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
893: PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
894: PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
895: PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));
897: h_mat->offdiag.i = d_bi64;
898: h_mat->offdiag.j = d_bj64;
899: h_mat->allocated_indices = PETSC_TRUE;
901: PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
902: }
903: #else
904: h_mat->offdiag.i = (PetscInt *)bi;
905: h_mat->offdiag.j = (PetscInt *)bj;
906: h_mat->allocated_indices = PETSC_FALSE;
907: #endif
908: h_mat->offdiag.a = ba;
909: h_mat->offdiag.n = A->rmap->n;
911: PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
912: PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
913: PetscCall(PetscFree(colmap));
914: }
915: h_mat->rstart = A->rmap->rstart;
916: h_mat->rend = A->rmap->rend;
917: h_mat->cstart = A->cmap->rstart;
918: h_mat->cend = A->cmap->rend;
919: h_mat->M = A->cmap->N;
920: #if defined(PETSC_USE_64BIT_INDICES)
921: {
922: int *h_ai32, *h_aj32;
923: PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
925: static_assert(sizeof(PetscInt) == 8, "");
926: PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
927: PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
928: for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
929: PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
930: for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];
932: PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
933: PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
934: PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
935: PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));
937: h_mat->diag.i = d_ai64;
938: h_mat->diag.j = d_aj64;
939: h_mat->allocated_indices = PETSC_TRUE;
941: PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
942: }
943: #else
944: h_mat->diag.i = (PetscInt *)ai;
945: h_mat->diag.j = (PetscInt *)aj;
946: h_mat->allocated_indices = PETSC_FALSE;
947: #endif
948: h_mat->diag.a = aa;
949: h_mat->diag.n = A->rmap->n;
950: h_mat->rank = PetscGlobalRank;
951: // copy pointers and metadata to device
952: PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
953: PetscCall(PetscFree(h_mat));
954: } else {
955: PetscCall(PetscInfo(A, "Reusing device matrix\n"));
956: }
957: *B = d_mat;
958: A->offloadmask = PETSC_OFFLOAD_GPU;
959: PetscFunctionReturn(PETSC_SUCCESS);
960: }