Actual source code: mpimatmatmult.c
petsc-dev 2014-02-02
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
11: #include <petsc-private/vecimpl.h>
15: PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16: {
18: const char *algTypes[2] = {"scalable","nonscalable"};
19: PetscInt alg=0; /* set default algorithm */
22: if (scall == MAT_INITIAL_MATRIX) {
23: PetscObjectOptionsBegin((PetscObject)A);
24: PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[0],&alg,NULL);
25: PetscOptionsEnd();
27: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
28: switch (alg) {
29: case 1:
30: MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);
31: break;
32: default:
33: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
34: break;
35: }
36: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
37: }
38: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
39: (*(*C)->ops->matmultnumeric)(A,B,*C);
40: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
41: return(0);
42: }
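/*
   Usage sketch (not part of the original source): how the dispatch routine above is
   reached through the public MatMatMult() interface, assuming A and B are assembled
   MATMPIAIJ matrices with compatible layouts.  The symbolic algorithm can be chosen
   at runtime with -matmatmult_via scalable|nonscalable (default: scalable).
*/
#if 0
static PetscErrorCode ExampleMatMatMult(Mat A,Mat B)
{
  Mat            C;
  PetscErrorCode ierr;

  /* first call: symbolic + numeric product, C is created */
  ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
  /* ... update numerical values of A and/or B, keeping the same nonzero pattern ... */
  /* later calls: numeric phase only, reusing the symbolic data stored in C */
  ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  return 0;
}
#endif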
46: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
47: {
49: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
50: Mat_PtAPMPI *ptap = a->ptap;
53: PetscFree2(ptap->startsj_s,ptap->startsj_r);
54: PetscFree(ptap->bufa);
55: MatDestroy(&ptap->P_loc);
56: MatDestroy(&ptap->P_oth);
57: MatDestroy(&ptap->Pt);
58: PetscFree(ptap->api);
59: PetscFree(ptap->apj);
60: PetscFree(ptap->apa);
61: ptap->destroy(A);
62: PetscFree(ptap);
63: return(0);
64: }
68: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
69: {
71: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
72: Mat_PtAPMPI *ptap = a->ptap;
75: (*ptap->duplicate)(A,op,M);
77: (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
78: (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
79: return(0);
80: }
84: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
85: {
87: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
88: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
89: Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
90: PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj;
91: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
92: Mat_SeqAIJ *p_loc,*p_oth;
93: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
94: PetscScalar *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
95: PetscInt cm =C->rmap->n,anz,pnz;
96: Mat_PtAPMPI *ptap=c->ptap;
97: PetscInt *api,*apj,*apJ,i,j,k,row;
98: PetscInt cstart=C->cmap->rstart;
99: PetscInt cdnz,conz,k0,k1;
102: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
103: /*-----------------------------------------------------*/
104: /* update numerical values of P_oth and P_loc */
105: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
106: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
108: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
109: /*----------------------------------------------------------*/
110: /* get data from symbolic products */
111: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
112: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
113: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
114: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
116: /* get apa for storing dense row A[i,:]*P */
117: apa = ptap->apa;
119: api = ptap->api;
120: apj = ptap->apj;
121: for (i=0; i<cm; i++) {
122: /* diagonal portion of A */
123: anz = adi[i+1] - adi[i];
124: adj = ad->j + adi[i];
125: ada = ad->a + adi[i];
126: for (j=0; j<anz; j++) {
127: row = adj[j];
128: pnz = pi_loc[row+1] - pi_loc[row];
129: pj = pj_loc + pi_loc[row];
130: pa = pa_loc + pi_loc[row];
132: /* perform dense axpy */
133: valtmp = ada[j];
134: for (k=0; k<pnz; k++) {
135: apa[pj[k]] += valtmp*pa[k];
136: }
137: PetscLogFlops(2.0*pnz);
138: }
140: /* off-diagonal portion of A */
141: anz = aoi[i+1] - aoi[i];
142: aoj = ao->j + aoi[i];
143: aoa = ao->a + aoi[i];
144: for (j=0; j<anz; j++) {
145: row = aoj[j];
146: pnz = pi_oth[row+1] - pi_oth[row];
147: pj = pj_oth + pi_oth[row];
148: pa = pa_oth + pi_oth[row];
150: /* perform dense axpy */
151: valtmp = aoa[j];
152: for (k=0; k<pnz; k++) {
153: apa[pj[k]] += valtmp*pa[k];
154: }
155: PetscLogFlops(2.0*pnz);
156: }
158: /* set values in C */
159: apJ = apj + api[i];
160: cdnz = cd->i[i+1] - cd->i[i];
161: conz = co->i[i+1] - co->i[i];
163: /* 1st off-diagonal part of C */
164: ca = coa + co->i[i];
165: k = 0;
166: for (k0=0; k0<conz; k0++) {
167: if (apJ[k] >= cstart) break;
168: ca[k0] = apa[apJ[k]];
169: apa[apJ[k]] = 0.0;
170: k++;
171: }
173: /* diagonal part of C */
174: ca = cda + cd->i[i];
175: for (k1=0; k1<cdnz; k1++) {
176: ca[k1] = apa[apJ[k]];
177: apa[apJ[k]] = 0.0;
178: k++;
179: }
181: /* 2nd off-diagonal part of C */
182: ca = coa + co->i[i];
183: for (; k0<conz; k0++) {
184: ca[k0] = apa[apJ[k]];
185: apa[apJ[k]] = 0.0;
186: k++;
187: }
188: }
189: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
190: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
191: return(0);
192: }
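/*
   Illustration (not part of the original source): the numeric kernel above accumulates
   each row C[i,:] = A[i,:]*P into the dense work array apa of length N (global number
   of columns of P) and then gathers the entries back out in the column order apj fixed
   by the symbolic phase.  A stripped-down sketch of that scatter/gather pattern on
   plain CSR arrays:
*/
#if 0
static void DenseAxpyRow(PetscInt anz,const PetscInt *acol,const PetscScalar *aval,
                         const PetscInt *pi,const PetscInt *pj,const PetscScalar *pa,
                         PetscScalar *apa) /* dense work array, zeroed on entry */
{
  PetscInt j,k;
  for (j=0; j<anz; j++) {                  /* each nonzero A[i,acol[j]] */
    PetscInt    row = acol[j];
    PetscScalar v   = aval[j];
    for (k=pi[row]; k<pi[row+1]; k++) apa[pj[k]] += v*pa[k]; /* dense axpy with P[row,:] */
  }
  /* the caller then copies apa[apJ[...]] into C and zeroes those entries again */
}
#endif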
196: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C)
197: {
198: PetscErrorCode ierr;
199: MPI_Comm comm;
200: Mat Cmpi;
201: Mat_PtAPMPI *ptap;
202: PetscFreeSpaceList free_space=NULL,current_space=NULL;
203: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c;
204: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
205: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
206: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
207: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
208: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
209: PetscBT lnkbt;
210: PetscScalar *apa;
211: PetscReal afill;
212: PetscInt nlnk_max,armax,prmax;
215: PetscObjectGetComm((PetscObject)A,&comm);
216: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
217: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
218: }
219:
220: /* create struct Mat_PtAPMPI and attach it to C later */
221: PetscNew(&ptap);
223: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
224: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
226: /* get P_loc by taking all local rows of P */
227: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
229: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
230: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
231: pi_loc = p_loc->i; pj_loc = p_loc->j;
232: pi_oth = p_oth->i; pj_oth = p_oth->j;
234: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
235: /*-------------------------------------------------------------------*/
236: PetscMalloc1((am+2),&api);
237: ptap->api = api;
238: api[0] = 0;
240: /* create and initialize a linked list */
241: armax = ad->rmax+ao->rmax;
242: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
243: nlnk_max = armax*prmax;
244: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
245: PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);
247: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
248: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
250: current_space = free_space;
252: MatPreallocateInitialize(comm,am,pn,dnz,onz);
253: for (i=0; i<am; i++) {
254: /* diagonal portion of A */
255: nzi = adi[i+1] - adi[i];
256: for (j=0; j<nzi; j++) {
257: row = *adj++;
258: pnz = pi_loc[row+1] - pi_loc[row];
259: Jptr = pj_loc + pi_loc[row];
260: /* add non-zero cols of P into the sorted linked list lnk */
261: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
262: }
263: /* off-diagonal portion of A */
264: nzi = aoi[i+1] - aoi[i];
265: for (j=0; j<nzi; j++) {
266: row = *aoj++;
267: pnz = pi_oth[row+1] - pi_oth[row];
268: Jptr = pj_oth + pi_oth[row];
269: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
270: }
272: apnz = lnk[0];
273: api[i+1] = api[i] + apnz;
275: /* if free space is not available, double the total space in the list */
276: if (current_space->local_remaining<apnz) {
277: PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
278: nspacedouble++;
279: }
281: /* Copy data into free space, then initialize lnk */
282: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
283: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
285: current_space->array += apnz;
286: current_space->local_used += apnz;
287: current_space->local_remaining -= apnz;
288: }
290: /* Allocate space for apj, initialize apj, and */
291: /* destroy list of free space and other temporary array(s) */
292: PetscMalloc1((api[am]+1),&ptap->apj);
293: apj = ptap->apj;
294: PetscFreeSpaceContiguous(&free_space,ptap->apj);
295: PetscLLDestroy(lnk,lnkbt);
297: /* malloc apa to store dense row A[i,:]*P */
298: PetscCalloc1(pN,&apa);
300: ptap->apa = apa;
302: /* create and assemble symbolic parallel matrix Cmpi */
303: /*----------------------------------------------------*/
304: MatCreate(comm,&Cmpi);
305: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
306: MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);
308: MatSetType(Cmpi,MATMPIAIJ);
309: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
310: MatPreallocateFinalize(dnz,onz);
311: for (i=0; i<am; i++) {
312: row = i + rstart;
313: apnz = api[i+1] - api[i];
314: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
315: apj += apnz;
316: }
317: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
318: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
320: ptap->destroy = Cmpi->ops->destroy;
321: ptap->duplicate = Cmpi->ops->duplicate;
322: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
323: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
325: /* attach the supporting struct to Cmpi for reuse */
326: c = (Mat_MPIAIJ*)Cmpi->data;
327: c->ptap = ptap;
329: *C = Cmpi;
331: /* set MatInfo */
332: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
333: if (afill < 1.0) afill = 1.0;
334: Cmpi->info.mallocs = nspacedouble;
335: Cmpi->info.fill_ratio_given = fill;
336: Cmpi->info.fill_ratio_needed = afill;
338: #if defined(PETSC_USE_INFO)
339: if (api[am]) {
340: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
341: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
342: } else {
343: PetscInfo(Cmpi,"Empty matrix product\n");
344: }
345: #endif
346: return(0);
347: }
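/*
   Usage sketch (not part of the original source): the symbolic routine above records
   the achieved fill ratio in C's MatInfo, so a caller can check how accurate the
   supplied fill estimate was and tune it for subsequent products.  Assumes C was
   produced by MatMatMult() with MAT_INITIAL_MATRIX.
*/
#if 0
static PetscErrorCode ExampleReportFill(Mat C)
{
  MatInfo        info;
  PetscErrorCode ierr;

  ierr = MatGetInfo(C,MAT_LOCAL,&info);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF,"fill given %g, fill needed %g, mallocs %g\n",
                     info.fill_ratio_given,info.fill_ratio_needed,info.mallocs);CHKERRQ(ierr);
  return 0;
}
#endif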
351: PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
352: {
356: if (scall == MAT_INITIAL_MATRIX) {
357: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
358: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
359: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
360: }
361: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
362: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
363: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
364: return(0);
365: }
367: typedef struct {
368: Mat workB;
369: PetscScalar *rvalues,*svalues;
370: MPI_Request *rwaits,*swaits;
371: } MPIAIJ_MPIDense;
375: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
376: {
377: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
378: PetscErrorCode ierr;
381: MatDestroy(&contents->workB);
382: PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
383: PetscFree(contents);
384: return(0);
385: }
389: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
390: {
391: PetscErrorCode ierr;
392: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
393: PetscInt nz = aij->B->cmap->n;
394: PetscContainer container;
395: MPIAIJ_MPIDense *contents;
396: VecScatter ctx = aij->Mvctx;
397: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
398: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
399: PetscInt m = A->rmap->n,n=B->cmap->n;
402: MatCreate(PetscObjectComm((PetscObject)B),C);
403: MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
404: MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);
405: MatSetType(*C,MATMPIDENSE);
406: MatMPIDenseSetPreallocation(*C,NULL);
407: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
408: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
410: (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
412: PetscNew(&contents);
413: /* Create work matrix used to store the off-process rows of B needed for the local product */
414: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
415: /* Create work arrays needed */
416: PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues,
417: B->cmap->N*to->starts[to->n],&contents->svalues,
418: from->n,&contents->rwaits,
419: to->n,&contents->swaits);
421: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
422: PetscContainerSetPointer(container,contents);
423: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
424: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
425: PetscContainerDestroy(&container);
426: return(0);
427: }
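/*
   Illustration (not part of the original source): the symbolic routine above stashes its
   communication buffers on C with the standard PetscContainer pattern, so the numeric
   routine can retrieve them without global state.  The generic pattern, with a
   hypothetical payload struct MyCtx and key "myctx":
*/
#if 0
typedef struct { PetscScalar *buf; } MyCtx;

static PetscErrorCode MyCtxDestroy(void *ptr)
{
  MyCtx          *ctx = (MyCtx*)ptr;
  PetscErrorCode ierr;
  ierr = PetscFree(ctx->buf);CHKERRQ(ierr);
  ierr = PetscFree(ctx);CHKERRQ(ierr);
  return 0;
}

static PetscErrorCode AttachAndRetrieve(Mat C)
{
  PetscContainer container;
  MyCtx          *ctx,*back;
  PetscErrorCode ierr;

  ierr = PetscNew(&ctx);CHKERRQ(ierr);
  ierr = PetscMalloc1(16,&ctx->buf);CHKERRQ(ierr);
  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)C),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,ctx);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MyCtxDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)C,"myctx",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr); /* C keeps its own reference */

  ierr = PetscObjectQuery((PetscObject)C,"myctx",(PetscObject*)&container);CHKERRQ(ierr);
  ierr = PetscContainerGetPointer(container,(void**)&back);CHKERRQ(ierr);
  return 0;
}
#endif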
431: /*
432: Performs an efficient scatter on the rows of B needed by this process; this is
433: a modification of the VecScatterBegin_() routines.
434: */
435: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
436: {
437: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
438: PetscErrorCode ierr;
439: PetscScalar *b,*w,*svalues,*rvalues;
440: VecScatter ctx = aij->Mvctx;
441: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
442: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
443: PetscInt i,j,k;
444: PetscInt *sindices,*sstarts,*rindices,*rstarts;
445: PetscMPIInt *sprocs,*rprocs,nrecvs;
446: MPI_Request *swaits,*rwaits;
447: MPI_Comm comm;
448: PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
449: MPI_Status status;
450: MPIAIJ_MPIDense *contents;
451: PetscContainer container;
452: Mat workB;
455: PetscObjectGetComm((PetscObject)A,&comm);
456: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
457: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
458: PetscContainerGetPointer(container,(void**)&contents);
460: workB = *outworkB = contents->workB;
461: if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
462: sindices = to->indices;
463: sstarts = to->starts;
464: sprocs = to->procs;
465: swaits = contents->swaits;
466: svalues = contents->svalues;
468: rindices = from->indices;
469: rstarts = from->starts;
470: rprocs = from->procs;
471: rwaits = contents->rwaits;
472: rvalues = contents->rvalues;
474: MatDenseGetArray(B,&b);
475: MatDenseGetArray(workB,&w);
477: for (i=0; i<from->n; i++) {
478: MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
479: }
481: for (i=0; i<to->n; i++) {
482: /* pack a message at a time */
483: for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
484: for (k=0; k<ncols; k++) {
485: svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
486: }
487: }
488: MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
489: }
491: nrecvs = from->n;
492: while (nrecvs) {
493: MPI_Waitany(from->n,rwaits,&imdex,&status);
494: nrecvs--;
495: /* unpack a message at a time */
496: for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
497: for (k=0; k<ncols; k++) {
498: w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
499: }
500: }
501: }
502: if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}
504: MatDenseRestoreArray(B,&b);
505: MatDenseRestoreArray(workB,&w);
506: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
507: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
508: return(0);
509: }
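/*
   Illustration (not part of the original source): the pack/unpack loops above rely on
   PETSc dense matrices storing their local array in column-major order, so entry (r,k)
   of the local block lives at b[r + lda*k], with lda taken here as the number of local
   rows.  A minimal sketch of gathering selected rows into a contiguous send buffer:
*/
#if 0
static void GatherRowsColumnMajor(PetscInt nsel,const PetscInt *rows,PetscInt ncols,
                                  PetscInt lda,const PetscScalar *b,PetscScalar *packed)
{
  PetscInt j,k;
  for (j=0; j<nsel; j++) {
    for (k=0; k<ncols; k++) packed[ncols*j + k] = b[rows[j] + lda*k]; /* (row rows[j], col k) */
  }
}
#endif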
510: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
514: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
515: {
517: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
518: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
519: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
520: Mat workB;
523: /* diagonal block of A times all local rows of B */
524: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
526: /* get the off-process parts of B needed to complete the product */
527: MatMPIDenseScatter(A,B,C,&workB);
529: /* off-diagonal block of A times nonlocal rows of B */
530: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
531: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
532: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
533: return(0);
534: }
538: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
539: {
541: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
542: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
543: Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
544: PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj;
545: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
546: Mat_SeqAIJ *p_loc,*p_oth;
547: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
548: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
549: PetscInt cm = C->rmap->n,anz,pnz;
550: Mat_PtAPMPI *ptap = c->ptap;
551: PetscScalar *apa_sparse = ptap->apa;
552: PetscInt *api,*apj,*apJ,i,j,k,row;
553: PetscInt cstart = C->cmap->rstart;
554: PetscInt cdnz,conz,k0,k1,nextp;
557: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
558: /*-----------------------------------------------------*/
559: /* update numerical values of P_oth and P_loc */
560: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
561: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
563: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
564: /*----------------------------------------------------------*/
565: /* get data from symbolic products */
566: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
567: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
568: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
569: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
571: api = ptap->api;
572: apj = ptap->apj;
573: for (i=0; i<cm; i++) {
574: apJ = apj + api[i];
576: /* diagonal portion of A */
577: anz = adi[i+1] - adi[i];
578: adj = ad->j + adi[i];
579: ada = ad->a + adi[i];
580: for (j=0; j<anz; j++) {
581: row = adj[j];
582: pnz = pi_loc[row+1] - pi_loc[row];
583: pj = pj_loc + pi_loc[row];
584: pa = pa_loc + pi_loc[row];
585: /* perform sparse axpy */
586: valtmp = ada[j];
587: nextp = 0;
588: for (k=0; nextp<pnz; k++) {
589: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
590: apa_sparse[k] += valtmp*pa[nextp++];
591: }
592: }
593: PetscLogFlops(2.0*pnz);
594: }
596: /* off-diagonal portion of A */
597: anz = aoi[i+1] - aoi[i];
598: aoj = ao->j + aoi[i];
599: aoa = ao->a + aoi[i];
600: for (j=0; j<anz; j++) {
601: row = aoj[j];
602: pnz = pi_oth[row+1] - pi_oth[row];
603: pj = pj_oth + pi_oth[row];
604: pa = pa_oth + pi_oth[row];
605: /* perform sparse axpy */
606: valtmp = aoa[j];
607: nextp = 0;
608: for (k=0; nextp<pnz; k++) {
609: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
610: apa_sparse[k] += valtmp*pa[nextp++];
611: }
612: }
613: PetscLogFlops(2.0*pnz);
614: }
616: /* set values in C */
617: cdnz = cd->i[i+1] - cd->i[i];
618: conz = co->i[i+1] - co->i[i];
620: /* 1st off-diagonal part of C */
621: ca = coa + co->i[i];
622: k = 0;
623: for (k0=0; k0<conz; k0++) {
624: if (apJ[k] >= cstart) break;
625: ca[k0] = apa_sparse[k];
626: apa_sparse[k] = 0.0;
627: k++;
628: }
630: /* diagonal part of C */
631: ca = cda + cd->i[i];
632: for (k1=0; k1<cdnz; k1++) {
633: ca[k1] = apa_sparse[k];
634: apa_sparse[k] = 0.0;
635: k++;
636: }
638: /* 2nd off-diagonal part of C */
639: ca = coa + co->i[i];
640: for (; k0<conz; k0++) {
641: ca[k0] = apa_sparse[k];
642: apa_sparse[k] = 0.0;
643: k++;
644: }
645: }
646: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
647: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
648: return(0);
649: }
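/*
   Illustration (not part of the original source): in the scalable numeric kernel the work
   array apa_sparse holds only the nonzeros of the current row of C, stored in the sorted
   column order apJ fixed by the symbolic phase.  Adding a row of P into it is a merge over
   two sorted index lists, where pj is always a subset of apJ:
*/
#if 0
static void SparseAxpySorted(PetscScalar v,PetscInt pnz,const PetscInt *pj,const PetscScalar *pa,
                             const PetscInt *apJ,PetscScalar *apa_sparse)
{
  PetscInt k,nextp = 0;
  for (k=0; nextp<pnz; k++) {
    if (apJ[k] == pj[nextp]) apa_sparse[k] += v*pa[nextp++]; /* column match: accumulate */
  }
}
#endif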
651: /* Same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except it uses the PetscLLCondensed*_Scalable() routines to avoid the O(N) memory requirement, where N is the global number of columns of B */
654: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
655: {
656: PetscErrorCode ierr;
657: MPI_Comm comm;
658: Mat Cmpi;
659: Mat_PtAPMPI *ptap;
660: PetscFreeSpaceList free_space = NULL,current_space=NULL;
661: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c;
662: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
663: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
664: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
665: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
666: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
667: PetscInt nlnk_max,armax,prmax;
668: PetscReal afill;
669: PetscScalar *apa;
672: PetscObjectGetComm((PetscObject)A,&comm);
673: /* create struct Mat_PtAPMPI and attach it to C later */
674: PetscNew(&ptap);
676: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
677: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
679: /* get P_loc by taking all local rows of P */
680: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
682: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
683: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
684: pi_loc = p_loc->i; pj_loc = p_loc->j;
685: pi_oth = p_oth->i; pj_oth = p_oth->j;
687: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
688: /*-------------------------------------------------------------------*/
689: PetscMalloc1((am+2),&api);
690: ptap->api = api;
691: api[0] = 0;
693: /* create and initialize a linked list */
694: armax = ad->rmax+ao->rmax;
695: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
696: nlnk_max = armax*prmax;
697: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
698: PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);
700: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
701: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
703: current_space = free_space;
705: MatPreallocateInitialize(comm,am,pn,dnz,onz);
706: for (i=0; i<am; i++) {
707: /* diagonal portion of A */
708: nzi = adi[i+1] - adi[i];
709: for (j=0; j<nzi; j++) {
710: row = *adj++;
711: pnz = pi_loc[row+1] - pi_loc[row];
712: Jptr = pj_loc + pi_loc[row];
713: /* add non-zero cols of P into the sorted linked list lnk */
714: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
715: }
716: /* off-diagonal portion of A */
717: nzi = aoi[i+1] - aoi[i];
718: for (j=0; j<nzi; j++) {
719: row = *aoj++;
720: pnz = pi_oth[row+1] - pi_oth[row];
721: Jptr = pj_oth + pi_oth[row];
722: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
723: }
725: apnz = *lnk;
726: api[i+1] = api[i] + apnz;
727: if (apnz > apnz_max) apnz_max = apnz;
729: /* if free space is not available, double the total space in the list */
730: if (current_space->local_remaining<apnz) {
731: PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
732: nspacedouble++;
733: }
735: /* Copy data into free space, then initialize lnk */
736: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
737: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
739: current_space->array += apnz;
740: current_space->local_used += apnz;
741: current_space->local_remaining -= apnz;
742: }
744: /* Allocate space for apj, initialize apj, and */
745: /* destroy list of free space and other temporary array(s) */
746: PetscMalloc1((api[am]+1),&ptap->apj);
747: apj = ptap->apj;
748: PetscFreeSpaceContiguous(&free_space,ptap->apj);
749: PetscLLCondensedDestroy_Scalable(lnk);
751: /* create and assemble symbolic parallel matrix Cmpi */
752: /*----------------------------------------------------*/
753: MatCreate(comm,&Cmpi);
754: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
755: MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);
756: MatSetType(Cmpi,MATMPIAIJ);
757: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
758: MatPreallocateFinalize(dnz,onz);
760: /* malloc apa for assembling Cmpi */
761: PetscCalloc1(apnz_max,&apa);
763: ptap->apa = apa;
764: for (i=0; i<am; i++) {
765: row = i + rstart;
766: apnz = api[i+1] - api[i];
767: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
768: apj += apnz;
769: }
770: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
771: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
773: ptap->destroy = Cmpi->ops->destroy;
774: ptap->duplicate = Cmpi->ops->duplicate;
775: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
776: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
777: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
779: /* attach the supporting struct to Cmpi for reuse */
780: c = (Mat_MPIAIJ*)Cmpi->data;
781: c->ptap = ptap;
783: *C = Cmpi;
785: /* set MatInfo */
786: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
787: if (afill < 1.0) afill = 1.0;
788: Cmpi->info.mallocs = nspacedouble;
789: Cmpi->info.fill_ratio_given = fill;
790: Cmpi->info.fill_ratio_needed = afill;
792: #if defined(PETSC_USE_INFO)
793: if (api[am]) {
794: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
795: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
796: } else {
797: PetscInfo(Cmpi,"Empty matrix product\n");
798: }
799: #endif
800: return(0);
801: }
803: /*-------------------------------------------------------------------------*/
806: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
807: {
809: const char *algTypes[3] = {"scalable","nonscalable","matmatmult"};
810: PetscInt alg=0; /* set default algorithm */
813: if (scall == MAT_INITIAL_MATRIX) {
814: PetscObjectOptionsBegin((PetscObject)A);
815: PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);
816: PetscOptionsEnd();
818: PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);
819: switch (alg) {
820: case 1:
821: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);
822: break;
823: case 2:
824: {
825: Mat Pt;
826: Mat_PtAPMPI *ptap;
827: Mat_MPIAIJ *c;
828: MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
829: MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);
830: c = (Mat_MPIAIJ*)(*C)->data;
831: ptap = c->ptap;
832: ptap->Pt = Pt;
833: (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
834: return(0);
835: }
836: break;
837: default:
838: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
839: break;
840: }
841: PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);
842: }
843: PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);
844: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
845: PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);
846: return(0);
847: }
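/*
   Usage sketch (not part of the original source): C = P^T*A for MPIAIJ matrices is formed
   through the public MatTransposeMatMult() interface, which dispatches to the routine
   above.  The algorithm can be chosen at runtime with
   -mattransposematmult_via scalable|nonscalable|matmatmult (default: scalable).
*/
#if 0
static PetscErrorCode ExampleTransposeMatMult(Mat P,Mat A)
{
  Mat            C;
  PetscErrorCode ierr;

  ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
  /* ... update numerical values of P and/or A, keeping the same nonzero pattern ... */
  ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
  ierr = MatDestroy(&C);CHKERRQ(ierr);
  return 0;
}
#endif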
849: /* This routine only works when scall=MAT_REUSE_MATRIX! */
852: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C)
853: {
855: Mat_MPIAIJ *c=(Mat_MPIAIJ*)C->data;
856: Mat_PtAPMPI *ptap= c->ptap;
857: Mat Pt=ptap->Pt;
860: MatTranspose(P,MAT_REUSE_MATRIX,&Pt);
861: MatMatMultNumeric(Pt,A,C);
862: return(0);
863: }
865: /* Non-scalable version; uses a dense axpy */
868: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C)
869: {
870: PetscErrorCode ierr;
871: Mat_Merge_SeqsToMPI *merge;
872: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
873: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
874: Mat_PtAPMPI *ptap;
875: PetscInt *adj,*aJ;
876: PetscInt i,j,k,anz,pnz,row,*cj;
877: MatScalar *ada,*aval,*ca,valtmp;
878: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
879: MPI_Comm comm;
880: PetscMPIInt size,rank,taga,*len_s;
881: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
882: PetscInt **buf_ri,**buf_rj;
883: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
884: MPI_Request *s_waits,*r_waits;
885: MPI_Status *status;
886: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
887: PetscInt *ai,*aj,*coi,*coj;
888: PetscInt *poJ,*pdJ;
889: Mat A_loc;
890: Mat_SeqAIJ *a_loc;
893: PetscObjectGetComm((PetscObject)C,&comm);
894: MPI_Comm_size(comm,&size);
895: MPI_Comm_rank(comm,&rank);
897: ptap = c->ptap;
898: merge = ptap->merge;
900: /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
901: /*--------------------------------------------------------------*/
902: /* get data from symbolic products */
903: coi = merge->coi; coj = merge->coj;
904: PetscCalloc1((coi[pon]+1),&coa);
906: bi = merge->bi; bj = merge->bj;
907: owners = merge->rowmap->range;
908: PetscCalloc1((bi[cm]+1),&ba);
910: /* get A_loc by taking all local rows of A */
911: A_loc = ptap->A_loc;
912: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
913: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
914: ai = a_loc->i;
915: aj = a_loc->j;
917: PetscCalloc1((A->cmap->N),&aval); /* non-scalable!!! */
919: for (i=0; i<am; i++) {
920: /* 2-a) put A[i,:] to dense array aval */
921: anz = ai[i+1] - ai[i];
922: adj = aj + ai[i];
923: ada = a_loc->a + ai[i];
924: for (j=0; j<anz; j++) {
925: aval[adj[j]] = ada[j];
926: }
928: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
929: /*--------------------------------------------------------------*/
930: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
931: pnz = po->i[i+1] - po->i[i];
932: poJ = po->j + po->i[i];
933: pA = po->a + po->i[i];
934: for (j=0; j<pnz; j++) {
935: row = poJ[j];
936: cnz = coi[row+1] - coi[row];
937: cj = coj + coi[row];
938: ca = coa + coi[row];
939: /* perform dense axpy */
940: valtmp = pA[j];
941: for (k=0; k<cnz; k++) {
942: ca[k] += valtmp*aval[cj[k]];
943: }
944: PetscLogFlops(2.0*cnz);
945: }
947: /* put the value into Cd (diagonal part) */
948: pnz = pd->i[i+1] - pd->i[i];
949: pdJ = pd->j + pd->i[i];
950: pA = pd->a + pd->i[i];
951: for (j=0; j<pnz; j++) {
952: row = pdJ[j];
953: cnz = bi[row+1] - bi[row];
954: cj = bj + bi[row];
955: ca = ba + bi[row];
956: /* perform dense axpy */
957: valtmp = pA[j];
958: for (k=0; k<cnz; k++) {
959: ca[k] += valtmp*aval[cj[k]];
960: }
961: PetscLogFlops(2.0*cnz);
962: }
964: /* zero out the dense copy of A[i,:] before the next row */
965: aJ = aj + ai[i];
966: for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
967: }
969: /* 3) send and recv matrix values coa */
970: /*------------------------------------*/
971: buf_ri = merge->buf_ri;
972: buf_rj = merge->buf_rj;
973: len_s = merge->len_s;
974: PetscCommGetNewTag(comm,&taga);
975: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
977: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
978: for (proc=0,k=0; proc<size; proc++) {
979: if (!len_s[proc]) continue;
980: i = merge->owners_co[proc];
981: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
982: k++;
983: }
984: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
985: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
987: PetscFree2(s_waits,status);
988: PetscFree(r_waits);
989: PetscFree(coa);
991: /* 4) insert local Cseq and received values into Cmpi */
992: /*----------------------------------------------------*/
993: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
994: for (k=0; k<merge->nrecv; k++) {
995: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
996: nrows = *(buf_ri_k[k]);
997: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
998: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
999: }
1001: for (i=0; i<cm; i++) {
1002: row = owners[rank] + i; /* global row index of C_seq */
1003: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1004: ba_i = ba + bi[i];
1005: bnz = bi[i+1] - bi[i];
1006: /* add received vals into ba */
1007: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1008: /* i-th row */
1009: if (i == *nextrow[k]) {
1010: cnz = *(nextci[k]+1) - *nextci[k];
1011: cj = buf_rj[k] + *(nextci[k]);
1012: ca = abuf_r[k] + *(nextci[k]);
1013: nextcj = 0;
1014: for (j=0; nextcj<cnz; j++) {
1015: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1016: ba_i[j] += ca[nextcj++];
1017: }
1018: }
1019: nextrow[k]++; nextci[k]++;
1020: PetscLogFlops(2.0*cnz);
1021: }
1022: }
1023: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1024: }
1025: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1026: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1028: PetscFree(ba);
1029: PetscFree(abuf_r[0]);
1030: PetscFree(abuf_r);
1031: PetscFree3(buf_ri_k,nextrow,nextci);
1032: PetscFree(aval);
1033: return(0);
1034: }
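/*
   Illustration (not part of the original source): the numeric loop above forms C = P^T*A
   one row of A at a time as a sum of outer products: for every nonzero P[i,j], the dense
   copy aval of A[i,:] is scaled by P[i,j] and added into row j of C (diagonal or
   off-diagonal part).  A stripped-down sketch of that update on CSR arrays:
*/
#if 0
static void OuterProductRow(PetscInt pnz,const PetscInt *pcol,const PetscScalar *pval,
                            const PetscInt *ci,const PetscInt *cj,PetscScalar *ca,
                            const PetscScalar *aval) /* dense copy of A[i,:] */
{
  PetscInt j,k;
  for (j=0; j<pnz; j++) {                /* nonzeros of P[i,:] select target rows of C */
    PetscInt    row = pcol[j];
    PetscScalar v   = pval[j];
    for (k=ci[row]; k<ci[row+1]; k++) ca[k] += v*aval[cj[k]]; /* gather from dense A[i,:] */
  }
}
#endif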
1036: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1039: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C)
1040: {
1041: PetscErrorCode ierr;
1042: Mat Cmpi,A_loc,POt,PDt;
1043: Mat_PtAPMPI *ptap;
1044: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1045: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1046: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1047: PetscInt nnz;
1048: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1049: PetscInt am=A->rmap->n,pn=P->cmap->n;
1050: PetscBT lnkbt;
1051: MPI_Comm comm;
1052: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1053: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1054: PetscInt len,proc,*dnz,*onz,*owners;
1055: PetscInt nzi,*bi,*bj;
1056: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1057: MPI_Request *swaits,*rwaits;
1058: MPI_Status *sstatus,rstatus;
1059: Mat_Merge_SeqsToMPI *merge;
1060: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1061: PetscReal afill =1.0,afill_tmp;
1062: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1063: PetscScalar *vals;
1064: Mat_SeqAIJ *a_loc, *pdt,*pot;
1067: PetscObjectGetComm((PetscObject)A,&comm);
1068: /* check if matrix local sizes are compatible */
1069: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1070: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1071: }
1073: MPI_Comm_size(comm,&size);
1074: MPI_Comm_rank(comm,&rank);
1076: /* create struct Mat_PtAPMPI and attach it to C later */
1077: PetscNew(&ptap);
1079: /* get A_loc by taking all local rows of A */
1080: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1082: ptap->A_loc = A_loc;
1084: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1085: ai = a_loc->i;
1086: aj = a_loc->j;
1088: /* determine symbolic Co=(p->B)^T*A - send to others */
1089: /*----------------------------------------------------*/
1090: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1091: pdt = (Mat_SeqAIJ*)PDt->data;
1092: pdti = pdt->i; pdtj = pdt->j;
1094: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1095: pot = (Mat_SeqAIJ*)POt->data;
1096: poti = pot->i; potj = pot->j;
1098: /* then, compute symbolic Co = (p->B)^T*A */
1099: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1100: PetscMalloc1((pon+1),&coi);
1101: coi[0] = 0;
1103: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1104: nnz = fill*(poti[pon] + ai[am]);
1105: PetscFreeSpaceGet(nnz,&free_space);
1106: current_space = free_space;
1108: /* create and initialize a linked list */
1109: i = PetscMax(pdt->rmax,pot->rmax);
1110: Crmax = i*a_loc->rmax*size;
1111: if (!Crmax || Crmax > aN) Crmax = aN;
1112: PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);
1114: for (i=0; i<pon; i++) {
1115: pnz = poti[i+1] - poti[i];
1116: ptJ = potj + poti[i];
1117: for (j=0; j<pnz; j++) {
1118: row = ptJ[j]; /* row of A_loc == col of Pot */
1119: anz = ai[row+1] - ai[row];
1120: Jptr = aj + ai[row];
1121: /* add non-zero cols of A_loc into the sorted linked list lnk */
1122: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1123: }
1124: nnz = lnk[0];
1126: /* If free space is not available, double the total space in the list */
1127: if (current_space->local_remaining<nnz) {
1128: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1129: nspacedouble++;
1130: }
1132: /* Copy data into free space, then initialize lnk */
1133: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1135: current_space->array += nnz;
1136: current_space->local_used += nnz;
1137: current_space->local_remaining -= nnz;
1139: coi[i+1] = coi[i] + nnz;
1140: }
1142: PetscMalloc1((coi[pon]+1),&coj);
1143: PetscFreeSpaceContiguous(&free_space,coj);
1145: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1146: if (afill_tmp > afill) afill = afill_tmp;
1148: /* send j-array (coj) of Co to other processors */
1149: /*----------------------------------------------*/
1150: /* determine row ownership */
1151: PetscNew(&merge);
1152: PetscLayoutCreate(comm,&merge->rowmap);
1154: merge->rowmap->n = pn;
1155: merge->rowmap->bs = 1;
1157: PetscLayoutSetUp(merge->rowmap);
1158: owners = merge->rowmap->range;
1160: /* determine the number of messages to send, their lengths */
1161: PetscCalloc1(size,&len_si);
1162: PetscMalloc1(size,&merge->len_s);
1164: len_s = merge->len_s;
1165: merge->nsend = 0;
1167: PetscMalloc1((size+2),&owners_co);
1168: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1170: proc = 0;
1171: for (i=0; i<pon; i++) {
1172: while (prmap[i] >= owners[proc+1]) proc++;
1173: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1174: len_s[proc] += coi[i+1] - coi[i];
1175: }
1177: len = 0; /* max length of buf_si[] */
1178: owners_co[0] = 0;
1179: for (proc=0; proc<size; proc++) {
1180: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1181: if (len_si[proc]) {
1182: merge->nsend++;
1183: len_si[proc] = 2*(len_si[proc] + 1);
1184: len += len_si[proc];
1185: }
1186: }
1188: /* determine the number and length of messages to receive for coi and coj */
1189: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1190: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1192: /* post the Irecv and Isend of coj */
1193: PetscCommGetNewTag(comm,&tagj);
1194: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1195: PetscMalloc1((merge->nsend+1),&swaits);
1196: for (proc=0, k=0; proc<size; proc++) {
1197: if (!len_s[proc]) continue;
1198: i = owners_co[proc];
1199: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1200: k++;
1201: }
1203: /* receives and sends of coj are complete */
1204: PetscMalloc1(size,&sstatus);
1205: for (i=0; i<merge->nrecv; i++) {
1206: PetscMPIInt icompleted;
1207: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1208: }
1209: PetscFree(rwaits);
1210: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1212: /* send and recv coi */
1213: /*-------------------*/
1214: PetscCommGetNewTag(comm,&tagi);
1215: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1216: PetscMalloc1((len+1),&buf_s);
1217: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1218: for (proc=0,k=0; proc<size; proc++) {
1219: if (!len_s[proc]) continue;
1220: /* form outgoing message for i-structure:
1221: buf_si[0]: nrows to be sent
1222: [1:nrows]: row index (global)
1223: [nrows+1:2*nrows+1]: i-structure index
1224: */
1225: /*-------------------------------------------*/
1226: nrows = len_si[proc]/2 - 1;
1227: buf_si_i = buf_si + nrows+1;
1228: buf_si[0] = nrows;
1229: buf_si_i[0] = 0;
1230: nrows = 0;
1231: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1232: nzi = coi[i+1] - coi[i];
1233: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1234: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1235: nrows++;
1236: }
1237: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1238: k++;
1239: buf_si += len_si[proc];
1240: }
1241: i = merge->nrecv;
1242: while (i--) {
1243: PetscMPIInt icompleted;
1244: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1245: }
1246: PetscFree(rwaits);
1247: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1248: PetscFree(len_si);
1249: PetscFree(len_ri);
1250: PetscFree(swaits);
1251: PetscFree(sstatus);
1252: PetscFree(buf_s);
1254: /* compute the local portion of C (mpi mat) */
1255: /*------------------------------------------*/
1256: /* allocate bi array and free space for accumulating nonzero column info */
1257: PetscMalloc1((pn+1),&bi);
1258: bi[0] = 0;
1260: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1261: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1262: PetscFreeSpaceGet(nnz,&free_space);
1263: current_space = free_space;
1265: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1266: for (k=0; k<merge->nrecv; k++) {
1267: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1268: nrows = *buf_ri_k[k];
1269: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1270: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1271: }
1273: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1274: rmax = 0;
1275: for (i=0; i<pn; i++) {
1276: /* add pdt[i,:]*A into lnk */
1277: pnz = pdti[i+1] - pdti[i];
1278: ptJ = pdtj + pdti[i];
1279: for (j=0; j<pnz; j++) {
1280: row = ptJ[j]; /* row of A_loc == col of Pt */
1281: anz = ai[row+1] - ai[row];
1282: Jptr = aj + ai[row];
1283: /* add non-zero cols of A_loc into the sorted linked list lnk */
1284: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1285: }
1287: /* add received col data into lnk */
1288: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1289: if (i == *nextrow[k]) { /* i-th row */
1290: nzi = *(nextci[k]+1) - *nextci[k];
1291: Jptr = buf_rj[k] + *nextci[k];
1292: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1293: nextrow[k]++; nextci[k]++;
1294: }
1295: }
1296: nnz = lnk[0];
1298: /* if free space is not available, make more free space */
1299: if (current_space->local_remaining<nnz) {
1300: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1301: nspacedouble++;
1302: }
1303: /* copy data into free space, then initialize lnk */
1304: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1305: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1307: current_space->array += nnz;
1308: current_space->local_used += nnz;
1309: current_space->local_remaining -= nnz;
1311: bi[i+1] = bi[i] + nnz;
1312: if (nnz > rmax) rmax = nnz;
1313: }
1314: PetscFree3(buf_ri_k,nextrow,nextci);
1316: PetscMalloc1((bi[pn]+1),&bj);
1317: PetscFreeSpaceContiguous(&free_space,bj);
1319: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1320: if (afill_tmp > afill) afill = afill_tmp;
1321: PetscLLCondensedDestroy(lnk,lnkbt);
1322: MatDestroy(&POt);
1323: MatDestroy(&PDt);
1325: /* create symbolic parallel matrix Cmpi - why can this not be assembled in the numeric part? */
1326: /*----------------------------------------------------------------------------------*/
1327: PetscCalloc1(rmax+1,&vals);
1329: MatCreate(comm,&Cmpi);
1330: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1331: MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);
1332: MatSetType(Cmpi,MATMPIAIJ);
1333: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1334: MatPreallocateFinalize(dnz,onz);
1335: MatSetBlockSize(Cmpi,1);
1336: for (i=0; i<pn; i++) {
1337: row = i + rstart;
1338: nnz = bi[i+1] - bi[i];
1339: Jptr = bj + bi[i];
1340: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1341: }
1342: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1343: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1344: PetscFree(vals);
1346: merge->bi = bi;
1347: merge->bj = bj;
1348: merge->coi = coi;
1349: merge->coj = coj;
1350: merge->buf_ri = buf_ri;
1351: merge->buf_rj = buf_rj;
1352: merge->owners_co = owners_co;
1353: merge->destroy = Cmpi->ops->destroy;
1354: merge->duplicate = Cmpi->ops->duplicate;
1356: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
1357: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1359: /* attach the supporting struct to Cmpi for reuse */
1360: c = (Mat_MPIAIJ*)Cmpi->data;
1361: c->ptap = ptap;
1362: ptap->api = NULL;
1363: ptap->apj = NULL;
1364: ptap->merge = merge;
1365: ptap->rmax = rmax;
1367: *C = Cmpi;
1368: #if defined(PETSC_USE_INFO)
1369: if (bi[pn] != 0) {
1370: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1371: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1372: } else {
1373: PetscInfo(Cmpi,"Empty matrix product\n");
1374: }
1375: #endif
1376: return(0);
1377: }
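/*
   Illustration (not part of the original source): each i-structure message assembled above
   has the layout  buf_si[0] = nrows,  buf_si[1..nrows] = local row indices,
   buf_si[nrows+1..2*nrows+1] = offsets into the matching j/value message.  A sketch of
   walking such a message on the receiving side:
*/
#if 0
static void WalkIStructure(const PetscInt *buf_si,const PetscInt *buf_rj)
{
  PetscInt       nrows = buf_si[0];
  const PetscInt *rows = buf_si + 1;           /* local row indices, length nrows */
  const PetscInt *ci   = buf_si + nrows + 1;   /* row offsets, length nrows+1 */
  PetscInt       r,k;

  for (r=0; r<nrows; r++) {
    for (k=ci[r]; k<ci[r+1]; k++) {
      /* column buf_rj[k] contributes to local row rows[r] of C */
      (void)rows[r]; (void)buf_rj[k];
    }
  }
}
#endif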
1381: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
1382: {
1383: PetscErrorCode ierr;
1384: Mat_Merge_SeqsToMPI *merge;
1385: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1386: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1387: Mat_PtAPMPI *ptap;
1388: PetscInt *adj;
1389: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1390: MatScalar *ada,*ca,valtmp;
1391: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1392: MPI_Comm comm;
1393: PetscMPIInt size,rank,taga,*len_s;
1394: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1395: PetscInt **buf_ri,**buf_rj;
1396: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1397: MPI_Request *s_waits,*r_waits;
1398: MPI_Status *status;
1399: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1400: PetscInt *ai,*aj,*coi,*coj;
1401: PetscInt *poJ,*pdJ;
1402: Mat A_loc;
1403: Mat_SeqAIJ *a_loc;
1406: PetscObjectGetComm((PetscObject)C,&comm);
1407: MPI_Comm_size(comm,&size);
1408: MPI_Comm_rank(comm,&rank);
1410: ptap = c->ptap;
1411: merge = ptap->merge;
1413: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1414: /*------------------------------------------*/
1415: /* get data from symbolic products */
1416: coi = merge->coi; coj = merge->coj;
1417: PetscCalloc1((coi[pon]+1),&coa);
1418: bi = merge->bi; bj = merge->bj;
1419: owners = merge->rowmap->range;
1420: PetscCalloc1(bi[cm]+1,&ba);
1422: /* get A_loc by taking all local rows of A */
1423: A_loc = ptap->A_loc;
1424: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1425: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1426: ai = a_loc->i;
1427: aj = a_loc->j;
1429: for (i=0; i<am; i++) {
1430: anz = ai[i+1] - ai[i];
1431: adj = aj + ai[i];
1432: ada = a_loc->a + ai[i];
1434: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1435: /*-------------------------------------------------------------*/
1436: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1437: pnz = po->i[i+1] - po->i[i];
1438: poJ = po->j + po->i[i];
1439: pA = po->a + po->i[i];
1440: for (j=0; j<pnz; j++) {
1441: row = poJ[j];
1442: cj = coj + coi[row];
1443: ca = coa + coi[row];
1444: /* perform sparse axpy */
1445: nexta = 0;
1446: valtmp = pA[j];
1447: for (k=0; nexta<anz; k++) {
1448: if (cj[k] == adj[nexta]) {
1449: ca[k] += valtmp*ada[nexta];
1450: nexta++;
1451: }
1452: }
1453: PetscLogFlops(2.0*anz);
1454: }
1456: /* put the value into Cd (diagonal part) */
1457: pnz = pd->i[i+1] - pd->i[i];
1458: pdJ = pd->j + pd->i[i];
1459: pA = pd->a + pd->i[i];
1460: for (j=0; j<pnz; j++) {
1461: row = pdJ[j];
1462: cj = bj + bi[row];
1463: ca = ba + bi[row];
1464: /* perform sparse axpy */
1465: nexta = 0;
1466: valtmp = pA[j];
1467: for (k=0; nexta<anz; k++) {
1468: if (cj[k] == adj[nexta]) {
1469: ca[k] += valtmp*ada[nexta];
1470: nexta++;
1471: }
1472: }
1473: PetscLogFlops(2.0*anz);
1474: }
1475: }
1477: /* 3) send and recv matrix values coa */
1478: /*------------------------------------*/
1479: buf_ri = merge->buf_ri;
1480: buf_rj = merge->buf_rj;
1481: len_s = merge->len_s;
1482: PetscCommGetNewTag(comm,&taga);
1483: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1485: PetscMalloc2(merge->nsend+1,&s_waits,size,&status);
1486: for (proc=0,k=0; proc<size; proc++) {
1487: if (!len_s[proc]) continue;
1488: i = merge->owners_co[proc];
1489: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1490: k++;
1491: }
1492: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1493: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1495: PetscFree2(s_waits,status);
1496: PetscFree(r_waits);
1497: PetscFree(coa);
1499: /* 4) insert local Cseq and received values into Cmpi */
1500: /*----------------------------------------------------*/
1501: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1502: for (k=0; k<merge->nrecv; k++) {
1503: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1504: nrows = *(buf_ri_k[k]);
1505: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */
1506: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1507: }
1509: for (i=0; i<cm; i++) {
1510: row = owners[rank] + i; /* global row index of C_seq */
1511: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1512: ba_i = ba + bi[i];
1513: bnz = bi[i+1] - bi[i];
1514: /* add received vals into ba */
1515: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1516: /* i-th row */
1517: if (i == *nextrow[k]) {
1518: cnz = *(nextci[k]+1) - *nextci[k];
1519: cj = buf_rj[k] + *(nextci[k]);
1520: ca = abuf_r[k] + *(nextci[k]);
1521: nextcj = 0;
1522: for (j=0; nextcj<cnz; j++) {
1523: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1524: ba_i[j] += ca[nextcj++];
1525: }
1526: }
1527: nextrow[k]++; nextci[k]++;
1528: PetscLogFlops(2.0*cnz);
1529: }
1530: }
1531: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1532: }
1533: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1534: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1536: PetscFree(ba);
1537: PetscFree(abuf_r[0]);
1538: PetscFree(abuf_r);
1539: PetscFree3(buf_ri_k,nextrow,nextci);
1540: return(0);
1541: }
1543: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ();
1544: it differs from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable() in using PetscLLCondensedCreate_Scalable() */
1547: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1548: {
1549: PetscErrorCode ierr;
1550: Mat Cmpi,A_loc,POt,PDt;
1551: Mat_PtAPMPI *ptap;
1552: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1553: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1554: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1555: PetscInt nnz;
1556: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1557: PetscInt am =A->rmap->n,pn=P->cmap->n;
1558: MPI_Comm comm;
1559: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1560: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1561: PetscInt len,proc,*dnz,*onz,*owners;
1562: PetscInt nzi,*bi,*bj;
1563: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1564: MPI_Request *swaits,*rwaits;
1565: MPI_Status *sstatus,rstatus;
1566: Mat_Merge_SeqsToMPI *merge;
1567: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1568: PetscReal afill =1.0,afill_tmp;
1569: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1570: PetscScalar *vals;
1571: Mat_SeqAIJ *a_loc, *pdt,*pot;
1574: PetscObjectGetComm((PetscObject)A,&comm);
1575: /* check if matrix local sizes are compatible */
1576: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1577: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1578: }
1580: MPI_Comm_size(comm,&size);
1581: MPI_Comm_rank(comm,&rank);
1583: /* create struct Mat_PtAPMPI and attach it to C later */
1584: PetscNew(&ptap);
1586: /* get A_loc by taking all local rows of A */
1587: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1589: ptap->A_loc = A_loc;
1590: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1591: ai = a_loc->i;
1592: aj = a_loc->j;
1594: /* determine symbolic Co=(p->B)^T*A - send to others */
1595: /*----------------------------------------------------*/
1596: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1597: pdt = (Mat_SeqAIJ*)PDt->data;
1598: pdti = pdt->i; pdtj = pdt->j;
1600: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1601: pot = (Mat_SeqAIJ*)POt->data;
1602: poti = pot->i; potj = pot->j;
1604: /* then, compute symbolic Co = (p->B)^T*A */
1605: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1606: >= (num of nonzero rows of C_seq) - pn */
1607: PetscMalloc1((pon+1),&coi);
1608: coi[0] = 0;
1610: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1611: nnz = fill*(poti[pon] + ai[am]);
1612: PetscFreeSpaceGet(nnz,&free_space);
1613: current_space = free_space;
1615: /* create and initialize a linked list */
1616: i = PetscMax(pdt->rmax,pot->rmax);
1617: Crmax = i*a_loc->rmax*size; /* non-scalable! */
1618: if (!Crmax || Crmax > aN) Crmax = aN;
1619: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
1621: for (i=0; i<pon; i++) {
1622: pnz = poti[i+1] - poti[i];
1623: ptJ = potj + poti[i];
1624: for (j=0; j<pnz; j++) {
1625: row = ptJ[j]; /* row of A_loc == col of Pot */
1626: anz = ai[row+1] - ai[row];
1627: Jptr = aj + ai[row];
1628: /* add non-zero cols of A_loc into the sorted linked list lnk */
1629: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1630: }
1631: nnz = lnk[0];
1633: /* If free space is not available, double the total space in the list */
1634: if (current_space->local_remaining<nnz) {
1635: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1636: nspacedouble++;
1637: }
1639: /* Copy data into free space, then initialize lnk */
1640: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1642: current_space->array += nnz;
1643: current_space->local_used += nnz;
1644: current_space->local_remaining -= nnz;
1646: coi[i+1] = coi[i] + nnz;
1647: }
1649: PetscMalloc1((coi[pon]+1),&coj);
1650: PetscFreeSpaceContiguous(&free_space,coj);
1652: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1653: if (afill_tmp > afill) afill = afill_tmp;
1655: /* send j-array (coj) of Co to other processors */
1656: /*----------------------------------------------*/
1657: /* determine row ownership */
1658: PetscNew(&merge);
1659: PetscLayoutCreate(comm,&merge->rowmap);
1661: merge->rowmap->n = pn;
1662: merge->rowmap->bs = 1;
1664: PetscLayoutSetUp(merge->rowmap);
1665: owners = merge->rowmap->range;
1667: /* determine the number of messages to send, their lengths */
1668: PetscCalloc1(size,&len_si);
1669: PetscMalloc1(size,&merge->len_s);
1671: len_s = merge->len_s;
1672: merge->nsend = 0;
1674: PetscMalloc1((size+2),&owners_co);
1675: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1677: proc = 0;
1678: for (i=0; i<pon; i++) {
1679: while (prmap[i] >= owners[proc+1]) proc++;
1680: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1681: len_s[proc] += coi[i+1] - coi[i];
1682: }
1684: len = 0; /* max length of buf_si[] */
1685: owners_co[0] = 0;
1686: for (proc=0; proc<size; proc++) {
1687: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1688: if (len_si[proc]) {
1689: merge->nsend++;
1690: len_si[proc] = 2*(len_si[proc] + 1);
1691: len += len_si[proc];
1692: }
1693: }
1695: /* determine the number and length of messages to receive for coi and coj */
1696: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1697: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1699: /* post the Irecv and Isend of coj */
1700: PetscCommGetNewTag(comm,&tagj);
1701: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1702: PetscMalloc1((merge->nsend+1),&swaits);
1703: for (proc=0, k=0; proc<size; proc++) {
1704: if (!len_s[proc]) continue;
1705: i = owners_co[proc];
1706: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1707: k++;
1708: }
1710: /* receives and sends of coj are complete */
1711: PetscMalloc1(size,&sstatus);
1712: for (i=0; i<merge->nrecv; i++) {
1713: PetscMPIInt icompleted;
1714: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1715: }
1716: PetscFree(rwaits);
1717: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1719: /* send and recv coi */
1720: /*-------------------*/
1721: PetscCommGetNewTag(comm,&tagi);
1722: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1723: PetscMalloc1((len+1),&buf_s);
1724: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1725: for (proc=0,k=0; proc<size; proc++) {
1726: if (!len_s[proc]) continue;
1727: /* form outgoing message for i-structure:
1728: buf_si[0]: nrows to be sent
1729: [1:nrows]: row index (global)
1730: [nrows+1:2*nrows+1]: i-structure index
1731: */
1732: /*-------------------------------------------*/
1733: nrows = len_si[proc]/2 - 1;
1734: buf_si_i = buf_si + nrows+1;
1735: buf_si[0] = nrows;
1736: buf_si_i[0] = 0;
1737: nrows = 0;
1738: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1739: nzi = coi[i+1] - coi[i];
1740: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1741: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1742: nrows++;
1743: }
1744: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1745: k++;
1746: buf_si += len_si[proc];
1747: }
1748: i = merge->nrecv;
1749: while (i--) {
1750: PetscMPIInt icompleted;
1751: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1752: }
1753: PetscFree(rwaits);
1754: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1755: PetscFree(len_si);
1756: PetscFree(len_ri);
1757: PetscFree(swaits);
1758: PetscFree(sstatus);
1759: PetscFree(buf_s);
1761: /* compute the local portion of C (mpi mat) */
1762: /*------------------------------------------*/
1763: /* allocate bi array and free space for accumulating nonzero column info */
1764: PetscMalloc1((pn+1),&bi);
1765: bi[0] = 0;
1767: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1768: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1769: PetscFreeSpaceGet(nnz,&free_space);
1770: current_space = free_space;
1772: PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);
1773: for (k=0; k<merge->nrecv; k++) {
1774: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1775: nrows = *buf_ri_k[k];
1776: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1777: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1778: }
1780: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1781: rmax = 0;
1782: for (i=0; i<pn; i++) {
1784: /* add pdt[i,:]*A into lnk */
1784: pnz = pdti[i+1] - pdti[i];
1785: ptJ = pdtj + pdti[i];
1786: for (j=0; j<pnz; j++) {
1787: row = ptJ[j]; /* row of A_loc == col of Pt */
1788: anz = ai[row+1] - ai[row];
1789: Jptr = aj + ai[row];
1790: /* add non-zero cols of A_loc into the sorted linked list lnk */
1791: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1792: }
1794: /* add received col data into lnk */
1795: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1796: if (i == *nextrow[k]) { /* i-th row */
1797: nzi = *(nextci[k]+1) - *nextci[k];
1798: Jptr = buf_rj[k] + *nextci[k];
1799: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1800: nextrow[k]++; nextci[k]++;
1801: }
1802: }
1803: nnz = lnk[0];
1805: /* if free space is not available, make more free space */
1806: if (current_space->local_remaining<nnz) {
1807: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1808: nspacedouble++;
1809: }
1810: /* copy data into free space, then initialize lnk */
1811: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1812: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1814: current_space->array += nnz;
1815: current_space->local_used += nnz;
1816: current_space->local_remaining -= nnz;
1818: bi[i+1] = bi[i] + nnz;
1819: if (nnz > rmax) rmax = nnz;
1820: }
1821: PetscFree3(buf_ri_k,nextrow,nextci);
1823: PetscMalloc1((bi[pn]+1),&bj);
1824: PetscFreeSpaceContiguous(&free_space,bj);
1825: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1826: if (afill_tmp > afill) afill = afill_tmp;
1827: PetscLLCondensedDestroy_Scalable(lnk);
1828: MatDestroy(&POt);
1829: MatDestroy(&PDt);
1831: /* create symbolic parallel matrix Cmpi - why can this not be assembled in the numeric part? */
1832: /*----------------------------------------------------------------------------------*/
1833: PetscCalloc1(rmax+1,&vals);
1835: MatCreate(comm,&Cmpi);
1836: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1837: MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);
1838: MatSetType(Cmpi,MATMPIAIJ);
1839: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1840: MatPreallocateFinalize(dnz,onz);
1841: MatSetBlockSize(Cmpi,1);
1842: for (i=0; i<pn; i++) {
1843: row = i + rstart;
1844: nnz = bi[i+1] - bi[i];
1845: Jptr = bj + bi[i];
1846: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1847: }
1848: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1849: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1850: PetscFree(vals);
1852: merge->bi = bi;
1853: merge->bj = bj;
1854: merge->coi = coi;
1855: merge->coj = coj;
1856: merge->buf_ri = buf_ri;
1857: merge->buf_rj = buf_rj;
1858: merge->owners_co = owners_co;
1859: merge->destroy = Cmpi->ops->destroy;
1860: merge->duplicate = Cmpi->ops->duplicate;
1862: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1863: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1865: /* attach the supporting struct to Cmpi for reuse */
1866: c = (Mat_MPIAIJ*)Cmpi->data;
1868: c->ptap = ptap;
1869: ptap->api = NULL;
1870: ptap->apj = NULL;
1871: ptap->merge = merge;
1872: ptap->rmax = rmax;
1873: ptap->apa = NULL;
1875: *C = Cmpi;
1876: #if defined(PETSC_USE_INFO)
1877: if (bi[pn] != 0) {
1878: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);
1879: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);
1880: } else {
1881: PetscInfo(Cmpi,"Empty matrix product\n");
1882: }
1883: #endif
1884: return(0);
1885: }