Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3:   and to find submatrices that are shared across processors.
4: */
5: #include "src/mat/impls/aij/mpi/mpiaij.h"
6: #include "petscbt.h"
8: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
9: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
11: EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
17: {
19: PetscInt i;
22: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23: for (i=0; i<ov; ++i) {
24: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
25: }
26: return(0);
27: }
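/*
   Illustrative usage sketch (not compiled; hypothetical caller code): the public
   MatIncreaseOverlap() interface dispatches to the routine above for MPIAIJ
   matrices. The matrix A is assumed to exist already and the seed rows are chosen
   arbitrarily for illustration.
*/
#if 0
{
  PetscInt rows[3] = {0,1,2};
  IS       is;

  ISCreateGeneral(PETSC_COMM_SELF,3,rows,&is); /* seed index set of rows */
  MatIncreaseOverlap(A,1,&is,2);               /* grow it by two levels of overlap */
  /* ... use the enlarged index set ... */
  ISDestroy(is);
}
#endif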
29: /*
30: Sample message format:
31: If a processor A wants processor B to process some elements corresponding
32: to index sets is[1],is[5]
33: mesg [0] = 2 (no of index sets in the mesg)
34: -----------
35: mesg [1] = 1 => is[1]
36: mesg [2] = sizeof(is[1]);
37: -----------
38: mesg [3] = 5 => is[5]
39: mesg [4] = sizeof(is[5]);
40: -----------
41:    mesg [5]
42:    mesg [n]    data(is[1])
43:     -----------
44:    mesg [n+1]
45:    mesg [m]    data(is[5])
46:     -----------
47:
48: Notes:
49: nrqs - no of requests sent (or to be sent out)
50:   nrqr - no of requests received (which have to be, or which have been, processed)
51: */
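/*
   Worked example of the format above (hypothetical numbers): if processor A asks
   processor B to process rows {4,7,9} on behalf of is[1] and rows {2,8} on behalf
   of is[5], the single packed message of PetscInt is

      mesg[] = { 2,          number of index sets in the message
                 1, 3,       is[1] and its data length
                 5, 2,       is[5] and its data length
                 4, 7, 9,    data for is[1]
                 2, 8 };     data for is[5]

   so the total length is 1 + 2*(no of IS) + total data, which matches the
   w1[j] = w2[j] + 2*w3[j] + (data count) bookkeeping used below.
*/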
54: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
55: {
56: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
57: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
58: PetscInt **idx,*n,*rtable,**data,len,*idx_i;
60: PetscMPIInt size,rank,tag1,tag2;
61: PetscInt m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
62: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
63: PetscBT *table;
64: MPI_Comm comm;
65: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
66: MPI_Status *s_status,*recv_status;
69: comm = C->comm;
70: size = c->size;
71: rank = c->rank;
72: m = C->M;
74: PetscObjectGetNewTag((PetscObject)C,&tag1);
75: PetscObjectGetNewTag((PetscObject)C,&tag2);
76:
77: len = (imax+1)*sizeof(PetscInt*)+ (imax + m)*sizeof(PetscInt);
78: PetscMalloc(len,&idx);
79: n = (PetscInt*)(idx + imax);
80: rtable = n + imax;
81:
82: for (i=0; i<imax; i++) {
83: ISGetIndices(is[i],&idx[i]);
84: ISGetLocalSize(is[i],&n[i]);
85: }
86:
87:   /* Create hash table for the mapping: row -> proc */
88: for (i=0,j=0; i<size; i++) {
89: len = c->rowners[i+1];
90: for (; j<len; j++) {
91: rtable[j] = i;
92: }
93: }
95:   /* evaluate communication - mesg to who, length of mesg, and buffer space
96:      required. Based on this, buffers are allocated, and data copied into them */
97: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1);/* mesg size */
98: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
99: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
100: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
101: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialise work vector*/
102: for (i=0; i<imax; i++) {
103: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
104: idx_i = idx[i];
105: len = n[i];
106: for (j=0; j<len; j++) {
107: row = idx_i[j];
108: if (row < 0) {
109: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
110: }
111: proc = rtable[row];
112: w4[proc]++;
113: }
114: for (j=0; j<size; j++){
115: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
116: }
117: }
119: nrqs = 0; /* no of outgoing messages */
120:   msz      = 0;              /* total mesg length (for all procs) */
121:   w1[rank] = 0;              /* no mesg sent to itself */
122: w3[rank] = 0;
123: for (i=0; i<size; i++) {
124: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
125: }
126: /* pa - is list of processors to communicate with */
127: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
128: for (i=0,j=0; i<size; i++) {
129: if (w1[i]) {pa[j] = i; j++;}
130: }
132: /* Each message would have a header = 1 + 2*(no of IS) + data */
133: for (i=0; i<nrqs; i++) {
134: j = pa[i];
135: w1[j] += w2[j] + 2*w3[j];
136: msz += w1[j];
137: }
139:   /* Determine the number of messages to expect, their lengths, and from-ids */
140: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
141: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
143: /* Now post the Irecvs corresponding to these messages */
144: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
146: /* Allocate Memory for outgoing messages */
147: len = 2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt);
148: PetscMalloc(len,&outdat);
149: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
150: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
151: tmp = (PetscInt*)(outdat + 2*size);
152: ctr = tmp + msz;
154: {
155: PetscInt *iptr = tmp,ict = 0;
156: for (i=0; i<nrqs; i++) {
157: j = pa[i];
158: iptr += ict;
159: outdat[j] = iptr;
160: ict = w1[j];
161: }
162: }
164: /* Form the outgoing messages */
165: /*plug in the headers*/
166: for (i=0; i<nrqs; i++) {
167: j = pa[i];
168: outdat[j][0] = 0;
169: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
170: ptr[j] = outdat[j] + 2*w3[j] + 1;
171: }
172:
173: /* Memory for doing local proc's work*/
174: {
175: PetscInt *d_p;
176: char *t_p;
178: len = (imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
179: (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
180: PetscMalloc(len,&table);
181: PetscMemzero(table,len);
182: data = (PetscInt **)(table + imax);
183: isz = (PetscInt *)(data + imax);
184: d_p = (PetscInt *)(isz + imax);
185: t_p = (char *)(d_p + m*imax);
186: for (i=0; i<imax; i++) {
187: table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
188: data[i] = d_p + (m)*i;
189: }
190: }
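  /*
     Note: table[i] is a PetscBT bit array with one bit per global row.
     PetscBTLookupSet() returns the previous value of the bit and then sets it,
     so each row is appended to data[i] at most once while the overlapped index
     set is built up.
  */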
192: /* Parse the IS and update local tables and the outgoing buf with the data*/
193: {
194: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
195: PetscBT table_i;
197: for (i=0; i<imax; i++) {
198: PetscMemzero(ctr,size*sizeof(PetscInt));
199: n_i = n[i];
200: table_i = table[i];
201: idx_i = idx[i];
202: data_i = data[i];
203: isz_i = isz[i];
204: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
205: row = idx_i[j];
206: proc = rtable[row];
207: if (proc != rank) { /* copy to the outgoing buffer */
208: ctr[proc]++;
209: *ptr[proc] = row;
210: ptr[proc]++;
211: } else { /* Update the local table */
212: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
213: }
214: }
215: /* Update the headers for the current IS */
216: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
217: if ((ctr_j = ctr[j])) {
218: outdat_j = outdat[j];
219: k = ++outdat_j[0];
220: outdat_j[2*k] = ctr_j;
221: outdat_j[2*k-1] = i;
222: }
223: }
224: isz[i] = isz_i;
225: }
226: }
227:
230: /* Now post the sends */
231: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
232: for (i=0; i<nrqs; ++i) {
233: j = pa[i];
234: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
235: }
236:
237: /* No longer need the original indices*/
238: for (i=0; i<imax; ++i) {
239: ISRestoreIndices(is[i],idx+i);
240: }
241: PetscFree(idx);
243: for (i=0; i<imax; ++i) {
244: ISDestroy(is[i]);
245: }
246:
247: /* Do Local work*/
248: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
250: /* Receive messages*/
251: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
252: MPI_Waitall(nrqr,r_waits1,recv_status);
253:
254: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
255: MPI_Waitall(nrqs,s_waits1,s_status);
257: /* Phase 1 sends are complete - deallocate buffers */
258: PetscFree(outdat);
259: PetscFree(w1);
261: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
262: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
263: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
264: PetscFree(rbuf);
266:
267: /* Send the data back*/
268: /* Do a global reduction to know the buffer space req for incoming messages*/
269: {
270: PetscMPIInt *rw1;
271:
272: PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
273: PetscMemzero(rw1,size*sizeof(PetscMPIInt));
275: for (i=0; i<nrqr; ++i) {
276: proc = recv_status[i].MPI_SOURCE;
277: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
278: rw1[proc] = isz1[i];
279: }
280: PetscFree(onodes1);
281: PetscFree(olengths1);
283:     /* Determine the number of messages to expect, their lengths, and from-ids */
284: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
285: PetscFree(rw1);
286: }
287: /* Now post the Irecvs corresponding to these messages */
288: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
290: /* Now post the sends */
291: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
292: for (i=0; i<nrqr; ++i) {
293: j = recv_status[i].MPI_SOURCE;
294: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
295: }
297: /* receive work done on other processors*/
298: {
299: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
300: PetscMPIInt idex;
301: PetscBT table_i;
302: MPI_Status *status2;
303:
304: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
305: for (i=0; i<nrqs; ++i) {
306: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
307: /* Process the message*/
308: rbuf2_i = rbuf2[idex];
309: ct1 = 2*rbuf2_i[0]+1;
310: jmax = rbuf2[idex][0];
311: for (j=1; j<=jmax; j++) {
312: max = rbuf2_i[2*j];
313: is_no = rbuf2_i[2*j-1];
314: isz_i = isz[is_no];
315: data_i = data[is_no];
316: table_i = table[is_no];
317: for (k=0; k<max; k++,ct1++) {
318: row = rbuf2_i[ct1];
319: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
320: }
321: isz[is_no] = isz_i;
322: }
323: }
325: MPI_Waitall(nrqr,s_waits2,status2);
326: PetscFree(status2);
327: }
328:
329: for (i=0; i<imax; ++i) {
330: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
331: }
332:
333: PetscFree(onodes2);
334: PetscFree(olengths2);
336: PetscFree(pa);
337: PetscFree(rbuf2);
338: PetscFree(s_waits1);
339: PetscFree(r_waits1);
340: PetscFree(s_waits2);
341: PetscFree(r_waits2);
342: PetscFree(table);
343: PetscFree(s_status);
344: PetscFree(recv_status);
345: PetscFree(xdata[0]);
346: PetscFree(xdata);
347: PetscFree(isz1);
348: return(0);
349: }
353: /*
354:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
355: the work on the local processor.
357: Inputs:
358: C - MAT_MPIAIJ;
359: imax - total no of index sets processed at a time;
360: table - an array of char - size = m bits.
361:
362: Output:
363:    isz  - array containing the count of the solution elements corresponding
364: to each index set;
365: data - pointer to the solutions
366: */
367: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
368: {
369: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
370: Mat A = c->A,B = c->B;
371: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
372: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
373: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
374: PetscBT table_i;
377: rstart = c->rstart;
378: cstart = c->cstart;
379: ai = a->i;
380: aj = a->j;
381: bi = b->i;
382: bj = b->j;
383: garray = c->garray;
385:
386: for (i=0; i<imax; i++) {
387: data_i = data[i];
388: table_i = table[i];
389: isz_i = isz[i];
390: for (j=0,max=isz[i]; j<max; j++) {
391: row = data_i[j] - rstart;
392: start = ai[row];
393: end = ai[row+1];
394: for (k=start; k<end; k++) { /* Amat */
395: val = aj[k] + cstart;
396: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
397: }
398: start = bi[row];
399: end = bi[row+1];
400: for (k=start; k<end; k++) { /* Bmat */
401: val = garray[bj[k]];
402: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
403: }
404: }
405: isz[i] = isz_i;
406: }
407: return(0);
408: }
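/*
   Minimal sketch (not compiled) of the storage convention the loops above rely on:
   each local row of an MPIAIJ matrix is split between the diagonal block c->A
   (columns stored relative to cstart) and the off-diagonal block c->B (columns
   stored compressed and mapped back to global numbering through garray). The
   helper name is hypothetical; it only illustrates how the global column indices
   of one local row are recovered.
*/
#if 0
static PetscInt GetGlobalColsOfLocalRow(Mat C,PetscInt lrow,PetscInt *cols)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)c->A->data,*b = (Mat_SeqAIJ*)c->B->data;
  PetscInt   k,n = 0;

  for (k=a->i[lrow]; k<a->i[lrow+1]; k++) cols[n++] = a->j[k] + c->cstart; /* diagonal block */
  for (k=b->i[lrow]; k<b->i[lrow+1]; k++) cols[n++] = c->garray[b->j[k]];  /* off-diagonal block */
  return n;
}
#endif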
412: /*
413:    MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages
414:    and return the output
416: Input:
417: C - the matrix
418: nrqr - no of messages being processed.
419:    rbuf - an array of pointers to the received requests
420:
421: Output:
422: xdata - array of messages to be sent back
423: isz1 - size of each message
425:    For better efficiency perhaps we should malloc each xdata[i] separately;
426:    then if a realloc is required we need only copy the data for that one row
427:    rather than all previous rows, as it is now where a single large chunk of
428:    memory is used.
430: */
431: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
432: {
433: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
434: Mat A = c->A,B = c->B;
435: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
437: PetscMPIInt rank;
438: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
439: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
440: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
441: PetscInt *rbuf_i,kmax,rbuf_0;
442: PetscBT xtable;
445: rank = c->rank;
446: m = C->M;
447: rstart = c->rstart;
448: cstart = c->cstart;
449: ai = a->i;
450: aj = a->j;
451: bi = b->i;
452: bj = b->j;
453: garray = c->garray;
454:
455:
456: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
457: rbuf_i = rbuf[i];
458: rbuf_0 = rbuf_i[0];
459: ct += rbuf_0;
460: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
461: }
462:
463: if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
464: else max1 = 1;
465: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
466: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
467: ++no_malloc;
468: PetscBTCreate(m,xtable);
469: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
470:
471: ct3 = 0;
472:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
473: rbuf_i = rbuf[i];
474: rbuf_0 = rbuf_i[0];
475: ct1 = 2*rbuf_0+1;
476: ct2 = ct1;
477: ct3 += ct1;
478: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
479: PetscBTMemzero(m,xtable);
480: oct2 = ct2;
481: kmax = rbuf_i[2*j];
482: for (k=0; k<kmax; k++,ct1++) {
483: row = rbuf_i[ct1];
484: if (!PetscBTLookupSet(xtable,row)) {
485: if (!(ct3 < mem_estimate)) {
486: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
487: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
488: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
489: PetscFree(xdata[0]);
490: xdata[0] = tmp;
491: mem_estimate = new_estimate; ++no_malloc;
492: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
493: }
494: xdata[i][ct2++] = row;
495: ct3++;
496: }
497: }
498: for (k=oct2,max2=ct2; k<max2; k++) {
499: row = xdata[i][k] - rstart;
500: start = ai[row];
501: end = ai[row+1];
502: for (l=start; l<end; l++) {
503: val = aj[l] + cstart;
504: if (!PetscBTLookupSet(xtable,val)) {
505: if (!(ct3 < mem_estimate)) {
506: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
507: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
508: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
509: PetscFree(xdata[0]);
510: xdata[0] = tmp;
511: mem_estimate = new_estimate; ++no_malloc;
512: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
513: }
514: xdata[i][ct2++] = val;
515: ct3++;
516: }
517: }
518: start = bi[row];
519: end = bi[row+1];
520: for (l=start; l<end; l++) {
521: val = garray[bj[l]];
522: if (!PetscBTLookupSet(xtable,val)) {
523: if (!(ct3 < mem_estimate)) {
524: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
525: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
526: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
527: PetscFree(xdata[0]);
528: xdata[0] = tmp;
529: mem_estimate = new_estimate; ++no_malloc;
530: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
531: }
532: xdata[i][ct2++] = val;
533: ct3++;
534: }
535: }
536: }
537: /* Update the header*/
538: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
539: xdata[i][2*j-1] = rbuf_i[2*j-1];
540: }
541: xdata[i][0] = rbuf_0;
542: xdata[i+1] = xdata[i] + ct2;
543: isz1[i] = ct2; /* size of each message */
544: }
545: PetscBTDestroy(xtable);
546: PetscLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate, ct3,no_malloc);
547: return(0);
548: }
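/*
   Sketch (not compiled) of the growth strategy used above: all replies share the
   single buffer xdata[0]; when it fills up it is grown by roughly 1.5x, the old
   contents are copied over, and the per-message pointers xdata[1..i] are re-based
   into the new allocation. Variable names follow the routine above; the helper
   itself is hypothetical.
*/
#if 0
static PetscErrorCode GrowReplyBuffer(PetscInt **xdata,PetscInt *isz1,PetscInt i,PetscInt *mem_estimate)
{
  PetscInt *tmp,ctr,new_estimate = (PetscInt)(1.5*(*mem_estimate)) + 1;

  PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);              /* larger buffer */
  PetscMemcpy(tmp,xdata[0],(*mem_estimate)*sizeof(PetscInt));   /* keep what is already packed */
  PetscFree(xdata[0]);
  xdata[0]      = tmp;
  *mem_estimate = new_estimate;
  for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; /* re-base sub-pointers */
  return 0;
}
#endif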
549: /* -------------------------------------------------------------------------*/
550: EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
551: EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
552: /*
553: Every processor gets the entire matrix
554: */
557: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
558: {
559: Mat B;
560: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
561: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
563: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
564: PetscInt sendcount,i,*rstarts = a->rowners,n,cnt,j;
565: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
566: PetscScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
569: MPI_Comm_size(A->comm,&size);
570: MPI_Comm_rank(A->comm,&rank);
572: if (scall == MAT_INITIAL_MATRIX) {
573: /* ----------------------------------------------------------------
574: Tell every processor the number of nonzeros per row
575: */
576: PetscMalloc(A->M*sizeof(PetscInt),&lens);
577: for (i=a->rstart; i<a->rend; i++) {
578: lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
579: }
580: sendcount = a->rend - a->rstart;
581: PetscMalloc(2*size*sizeof(PetscMPIInt),&recvcounts);
582: displs = recvcounts + size;
583: for (i=0; i<size; i++) {
584: recvcounts[i] = a->rowners[i+1] - a->rowners[i];
585: displs[i] = a->rowners[i];
586: }
587: MPI_Allgatherv(lens+a->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,A->comm);
589: /* ---------------------------------------------------------------
590: Create the sequential matrix of the same type as the local block diagonal
591: */
592: MatCreate(PETSC_COMM_SELF,A->M,A->N,PETSC_DETERMINE,PETSC_DETERMINE,&B);
593: MatSetType(B,a->A->type_name);
594: MatSeqAIJSetPreallocation(B,0,lens);
595: PetscMalloc(sizeof(Mat),Bin);
596: **Bin = B;
597: b = (Mat_SeqAIJ *)B->data;
599: /*--------------------------------------------------------------------
600: Copy my part of matrix column indices over
601: */
602: sendcount = ad->nz + bd->nz;
603: jsendbuf = b->j + b->i[rstarts[rank]];
604: a_jsendbuf = ad->j;
605: b_jsendbuf = bd->j;
606: n = a->rend - a->rstart;
607: cnt = 0;
608: for (i=0; i<n; i++) {
610: /* put in lower diagonal portion */
611: m = bd->i[i+1] - bd->i[i];
612: while (m > 0) {
613: /* is it above diagonal (in bd (compressed) numbering) */
614: if (garray[*b_jsendbuf] > a->rstart + i) break;
615: jsendbuf[cnt++] = garray[*b_jsendbuf++];
616: m--;
617: }
619: /* put in diagonal portion */
620: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
621: jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
622: }
624: /* put in upper diagonal portion */
625: while (m-- > 0) {
626: jsendbuf[cnt++] = garray[*b_jsendbuf++];
627: }
628: }
629: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
631: /*--------------------------------------------------------------------
632: Gather all column indices to all processors
633: */
634: for (i=0; i<size; i++) {
635: recvcounts[i] = 0;
636: for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
637: recvcounts[i] += lens[j];
638: }
639: }
640: displs[0] = 0;
641: for (i=1; i<size; i++) {
642: displs[i] = displs[i-1] + recvcounts[i-1];
643: }
644: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,A->comm);
646: /*--------------------------------------------------------------------
647: Assemble the matrix into useable form (note numerical values not yet set)
648: */
649: /* set the b->ilen (length of each row) values */
650: PetscMemcpy(b->ilen,lens,A->M*sizeof(PetscInt));
651: /* set the b->i indices */
652: b->i[0] = 0;
653: for (i=1; i<=A->M; i++) {
654: b->i[i] = b->i[i-1] + lens[i-1];
655: }
656: PetscFree(lens);
657: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
658: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
660: } else {
661: B = **Bin;
662: b = (Mat_SeqAIJ *)B->data;
663: }
665: /*--------------------------------------------------------------------
666: Copy my part of matrix numerical values into the values location
667: */
668: sendcount = ad->nz + bd->nz;
669: sendbuf = b->a + b->i[rstarts[rank]];
670: a_sendbuf = ad->a;
671: b_sendbuf = bd->a;
672: b_sendj = bd->j;
673: n = a->rend - a->rstart;
674: cnt = 0;
675: for (i=0; i<n; i++) {
677: /* put in lower diagonal portion */
678: m = bd->i[i+1] - bd->i[i];
679: while (m > 0) {
680: /* is it above diagonal (in bd (compressed) numbering) */
681: if (garray[*b_sendj] > a->rstart + i) break;
682: sendbuf[cnt++] = *b_sendbuf++;
683: m--;
684: b_sendj++;
685: }
687: /* put in diagonal portion */
688: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
689: sendbuf[cnt++] = *a_sendbuf++;
690: }
692: /* put in upper diagonal portion */
693: while (m-- > 0) {
694: sendbuf[cnt++] = *b_sendbuf++;
695: b_sendj++;
696: }
697: }
698: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
699:
700: /* -----------------------------------------------------------------
701: Gather all numerical values to all processors
702: */
703: if (!recvcounts) {
704: PetscMalloc(2*size*sizeof(PetscInt),&recvcounts);
705: displs = recvcounts + size;
706: }
707: for (i=0; i<size; i++) {
708: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
709: }
710: displs[0] = 0;
711: for (i=1; i<size; i++) {
712: displs[i] = displs[i-1] + recvcounts[i-1];
713: }
714: recvbuf = b->a;
715: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);
716: PetscFree(recvcounts);
717: if (A->symmetric){
718: MatSetOption(B,MAT_SYMMETRIC);
719: } else if (A->hermitian) {
720: MatSetOption(B,MAT_HERMITIAN);
721: } else if (A->structurally_symmetric) {
722: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC);
723: }
725: return(0);
726: }
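/*
   Note: the routine above gathers the matrix in three MPI_Allgatherv() passes, all
   driven by the ownership ranges a->rowners[]: first the per-row nonzero counts,
   then the global column indices, and finally the numerical values; only the last
   pass is repeated when scall == MAT_REUSE_MATRIX.
*/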
730: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
731: {
733: PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
734: PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
737: /*
738:        Check for the special case where each processor gets the entire matrix
739: */
740: if (ismax == 1 && C->M == C->N) {
741: ISIdentity(*isrow,&rowflag);
742: ISIdentity(*iscol,&colflag);
743: ISGetLocalSize(*isrow,&nrow);
744: ISGetLocalSize(*iscol,&ncol);
745: if (rowflag && colflag && nrow == C->M && ncol == C->N) {
746: wantallmatrix = PETSC_TRUE;
747: PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
748: }
749: }
750: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);
751: if (twantallmatrix) {
752: MatGetSubMatrix_MPIAIJ_All(C,scall,submat);
753: return(0);
754: }
756: /* Allocate memory to hold all the submatrices */
757: if (scall != MAT_REUSE_MATRIX) {
758: PetscMalloc((ismax+1)*sizeof(Mat),submat);
759: }
760: /* Determine the number of stages through which submatrices are done */
761: nmax = 20*1000000 / (C->N * sizeof(PetscInt));
762: if (!nmax) nmax = 1;
763: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
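  /* Note: nmax caps the number of index sets handled per stage so that the per-set
     work arrays of C->N integers total roughly 20 MB; larger requests are therefore
     processed in batches of nmax */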
765: /* Make sure every processor loops through the nstages */
766: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,C->comm);
768: for (i=0,pos=0; i<nstages; i++) {
769: if (pos+nmax <= ismax) max_no = nmax;
770: else if (pos == ismax) max_no = 0;
771: else max_no = ismax-pos;
772: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
773: pos += max_no;
774: }
775: return(0);
776: }
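/*
   Illustrative usage sketch (not compiled; hypothetical caller code): extracting a
   single sequential submatrix through the public MatGetSubMatrices() interface,
   which reaches the routine above for MPIAIJ matrices. C, isrow and iscol are
   assumed to exist already (the index sets could come from MatIncreaseOverlap()).
*/
#if 0
{
  Mat *submat;

  MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat); /* first extraction */
  /* ... use submat[0] ... */
  MatGetSubMatrices(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);   /* refill with current values */
  MatDestroyMatrices(1,&submat);
}
#endif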
777: /* -------------------------------------------------------------------------*/
780: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
781: {
782: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
783: Mat A = c->A;
784: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
785: PetscInt **irow,**icol,*nrow,*ncol,start;
787: PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
788: PetscInt **sbuf1,**sbuf2,m,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
789: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
790: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
791: PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
792: PetscInt len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
793: PetscInt *rmap_i;
794: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
795: MPI_Request *r_waits4,*s_waits3,*s_waits4;
796: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
797: MPI_Status *r_status3,*r_status4,*s_status4;
798: MPI_Comm comm;
799: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
800: PetscTruth sorted;
801: PetscMPIInt *onodes1,*olengths1;
802: PetscMPIInt idex,idex2,end;
805: comm = C->comm;
806: tag0 = C->tag;
807: size = c->size;
808: rank = c->rank;
809: m = C->M;
810:
811: /* Get some new tags to keep the communication clean */
812: PetscObjectGetNewTag((PetscObject)C,&tag1);
813: PetscObjectGetNewTag((PetscObject)C,&tag2);
814: PetscObjectGetNewTag((PetscObject)C,&tag3);
816:   /* Check if the row and col indices are sorted */
817: for (i=0; i<ismax; i++) {
818: ISSorted(isrow[i],&sorted);
819: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
820: ISSorted(iscol[i],&sorted);
821: /* if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
822: }
824: len = (2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt));
825: PetscMalloc(len,&irow);
826: icol = irow + ismax;
827: nrow = (PetscInt*)(icol + ismax);
828: ncol = nrow + ismax;
830: for (i=0; i<ismax; i++) {
831: ISGetIndices(isrow[i],&irow[i]);
832: ISGetIndices(iscol[i],&icol[i]);
833: ISGetLocalSize(isrow[i],&nrow[i]);
834: ISGetLocalSize(iscol[i],&ncol[i]);
835: }
837: /* evaluate communication - mesg to who, length of mesg, and buffer space
838: required. Based on this, buffers are allocated, and data copied into them*/
839: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1); /* mesg size */
840: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
841: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
842: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
843: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialize work vector*/
844: for (i=0; i<ismax; i++) {
845: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
846: jmax = nrow[i];
847: irow_i = irow[i];
848: for (l=0,j=0; j<jmax; j++) {
849: row = irow_i[j];
850: while (row >= c->rowners[l+1]) l++;
851: proc = l;
852: w4[proc]++;
853: }
854: for (j=0; j<size; j++) {
855: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
856: }
857: }
858:
859: nrqs = 0; /* no of outgoing messages */
860: msz = 0; /* total mesg length (for all procs) */
861: w1[rank] = 0; /* no mesg sent to self */
862: w3[rank] = 0;
863: for (i=0; i<size; i++) {
864: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
865: }
866: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
867: for (i=0,j=0; i<size; i++) {
868: if (w1[i]) { pa[j] = i; j++; }
869: }
871: /* Each message would have a header = 1 + 2*(no of IS) + data */
872: for (i=0; i<nrqs; i++) {
873: j = pa[i];
874: w1[j] += w2[j] + 2* w3[j];
875: msz += w1[j];
876: }
878:   /* Determine the number of messages to expect, their lengths, and from-ids */
879: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
880: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
882: /* Now post the Irecvs corresponding to these messages */
883: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
884:
885: PetscFree(onodes1);
886: PetscFree(olengths1);
887:
888: /* Allocate Memory for outgoing messages */
889: len = 2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt);
890: PetscMalloc(len,&sbuf1);
891: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
892: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
893: /* allocate memory for outgoing data + buf to receive the first reply */
894: tmp = (PetscInt*)(ptr + size);
895: ctr = tmp + 2*msz;
897: {
898: PetscInt *iptr = tmp,ict = 0;
899: for (i=0; i<nrqs; i++) {
900: j = pa[i];
901: iptr += ict;
902: sbuf1[j] = iptr;
903: ict = w1[j];
904: }
905: }
907: /* Form the outgoing messages */
908: /* Initialize the header space */
909: for (i=0; i<nrqs; i++) {
910: j = pa[i];
911: sbuf1[j][0] = 0;
912: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
913: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
914: }
915:
916: /* Parse the isrow and copy data into outbuf */
917: for (i=0; i<ismax; i++) {
918: PetscMemzero(ctr,size*sizeof(PetscInt));
919: irow_i = irow[i];
920: jmax = nrow[i];
921: for (l=0,j=0; j<jmax; j++) { /* parse the indices of each IS */
922: row = irow_i[j];
923: while (row >= c->rowners[l+1]) l++;
924: proc = l;
925: if (proc != rank) { /* copy to the outgoing buf*/
926: ctr[proc]++;
927: *ptr[proc] = row;
928: ptr[proc]++;
929: }
930: }
931: /* Update the headers for the current IS */
932: for (j=0; j<size; j++) { /* Can Optimise this loop too */
933: if ((ctr_j = ctr[j])) {
934: sbuf1_j = sbuf1[j];
935: k = ++sbuf1_j[0];
936: sbuf1_j[2*k] = ctr_j;
937: sbuf1_j[2*k-1] = i;
938: }
939: }
940: }
942: /* Now post the sends */
943: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
944: for (i=0; i<nrqs; ++i) {
945: j = pa[i];
946: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
947: }
949: /* Post Receives to capture the buffer size */
950: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
951: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
952: rbuf2[0] = tmp + msz;
953: for (i=1; i<nrqs; ++i) {
954: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
955: }
956: for (i=0; i<nrqs; ++i) {
957: j = pa[i];
958: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
959: }
961: /* Send to other procs the buf size they should allocate */
962:
964: /* Receive messages*/
965: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
966: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
967: len = 2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*);
968: PetscMalloc(len,&sbuf2);
969: req_size = (PetscInt*)(sbuf2 + nrqr);
970: req_source = req_size + nrqr;
971:
972: {
973: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
974: PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
975: PetscInt *sbuf2_i;
977: for (i=0; i<nrqr; ++i) {
978: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
979: req_size[idex] = 0;
980: rbuf1_i = rbuf1[idex];
981: start = 2*rbuf1_i[0] + 1;
982: MPI_Get_count(r_status1+i,MPIU_INT,&end);
983: PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);
984: sbuf2_i = sbuf2[idex];
985: for (j=start; j<end; j++) {
986: id = rbuf1_i[j] - rstart;
987: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
988: sbuf2_i[j] = ncols;
989: req_size[idex] += ncols;
990: }
991: req_source[idex] = r_status1[i].MPI_SOURCE;
992: /* form the header */
993: sbuf2_i[0] = req_size[idex];
994: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
995: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
996: }
997: }
998: PetscFree(r_status1);
999: PetscFree(r_waits1);
1001: /* recv buffer sizes */
1002: /* Receive messages*/
1003:
1004: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
1005: PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
1006: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1007: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1008: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1010: for (i=0; i<nrqs; ++i) {
1011: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1012: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);
1013: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);
1014: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1015: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1016: }
1017: PetscFree(r_status2);
1018: PetscFree(r_waits2);
1019:
1020: /* Wait on sends1 and sends2 */
1021: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1022: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1024: MPI_Waitall(nrqs,s_waits1,s_status1);
1025: MPI_Waitall(nrqr,s_waits2,s_status2);
1026: PetscFree(s_status1);
1027: PetscFree(s_status2);
1028: PetscFree(s_waits1);
1029: PetscFree(s_waits2);
1031: /* Now allocate buffers for a->j, and send them off */
1032: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
1033: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1034: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
1035: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1036:
1037: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1038: {
1039: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1040: PetscInt *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1041: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1043: for (i=0; i<nrqr; i++) {
1044: rbuf1_i = rbuf1[i];
1045: sbuf_aj_i = sbuf_aj[i];
1046: ct1 = 2*rbuf1_i[0] + 1;
1047: ct2 = 0;
1048: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1049: kmax = rbuf1[i][2*j];
1050: for (k=0; k<kmax; k++,ct1++) {
1051: row = rbuf1_i[ct1] - rstart;
1052: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1053: ncols = nzA + nzB;
1054: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1056: /* load the column indices for this row into cols*/
1057: cols = sbuf_aj_i + ct2;
1058:
1059: for (l=0; l<nzB; l++) {
1060: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
1061: else break;
1062: }
1063: imark = l;
1064: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
1065: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1067: ct2 += ncols;
1068: }
1069: }
1070: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1071: }
1072: }
1073: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1074: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1076: /* Allocate buffers for a->a, and send them off */
1077: PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1078: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1079: PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1080: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1081:
1082: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1083: {
1084: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1085: PetscInt cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1086: PetscInt *b_j = b->j;
1087: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1088:
1089: for (i=0; i<nrqr; i++) {
1090: rbuf1_i = rbuf1[i];
1091: sbuf_aa_i = sbuf_aa[i];
1092: ct1 = 2*rbuf1_i[0]+1;
1093: ct2 = 0;
1094: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1095: kmax = rbuf1_i[2*j];
1096: for (k=0; k<kmax; k++,ct1++) {
1097: row = rbuf1_i[ct1] - rstart;
1098: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1099: ncols = nzA + nzB;
1100: cworkB = b_j + b_i[row];
1101: vworkA = a_a + a_i[row];
1102: vworkB = b_a + b_i[row];
1104: /* load the column values for this row into vals*/
1105: vals = sbuf_aa_i+ct2;
1106:
1107: for (l=0; l<nzB; l++) {
1108: if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l];
1109: else break;
1110: }
1111: imark = l;
1112: for (l=0; l<nzA; l++) vals[imark+l] = vworkA[l];
1113: for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1114:
1115: ct2 += ncols;
1116: }
1117: }
1118: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1119: }
1120: }
1121: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1122: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1123: PetscFree(rbuf1);
1125: /* Form the matrix */
1126: /* create col map */
1127: {
1128: PetscInt *icol_i;
1129:
1130: len = (1+ismax)*sizeof(PetscInt*)+ (1+ismax*C->N)*sizeof(PetscInt);
1131: PetscMalloc(len,&cmap);
1132: cmap[0] = (PetscInt*)(cmap + ismax);
1133: PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(PetscInt));
1134: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1135: for (i=0; i<ismax; i++) {
1136: jmax = ncol[i];
1137: icol_i = icol[i];
1138: cmap_i = cmap[i];
1139: for (j=0; j<jmax; j++) {
1140: cmap_i[icol_i[j]] = j+1;
1141: }
1142: }
1143: }
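  /*
     Note: the col map uses a 1-based sentinel: cmap[i][col] is j+1 when global
     column col is the j-th column of submatrix i, and 0 when the column is not
     present. Later loops therefore test (tcol = cmap_i[...]) for non-zero and
     store tcol-1 as the local column index.
  */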
1145: /* Create lens which is required for MatCreate... */
1146: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1147: len = (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt);
1148: PetscMalloc(len,&lens);
1149: lens[0] = (PetscInt*)(lens + ismax);
1150: PetscMemzero(lens[0],j*sizeof(PetscInt));
1151: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1152:
1153: /* Update lens from local data */
1154: for (i=0; i<ismax; i++) {
1155: jmax = nrow[i];
1156: cmap_i = cmap[i];
1157: irow_i = irow[i];
1158: lens_i = lens[i];
1159: for (l=0,j=0; j<jmax; j++) {
1160: row = irow_i[j];
1161: while (row >= c->rowners[l+1]) l++;
1162: proc = l;
1163: if (proc == rank) {
1164: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1165: for (k=0; k<ncols; k++) {
1166: if (cmap_i[cols[k]]) { lens_i[j]++;}
1167: }
1168: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1169: }
1170: }
1171: }
1172:
1173: /* Create row map*/
1174: len = (1+ismax)*sizeof(PetscInt*)+ ismax*C->M*sizeof(PetscInt);
1175: PetscMalloc(len,&rmap);
1176: rmap[0] = (PetscInt*)(rmap + ismax);
1177: PetscMemzero(rmap[0],ismax*C->M*sizeof(PetscInt));
1178: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1179: for (i=0; i<ismax; i++) {
1180: rmap_i = rmap[i];
1181: irow_i = irow[i];
1182: jmax = nrow[i];
1183: for (j=0; j<jmax; j++) {
1184: rmap_i[irow_i[j]] = j;
1185: }
1186: }
1187:
1188: /* Update lens from offproc data */
1189: {
1190: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1192: for (tmp2=0; tmp2<nrqs; tmp2++) {
1193: MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1194: idex = pa[idex2];
1195: sbuf1_i = sbuf1[idex];
1196: jmax = sbuf1_i[0];
1197: ct1 = 2*jmax+1;
1198: ct2 = 0;
1199: rbuf2_i = rbuf2[idex2];
1200: rbuf3_i = rbuf3[idex2];
1201: for (j=1; j<=jmax; j++) {
1202: is_no = sbuf1_i[2*j-1];
1203: max1 = sbuf1_i[2*j];
1204: lens_i = lens[is_no];
1205: cmap_i = cmap[is_no];
1206: rmap_i = rmap[is_no];
1207: for (k=0; k<max1; k++,ct1++) {
1208:         row  = rmap_i[sbuf1_i[ct1]]; /* the row index in the new submatrix */
1209: max2 = rbuf2_i[ct1];
1210: for (l=0; l<max2; l++,ct2++) {
1211: if (cmap_i[rbuf3_i[ct2]]) {
1212: lens_i[row]++;
1213: }
1214: }
1215: }
1216: }
1217: }
1218: }
1219: PetscFree(r_status3);
1220: PetscFree(r_waits3);
1221: MPI_Waitall(nrqr,s_waits3,s_status3);
1222: PetscFree(s_status3);
1223: PetscFree(s_waits3);
1225: /* Create the submatrices */
1226: if (scall == MAT_REUSE_MATRIX) {
1227: PetscTruth flag;
1229: /*
1230:         Assumes new rows are the same length as the old rows, hence a bug!
1231: */
1232: for (i=0; i<ismax; i++) {
1233: mat = (Mat_SeqAIJ *)(submats[i]->data);
1234: if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1235: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1236: }
1237: PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(PetscInt),&flag);
1238: if (flag == PETSC_FALSE) {
1239: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1240: }
1241:       /* Initialize matrix as if empty */
1242: PetscMemzero(mat->ilen,submats[i]->m*sizeof(PetscInt));
1243: submats[i]->factor = C->factor;
1244: }
1245: } else {
1246: for (i=0; i<ismax; i++) {
1247: MatCreate(PETSC_COMM_SELF,nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE,submats+i);
1248: MatSetType(submats[i],A->type_name);
1249: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1250: }
1251: }
1253: /* Assemble the matrices */
1254: /* First assemble the local rows */
1255: {
1256: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1257: PetscScalar *imat_a;
1258:
1259: for (i=0; i<ismax; i++) {
1260: mat = (Mat_SeqAIJ*)submats[i]->data;
1261: imat_ilen = mat->ilen;
1262: imat_j = mat->j;
1263: imat_i = mat->i;
1264: imat_a = mat->a;
1265: cmap_i = cmap[i];
1266: rmap_i = rmap[i];
1267: irow_i = irow[i];
1268: jmax = nrow[i];
1269: for (l=0,j=0; j<jmax; j++) {
1270: row = irow_i[j];
1271: while (row >= c->rowners[l+1]) l++;
1272: proc = l;
1273: if (proc == rank) {
1274: old_row = row;
1275: row = rmap_i[row];
1276: ilen_row = imat_ilen[row];
1277: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1278: mat_i = imat_i[row] ;
1279: mat_a = imat_a + mat_i;
1280: mat_j = imat_j + mat_i;
1281: for (k=0; k<ncols; k++) {
1282: if ((tcol = cmap_i[cols[k]])) {
1283: *mat_j++ = tcol - 1;
1284: *mat_a++ = vals[k];
1285: ilen_row++;
1286: }
1287: }
1288: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1289: imat_ilen[row] = ilen_row;
1290: }
1291: }
1292: }
1293: }
1295: /* Now assemble the off proc rows*/
1296: {
1297: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1298: PetscInt *imat_j,*imat_i;
1299: PetscScalar *imat_a,*rbuf4_i;
1301: for (tmp2=0; tmp2<nrqs; tmp2++) {
1302: MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1303: idex = pa[idex2];
1304: sbuf1_i = sbuf1[idex];
1305: jmax = sbuf1_i[0];
1306: ct1 = 2*jmax + 1;
1307: ct2 = 0;
1308: rbuf2_i = rbuf2[idex2];
1309: rbuf3_i = rbuf3[idex2];
1310: rbuf4_i = rbuf4[idex2];
1311: for (j=1; j<=jmax; j++) {
1312: is_no = sbuf1_i[2*j-1];
1313: rmap_i = rmap[is_no];
1314: cmap_i = cmap[is_no];
1315: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1316: imat_ilen = mat->ilen;
1317: imat_j = mat->j;
1318: imat_i = mat->i;
1319: imat_a = mat->a;
1320: max1 = sbuf1_i[2*j];
1321: for (k=0; k<max1; k++,ct1++) {
1322: row = sbuf1_i[ct1];
1323: row = rmap_i[row];
1324: ilen = imat_ilen[row];
1325: mat_i = imat_i[row] ;
1326: mat_a = imat_a + mat_i;
1327: mat_j = imat_j + mat_i;
1328: max2 = rbuf2_i[ct1];
1329: for (l=0; l<max2; l++,ct2++) {
1330: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1331: *mat_j++ = tcol - 1;
1332: *mat_a++ = rbuf4_i[ct2];
1333: ilen++;
1334: }
1335: }
1336: imat_ilen[row] = ilen;
1337: }
1338: }
1339: }
1340: }
1341: PetscFree(r_status4);
1342: PetscFree(r_waits4);
1343: MPI_Waitall(nrqr,s_waits4,s_status4);
1344: PetscFree(s_waits4);
1345: PetscFree(s_status4);
1347: /* Restore the indices */
1348: for (i=0; i<ismax; i++) {
1349: ISRestoreIndices(isrow[i],irow+i);
1350: ISRestoreIndices(iscol[i],icol+i);
1351: }
1353: /* Destroy allocated memory */
1354: PetscFree(irow);
1355: PetscFree(w1);
1356: PetscFree(pa);
1358: PetscFree(sbuf1);
1359: PetscFree(rbuf2);
1360: for (i=0; i<nrqr; ++i) {
1361: PetscFree(sbuf2[i]);
1362: }
1363: for (i=0; i<nrqs; ++i) {
1364: PetscFree(rbuf3[i]);
1365: PetscFree(rbuf4[i]);
1366: }
1368: PetscFree(sbuf2);
1369: PetscFree(rbuf3);
1370: PetscFree(rbuf4);
1371: PetscFree(sbuf_aj[0]);
1372: PetscFree(sbuf_aj);
1373: PetscFree(sbuf_aa[0]);
1374: PetscFree(sbuf_aa);
1375:
1376: PetscFree(cmap);
1377: PetscFree(rmap);
1378: PetscFree(lens);
1380: for (i=0; i<ismax; i++) {
1381: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1382: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1383: }
1384: return(0);
1385: }