Actual source code: mpiov.c
1: /*$Id: mpiov.c,v 1.26.1.76.2.22 2001/09/07 20:09:38 bsmith Exp $*/
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to find submatrices that are shared across processors.
5: */
6: #include "src/mat/impls/aij/mpi/mpiaij.h"
7: #include "petscbt.h"
9: static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
10: static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int*,int**);
11: static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int**,int*);
12: EXTERN int MatGetRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
13: EXTERN int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
15: #undef __FUNCT__
16: #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17: int MatIncreaseOverlap_MPIAIJ(Mat C,int imax,IS *is,int ov)
18: {
19: int i,ierr;
22: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23: for (i=0; i<ov; ++i) {
24: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
25: }
26: return(0);
27: }
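A minimal caller-side sketch (not part of mpiov.c): it assumes an already assembled MPIAIJ matrix A, uses hard-wired local row indices purely for illustration, and abbreviates error handling. MatIncreaseOverlap() reaches the routine above through the matrix operations table and applies the one-level pass ov times.

static int SampleIncreaseOverlap(Mat A)
{
  IS  is[1];
  int ierr,rows[] = {0,1,2,3,4};                        /* illustrative local rows */

  ierr = ISCreateGeneral(PETSC_COMM_SELF,5,rows,&is[0]);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap(A,1,is,2);CHKERRQ(ierr);    /* two one-level overlap passes */
  ierr = ISDestroy(is[0]);CHKERRQ(ierr);
  return 0;
}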
29: /*
30: Sample message format:
31: If a processor A wants processor B to process some elements corresponding
32: to index sets is[1],is[5]
33: mesg [0] = 2 (no of index sets in the mesg)
34: -----------
35: mesg [1] = 1 => is[1]
36: mesg [2] = sizeof(is[1]);
37: -----------
38: mesg [3] = 5 => is[5]
39: mesg [4] = sizeof(is[5]);
40: -----------
41: mesg [5]
42: mesg [n]  data(is[1])
43: -----------
44: mesg[n+1]
45: mesg[m] data(is[5])
46: -----------
47:
48: Notes:
49: nrqs - no of requests sent (or to be sent out)
50: nrqr - no of requests received (which have to be or which have been processed)
51: */
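As a reading aid for the layout described above, here is a small sketch (not part of mpiov.c) that walks one such message: the header occupies entries 0 through 2*nis, and the concatenated row lists follow immediately after it. The helper name is hypothetical.

static int CountMessageRows(const int *mesg)
{
  int nis = mesg[0],total = 0,i;       /* mesg[0]: number of index sets in the message */

  for (i=1; i<=nis; i++) {
    total += mesg[2*i];                /* mesg[2*i-1]: IS number, mesg[2*i]: its row count */
  }
  return total;                        /* the row data itself starts at mesg[2*nis+1] */
}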
52: #undef __FUNCT__
53: #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
54: static int MatIncreaseOverlap_MPIAIJ_Once(Mat C,int imax,IS *is)
55: {
56: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
57: int **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
58: int size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
59: int *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
60: int *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
61: PetscBT *table;
62: MPI_Comm comm;
63: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
64: MPI_Status *s_status,*recv_status;
67: comm = C->comm;
68: size = c->size;
69: rank = c->rank;
70: m = C->M;
72: PetscObjectGetNewTag((PetscObject)C,&tag1);
73: PetscObjectGetNewTag((PetscObject)C,&tag2);
74:
75: len = (imax+1)*sizeof(int*)+ (imax + m)*sizeof(int);
76: ierr = PetscMalloc(len,&idx);
77: n = (int*)(idx + imax);
78: rtable = n + imax;
79:
80: for (i=0; i<imax; i++) {
81: ISGetIndices(is[i],&idx[i]);
82: ISGetLocalSize(is[i],&n[i]);
83: }
84:
85: /* Create hash table for the mapping :row -> proc*/
86: for (i=0,j=0; i<size; i++) {
87: len = c->rowners[i+1];
88: for (; j<len; j++) {
89: rtable[j] = i;
90: }
91: }
93: /* evaluate communication - mesg to who, length of mesg, and buffer space
94: required. Based on this, buffers are allocated, and data copied into them*/
95: PetscMalloc(size*4*sizeof(int),&w1);/* mesg size */
96: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
97: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
98: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
99: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
100: for (i=0; i<imax; i++) {
101: ierr = PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
102: idx_i = idx[i];
103: len = n[i];
104: for (j=0; j<len; j++) {
105: row = idx_i[j];
106: if (row < 0) {
107: SETERRQ(1,"Index set cannot have negative entries");
108: }
109: proc = rtable[row];
110: w4[proc]++;
111: }
112: for (j=0; j<size; j++){
113: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
114: }
115: }
117: nrqs = 0; /* no of outgoing messages */
118: msz = 0; /* total mesg length (for all procs) */
119: w1[rank] = 0; /* no mesg sent to itself */
120: w3[rank] = 0;
121: for (i=0; i<size; i++) {
122: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
123: }
124: /* pa - is list of processors to communicate with */
125: PetscMalloc((nrqs+1)*sizeof(int),&pa);
126: for (i=0,j=0; i<size; i++) {
127: if (w1[i]) {pa[j] = i; j++;}
128: }
130: /* Each message would have a header = 1 + 2*(no of IS) + data */
131: for (i=0; i<nrqs; i++) {
132: j = pa[i];
133: w1[j] += w2[j] + 2*w3[j];
134: msz += w1[j];
135: }
137: /* Determine the number of messages to expect, their lengths, and their from-ids */
138: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
139: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
141: /* Now post the Irecvs corresponding to these messages */
142: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
144: /* Allocate Memory for outgoing messages */
145: len = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
146: PetscMalloc(len,&outdat);
147: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
148: PetscMemzero(outdat,2*size*sizeof(int*));
149: tmp = (int*)(outdat + 2*size);
150: ctr = tmp + msz;
152: {
153: int *iptr = tmp,ict = 0;
154: for (i=0; i<nrqs; i++) {
155: j = pa[i];
156: iptr += ict;
157: outdat[j] = iptr;
158: ict = w1[j];
159: }
160: }
162: /* Form the outgoing messages */
163: /*plug in the headers*/
164: for (i=0; i<nrqs; i++) {
165: j = pa[i];
166: outdat[j][0] = 0;
167: ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
168: ptr[j] = outdat[j] + 2*w3[j] + 1;
169: }
170:
171: /* Memory for doing local proc's work*/
172: {
173: int *d_p;
174: char *t_p;
176: len = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
177: (m)*imax*sizeof(int) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
178: ierr = PetscMalloc(len,&table);
179: ierr = PetscMemzero(table,len);
180: data = (int **)(table + imax);
181: isz = (int *)(data + imax);
182: d_p = (int *)(isz + imax);
183: t_p = (char *)(d_p + m*imax);
184: for (i=0; i<imax; i++) {
185: table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
186: data[i] = d_p + (m)*i;
187: }
188: }
190: /* Parse the IS and update local tables and the outgoing buf with the data*/
191: {
192: int n_i,*data_i,isz_i,*outdat_j,ctr_j;
193: PetscBT table_i;
195: for (i=0; i<imax; i++) {
196: ierr = PetscMemzero(ctr,size*sizeof(int));
197: n_i = n[i];
198: table_i = table[i];
199: idx_i = idx[i];
200: data_i = data[i];
201: isz_i = isz[i];
202: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
203: row = idx_i[j];
204: proc = rtable[row];
205: if (proc != rank) { /* copy to the outgoing buffer */
206: ctr[proc]++;
207: *ptr[proc] = row;
208: ptr[proc]++;
209: } else { /* Update the local table */
210: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
211: }
212: }
213: /* Update the headers for the current IS */
214: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
215: if ((ctr_j = ctr[j])) {
216: outdat_j = outdat[j];
217: k = ++outdat_j[0];
218: outdat_j[2*k] = ctr_j;
219: outdat_j[2*k-1] = i;
220: }
221: }
222: isz[i] = isz_i;
223: }
224: }
225:
228: /* Now post the sends */
229: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
230: for (i=0; i<nrqs; ++i) {
231: j = pa[i];
232: MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);
233: }
234:
235: /* No longer need the original indices*/
236: for (i=0; i<imax; ++i) {
237: ISRestoreIndices(is[i],idx+i);
238: }
239: PetscFree(idx);
241: for (i=0; i<imax; ++i) {
242: ISDestroy(is[i]);
243: }
244:
245: /* Do Local work*/
246: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
248: /* Receive messages*/
249: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
250: MPI_Waitall(nrqr,r_waits1,recv_status);
251:
252: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
253: MPI_Waitall(nrqs,s_waits1,s_status);
255: /* Phase 1 sends are complete - deallocate buffers */
256: PetscFree(outdat);
257: PetscFree(w1);
259: PetscMalloc((nrqr+1)*sizeof(int *),&xdata);
260: PetscMalloc((nrqr+1)*sizeof(int),&isz1);
261: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
262: PetscFree(rbuf);
264:
265: /* Send the data back*/
266: /* Do a global reduction to know the buffer space req for incoming messages*/
267: {
268: int *rw1;
269:
270: PetscMalloc(size*sizeof(int),&rw1);
271: PetscMemzero(rw1,size*sizeof(int));
273: for (i=0; i<nrqr; ++i) {
274: proc = recv_status[i].MPI_SOURCE;
275: if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
276: rw1[proc] = isz1[i];
277: }
278: PetscFree(onodes1);
279: PetscFree(olengths1);
281: /* Determine the number of messages to expect, their lengths, and their from-ids */
282: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
283: PetscFree(rw1);
284: }
285: /* Now post the Irecvs corresponding to these messages */
286: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
288: /* Now post the sends */
289: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
290: for (i=0; i<nrqr; ++i) {
291: j = recv_status[i].MPI_SOURCE;
292: MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);
293: }
295: /* receive work done on other processors*/
296: {
297: int idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
298: PetscBT table_i;
299: MPI_Status *status2;
300:
301: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
302: for (i=0; i<nrqs; ++i) {
303: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
304: /* Process the message*/
305: rbuf2_i = rbuf2[idex];
306: ct1 = 2*rbuf2_i[0]+1;
307: jmax = rbuf2[idex][0];
308: for (j=1; j<=jmax; j++) {
309: max = rbuf2_i[2*j];
310: is_no = rbuf2_i[2*j-1];
311: isz_i = isz[is_no];
312: data_i = data[is_no];
313: table_i = table[is_no];
314: for (k=0; k<max; k++,ct1++) {
315: row = rbuf2_i[ct1];
316: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
317: }
318: isz[is_no] = isz_i;
319: }
320: }
322: MPI_Waitall(nrqr,s_waits2,status2);
323: PetscFree(status2);
324: }
325:
326: for (i=0; i<imax; ++i) {
327: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
328: }
329:
330: PetscFree(onodes2);
331: PetscFree(olengths2);
333: PetscFree(pa);
334: PetscFree(rbuf2);
335: PetscFree(s_waits1);
336: PetscFree(r_waits1);
337: PetscFree(s_waits2);
338: PetscFree(r_waits2);
339: PetscFree(table);
340: PetscFree(s_status);
341: PetscFree(recv_status);
342: PetscFree(xdata[0]);
343: PetscFree(xdata);
344: PetscFree(isz1);
345: return(0);
346: }
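The communication skeleton of the routine above, reduced to a hedged sketch: post every nonblocking receive before any send, then wait on all requests. In the real code the receive count, sources, and lengths come from PetscGatherNumberOfMessages() and PetscGatherMessageLengths(), and the receives are posted by PetscPostIrecvInt(); here all counts, ranks, lengths, and buffers are assumed to be set up by the caller, and the helper name is illustrative.

static int SparseExchangeInt(MPI_Comm comm,int tag,
                             int nrecv,int *rsrc,int *rlen,int **rbuf,
                             int nsend,int *sdst,int *slen,int **sbuf)
{
  MPI_Request *reqs;
  MPI_Status  *stats;
  int         i,ierr;

  ierr = PetscMalloc((nrecv+nsend+1)*sizeof(MPI_Request),&reqs);CHKERRQ(ierr);
  ierr = PetscMalloc((nrecv+nsend+1)*sizeof(MPI_Status),&stats);CHKERRQ(ierr);
  for (i=0; i<nrecv; i++) {            /* receives first, so matching sends can complete */
    MPI_Irecv(rbuf[i],rlen[i],MPI_INT,rsrc[i],tag,comm,reqs+i);
  }
  for (i=0; i<nsend; i++) {
    MPI_Isend(sbuf[i],slen[i],MPI_INT,sdst[i],tag,comm,reqs+nrecv+i);
  }
  MPI_Waitall(nrecv+nsend,reqs,stats);
  ierr = PetscFree(stats);CHKERRQ(ierr);
  ierr = PetscFree(reqs);CHKERRQ(ierr);
  return 0;
}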
348: #undef __FUNCT__
349: #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
350: /*
351: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
352: the work on the local processor.
354: Inputs:
355: C - MAT_MPIAIJ;
356: imax - total no of index sets processed at a time;
357: table - an array of char - size = m bits.
358:
359: Output:
360: isz - array containing the count of the solution elements corresponding
361: to each index set;
362: data - pointer to the solutions
363: */
364: static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
365: {
366: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
367: Mat A = c->A,B = c->B;
368: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
369: int start,end,val,max,rstart,cstart,ashift,bshift,*ai,*aj;
370: int *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
371: PetscBT table_i;
374: rstart = c->rstart;
375: cstart = c->cstart;
376: ashift = a->indexshift;
377: ai = a->i;
378: aj = a->j +ashift;
379: bshift = b->indexshift;
380: bi = b->i;
381: bj = b->j +bshift;
382: garray = c->garray;
384:
385: for (i=0; i<imax; i++) {
386: data_i = data[i];
387: table_i = table[i];
388: isz_i = isz[i];
389: for (j=0,max=isz[i]; j<max; j++) {
390: row = data_i[j] - rstart;
391: start = ai[row];
392: end = ai[row+1];
393: for (k=start; k<end; k++) { /* Amat */
394: val = aj[k] + ashift + cstart;
395: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
396: }
397: start = bi[row];
398: end = bi[row+1];
399: for (k=start; k<end; k++) { /* Bmat */
400: val = garray[bj[k]+bshift];
401: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
402: }
403: }
404: isz[i] = isz_i;
405: }
406: return(0);
407: }
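The duplicate-filtering idiom used both in the routine above and in the Receive routine below, isolated as a sketch: PetscBTLookupSet() returns the previous value of the bit, so each index in [0,m) is appended to the output at most once. The helper and its arguments are hypothetical.

static int AppendUnique(int m,int n,const int *in,int *out,int *nout)
{
  PetscBT bt;
  int     i,cnt = 0,ierr;

  ierr = PetscBTCreate(m,bt);CHKERRQ(ierr);     /* one bit per possible index value */
  for (i=0; i<n; i++) {
    if (!PetscBTLookupSet(bt,in[i])) out[cnt++] = in[i];
  }
  ierr = PetscBTDestroy(bt);CHKERRQ(ierr);
  *nout = cnt;
  return 0;
}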
409: #undef __FUNCT__
410: #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
411: /*
412: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
413: and return the output
415: Input:
416: C - the matrix
417: nrqr - no of messages being processed.
418: rbuf - an array of pointers to the received requests
419:
420: Output:
421: xdata - array of messages to be sent back
422: isz1 - size of each message
424: For better efficiency perhaps we should malloc separately each xdata[i],
425: then if a realloc is required we need only copy the data for that one row
426: rather than all previous rows as it is now, where a single large chunk of
427: memory is used.
429: */
430: static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
431: {
432: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
433: Mat A = c->A,B = c->B;
434: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
435: int rstart,cstart,ashift,bshift,*ai,*aj,*bi,*bj,*garray,i,j,k;
436: int row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
437: int val,max1,max2,rank,m,no_malloc =0,*tmp,new_estimate,ctr;
438: int *rbuf_i,kmax,rbuf_0,ierr;
439: PetscBT xtable;
442: rank = c->rank;
443: m = C->M;
444: rstart = c->rstart;
445: cstart = c->cstart;
446: ashift = a->indexshift;
447: ai = a->i;
448: aj = a->j +ashift;
449: bshift = b->indexshift;
450: bi = b->i;
451: bj = b->j +bshift;
452: garray = c->garray;
453:
454:
455: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
456: rbuf_i = rbuf[i];
457: rbuf_0 = rbuf_i[0];
458: ct += rbuf_0;
459: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
460: }
461:
462: if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
463: else max1 = 1;
464: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
465: ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);
466: ++no_malloc;
467: ierr = PetscBTCreate(m,xtable);
468: ierr = PetscMemzero(isz1,nrqr*sizeof(int));
469:
470: ct3 = 0;
471: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
472: rbuf_i = rbuf[i];
473: rbuf_0 = rbuf_i[0];
474: ct1 = 2*rbuf_0+1;
475: ct2 = ct1;
476: ct3 += ct1;
477: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
478: PetscBTMemzero(m,xtable);
479: oct2 = ct2;
480: kmax = rbuf_i[2*j];
481: for (k=0; k<kmax; k++,ct1++) {
482: row = rbuf_i[ct1];
483: if (!PetscBTLookupSet(xtable,row)) {
484: if (!(ct3 < mem_estimate)) {
485: new_estimate = (int)(1.5*mem_estimate)+1;
486: ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);
487: ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
488: ierr = PetscFree(xdata[0]);
489: xdata[0] = tmp;
490: mem_estimate = new_estimate; ++no_malloc;
491: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
492: }
493: xdata[i][ct2++] = row;
494: ct3++;
495: }
496: }
497: for (k=oct2,max2=ct2; k<max2; k++) {
498: row = xdata[i][k] - rstart;
499: start = ai[row];
500: end = ai[row+1];
501: for (l=start; l<end; l++) {
502: val = aj[l] + ashift + cstart;
503: if (!PetscBTLookupSet(xtable,val)) {
504: if (!(ct3 < mem_estimate)) {
505: new_estimate = (int)(1.5*mem_estimate)+1;
506: ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);
507: ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
508: ierr = PetscFree(xdata[0]);
509: xdata[0] = tmp;
510: mem_estimate = new_estimate; ++no_malloc;
511: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
512: }
513: xdata[i][ct2++] = val;
514: ct3++;
515: }
516: }
517: start = bi[row];
518: end = bi[row+1];
519: for (l=start; l<end; l++) {
520: val = garray[bj[l]+bshift];
521: if (!PetscBTLookupSet(xtable,val)) {
522: if (!(ct3 < mem_estimate)) {
523: new_estimate = (int)(1.5*mem_estimate)+1;
524: ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);
525: ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
526: PetscFree(xdata[0]);
527: xdata[0] = tmp;
528: mem_estimate = new_estimate; ++no_malloc;
529: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
530: }
531: xdata[i][ct2++] = val;
532: ct3++;
533: }
534: }
535: }
536: /* Update the header*/
537: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
538: xdata[i][2*j-1] = rbuf_i[2*j-1];
539: }
540: xdata[i][0] = rbuf_0;
541: xdata[i+1] = xdata[i] + ct2;
542: isz1[i] = ct2; /* size of each message */
543: }
544: PetscBTDestroy(xtable);
545: PetscLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
546: return(0);
547: }
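The reallocation policy that appears three times inside the routine above, written once as a sketch: when the running count would exceed the current capacity, grow the single shared buffer by roughly 1.5x, copy the old contents, and rebase the per-message pointers xdata[1..upto]. Apart from xdata and isz1, the names are illustrative.

static int GrowSharedBuffer(int **xdata,int *isz1,int upto,int *capacity)
{
  int *bigger,new_capacity = (int)(1.5*(*capacity)) + 1,ctr,ierr;

  ierr = PetscMalloc(new_capacity*sizeof(int),&bigger);CHKERRQ(ierr);
  ierr = PetscMemcpy(bigger,xdata[0],(*capacity)*sizeof(int));CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  xdata[0]  = bigger;
  *capacity = new_capacity;
  for (ctr=1; ctr<=upto; ctr++) {               /* rebase the partially filled messages */
    xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
  }
  return 0;
}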
548: /* -------------------------------------------------------------------------*/
549: EXTERN int MatGetSubMatrices_MPIAIJ_Local(Mat,int,IS*,IS*,MatReuse,Mat*);
550: EXTERN int MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
551: /*
552: Every processor gets the entire matrix
553: */
554: #undef __FUNCT__
555: #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
556: int MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat **Bin)
557: {
558: Mat B;
559: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
560: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
561: int ierr,sendcount,*recvcounts = 0,*displs = 0,size,i,*rstarts = a->rowners,rank,n,cnt,j;
562: int m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
563: PetscScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
566: MPI_Comm_size(A->comm,&size);
567: MPI_Comm_rank(A->comm,&rank);
569: if (scall == MAT_INITIAL_MATRIX) {
570: /* ----------------------------------------------------------------
571: Tell every processor the number of nonzeros per row
572: */
573: PetscMalloc(A->M*sizeof(int),&lens);
574: for (i=a->rstart; i<a->rend; i++) {
575: lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
576: }
577: sendcount = a->rend - a->rstart;
578: PetscMalloc(2*size*sizeof(int),&recvcounts);
579: displs = recvcounts + size;
580: for (i=0; i<size; i++) {
581: recvcounts[i] = a->rowners[i+1] - a->rowners[i];
582: displs[i] = a->rowners[i];
583: }
584: ierr = MPI_Allgatherv(lens+a->rstart,sendcount,MPI_INT,lens,recvcounts,displs,MPI_INT,A->comm);
586: /* ---------------------------------------------------------------
587: Create the sequential matrix
588: */
589: ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,A->M,A->N,0,lens,&B);
590: ierr = PetscMalloc(sizeof(Mat),Bin);
591: **Bin = B;
592: b = (Mat_SeqAIJ *)B->data;
594: /*--------------------------------------------------------------------
595: Copy my part of matrix column indices over
596: */
597: sendcount = ad->nz + bd->nz;
598: jsendbuf = b->j + b->i[rstarts[rank]];
599: a_jsendbuf = ad->j;
600: b_jsendbuf = bd->j;
601: n = a->rend - a->rstart;
602: cnt = 0;
603: for (i=0; i<n; i++) {
605: /* put in lower diagonal portion */
606: m = bd->i[i+1] - bd->i[i];
607: while (m > 0) {
608: /* is it above diagonal (in bd (compressed) numbering) */
609: if (garray[*b_jsendbuf] > a->rstart + i) break;
610: jsendbuf[cnt++] = garray[*b_jsendbuf++];
611: m--;
612: }
614: /* put in diagonal portion */
615: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
616: jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
617: }
619: /* put in upper diagonal portion */
620: while (m-- > 0) {
621: jsendbuf[cnt++] = garray[*b_jsendbuf++];
622: }
623: }
624: if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
626: /*--------------------------------------------------------------------
627: Gather all column indices to all processors
628: */
629: for (i=0; i<size; i++) {
630: recvcounts[i] = 0;
631: for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
632: recvcounts[i] += lens[j];
633: }
634: }
635: displs[0] = 0;
636: for (i=1; i<size; i++) {
637: displs[i] = displs[i-1] + recvcounts[i-1];
638: }
639: MPI_Allgatherv(jsendbuf,sendcount,MPI_INT,b->j,recvcounts,displs,MPI_INT,A->comm);
641: /*--------------------------------------------------------------------
642: Assemble the matrix into useable form (note numerical values not yet set)
643: */
644: /* set the b->ilen (length of each row) values */
645: PetscMemcpy(b->ilen,lens,A->M*sizeof(int));
646: /* set the b->i indices */
647: b->i[0] = 0;
648: for (i=1; i<=A->M; i++) {
649: b->i[i] = b->i[i-1] + lens[i-1];
650: }
651: PetscFree(lens);
652: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
653: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
654: } else {
655: B = **Bin;
656: b = (Mat_SeqAIJ *)B->data;
657: }
659: /*--------------------------------------------------------------------
660: Copy my part of matrix numerical values into the values location
661: */
662: sendcount = ad->nz + bd->nz;
663: sendbuf = b->a + b->i[rstarts[rank]];
664: a_sendbuf = ad->a;
665: b_sendbuf = bd->a;
666: b_sendj = bd->j;
667: n = a->rend - a->rstart;
668: cnt = 0;
669: for (i=0; i<n; i++) {
671: /* put in lower diagonal portion */
672: m = bd->i[i+1] - bd->i[i];
673: while (m > 0) {
674: /* is it above diagonal (in bd (compressed) numbering) */
675: if (garray[*b_sendj] > a->rstart + i) break;
676: sendbuf[cnt++] = *b_sendbuf++;
677: m--;
678: b_sendj++;
679: }
681: /* put in diagonal portion */
682: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
683: sendbuf[cnt++] = *a_sendbuf++;
684: }
686: /* put in upper diagonal portion */
687: while (m-- > 0) {
688: sendbuf[cnt++] = *b_sendbuf++;
689: b_sendj++;
690: }
691: }
692: if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
693:
694: /* -----------------------------------------------------------------
695: Gather all numerical values to all processors
696: */
697: if (!recvcounts) {
698: ierr = PetscMalloc(2*size*sizeof(int),&recvcounts);
699: displs = recvcounts + size;
700: }
701: for (i=0; i<size; i++) {
702: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
703: }
704: displs[0] = 0;
705: for (i=1; i<size; i++) {
706: displs[i] = displs[i-1] + recvcounts[i-1];
707: }
708: recvbuf = b->a;
709: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);
710: PetscFree(recvcounts);
712: return(0);
713: }
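The variable-length gather used twice in the routine above (once for the column indices, once for the numerical values), reduced to a sketch. It assumes counts[] already holds every rank's contribution length and recomputes the displacements with the same prefix sum as the code above; the helper name is illustrative.

static int GatherToAll(MPI_Comm comm,int size,int *mine,int mycount,int *counts,int *all)
{
  int i,*displs,ierr;

  ierr = PetscMalloc((size+1)*sizeof(int),&displs);CHKERRQ(ierr);
  displs[0] = 0;
  for (i=1; i<size; i++) displs[i] = displs[i-1] + counts[i-1];   /* prefix sum of counts */
  ierr = MPI_Allgatherv(mine,mycount,MPI_INT,all,counts,displs,MPI_INT,comm);CHKERRQ(ierr);
  ierr = PetscFree(displs);CHKERRQ(ierr);
  return 0;
}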
715: #undef __FUNCT__
716: #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
717: int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat **submat)
718: {
719: int nmax,nstages_local,nstages,i,pos,max_no,ierr,nrow,ncol;
720: PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
723: /*
724: Check for the special case where each processor gets the entire matrix
725: */
726: if (ismax == 1 && C->M == C->N) {
727: ISIdentity(*isrow,&rowflag);
728: ISIdentity(*iscol,&colflag);
729: ISGetLocalSize(*isrow,&nrow);
730: ISGetLocalSize(*iscol,&ncol);
731: if (rowflag && colflag && nrow == C->M && ncol == C->N) {
732: wantallmatrix = PETSC_TRUE;
733: PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
734: }
735: }
736: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);
737: if (twantallmatrix) {
738: MatGetSubMatrix_MPIAIJ_All(C,scall,submat);
739: return(0);
740: }
742: /* Allocate memory to hold all the submatrices */
743: if (scall != MAT_REUSE_MATRIX) {
744: PetscMalloc((ismax+1)*sizeof(Mat),submat);
745: }
746: /* Determine the number of stages through which submatrices are done */
747: nmax = 20*1000000 / (C->N * sizeof(int));
748: if (!nmax) nmax = 1;
749: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
751: /* Make sure every processor loops through the nstages */
752: MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);
754: for (i=0,pos=0; i<nstages; i++) {
755: if (pos+nmax <= ismax) max_no = nmax;
756: else if (pos == ismax) max_no = 0;
757: else max_no = ismax-pos;
758: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
759: pos += max_no;
760: }
761: return(0);
762: }
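A caller-side sketch for the dispatcher above (not part of mpiov.c): it assumes one sorted row IS and one column IS built elsewhere, and the cleanup mirrors how the routine allocates the result (one PetscMalloc for the Mat array plus one sequential matrix per index set). Error handling is abbreviated and the helper name is illustrative.

static int SampleGetSubmatrix(Mat C,IS isrow,IS iscol)
{
  Mat *submat;
  int ierr;

  ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
  /* ... work with the sequential matrix submat[0] ... */
  ierr = MatDestroy(submat[0]);CHKERRQ(ierr);
  ierr = PetscFree(submat);CHKERRQ(ierr);
  return 0;
}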
763: /* -------------------------------------------------------------------------*/
764: #undef __FUNCT__
765: #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
766: int MatGetSubMatrices_MPIAIJ_Local(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat *submats)
767: {
768: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
769: Mat A = c->A;
770: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
771: int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
772: int **sbuf1,**sbuf2,rank,m,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
773: int nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
774: int **rbuf3,*req_source,**sbuf_aj,ashift,**rbuf2,max1,max2,**rmap;
775: int **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
776: int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
777: int *rmap_i,tag0,tag1,tag2,tag3;
778: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
779: MPI_Request *r_waits4,*s_waits3,*s_waits4;
780: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
781: MPI_Status *r_status3,*r_status4,*s_status4;
782: MPI_Comm comm;
783: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
784: PetscTruth sorted;
785: int *onodes1,*olengths1;
788: comm = C->comm;
789: tag0 = C->tag;
790: size = c->size;
791: rank = c->rank;
792: m = C->M;
793: ashift = a->indexshift;
794:
795: /* Get some new tags to keep the communication clean */
796: PetscObjectGetNewTag((PetscObject)C,&tag1);
797: PetscObjectGetNewTag((PetscObject)C,&tag2);
798: PetscObjectGetNewTag((PetscObject)C,&tag3);
800: /* Check if the row and col indices are sorted */
801: for (i=0; i<ismax; i++) {
802: ISSorted(isrow[i],&sorted);
803: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
804: ISSorted(iscol[i],&sorted);
805: /* if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
806: }
808: len = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (m+1)*sizeof(int);
809: ierr = PetscMalloc(len,&irow);
810: icol = irow + ismax;
811: nrow = (int*)(icol + ismax);
812: ncol = nrow + ismax;
813: rtable = ncol + ismax;
815: for (i=0; i<ismax; i++) {
816: ISGetIndices(isrow[i],&irow[i]);
817: ISGetIndices(iscol[i],&icol[i]);
818: ISGetLocalSize(isrow[i],&nrow[i]);
819: ISGetLocalSize(iscol[i],&ncol[i]);
820: }
822: /* Create hash table for the mapping :row -> proc*/
823: for (i=0,j=0; i<size; i++) {
824: jmax = c->rowners[i+1];
825: for (; j<jmax; j++) {
826: rtable[j] = i;
827: }
828: }
830: /* evaluate communication - mesg to who, length of mesg, and buffer space
831: required. Based on this, buffers are allocated, and data copied into them*/
832: ierr = PetscMalloc(size*4*sizeof(int),&w1); /* mesg size */
833: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
834: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
835: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
836: ierr = PetscMemzero(w1,size*3*sizeof(int)); /* initialize work vector*/
837: for (i=0; i<ismax; i++) {
838: ierr = PetscMemzero(w4,size*sizeof(int)); /* initialize work vector*/
839: jmax = nrow[i];
840: irow_i = irow[i];
841: for (j=0; j<jmax; j++) {
842: row = irow_i[j];
843: proc = rtable[row];
844: w4[proc]++;
845: }
846: for (j=0; j<size; j++) {
847: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
848: }
849: }
850:
851: nrqs = 0; /* no of outgoing messages */
852: msz = 0; /* total mesg length (for all procs) */
853: w1[rank] = 0; /* no mesg sent to self */
854: w3[rank] = 0;
855: for (i=0; i<size; i++) {
856: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
857: }
858: PetscMalloc((nrqs+1)*sizeof(int),&pa); /*(proc -array)*/
859: for (i=0,j=0; i<size; i++) {
860: if (w1[i]) { pa[j] = i; j++; }
861: }
863: /* Each message would have a header = 1 + 2*(no of IS) + data */
864: for (i=0; i<nrqs; i++) {
865: j = pa[i];
866: w1[j] += w2[j] + 2* w3[j];
867: msz += w1[j];
868: }
870: /* Determine the number of messages to expect, their lengths, and their from-ids */
871: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
872: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
874: /* Now post the Irecvs corresponding to these messages */
875: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
876:
877: PetscFree(onodes1);
878: PetscFree(olengths1);
879:
880: /* Allocate Memory for outgoing messages */
881: len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
882: ierr = PetscMalloc(len,&sbuf1);
883: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
884: ierr = PetscMemzero(sbuf1,2*size*sizeof(int*));
885: /* allocate memory for outgoing data + buf to receive the first reply */
886: tmp = (int*)(ptr + size);
887: ctr = tmp + 2*msz;
889: {
890: int *iptr = tmp,ict = 0;
891: for (i=0; i<nrqs; i++) {
892: j = pa[i];
893: iptr += ict;
894: sbuf1[j] = iptr;
895: ict = w1[j];
896: }
897: }
899: /* Form the outgoing messages */
900: /* Initialize the header space */
901: for (i=0; i<nrqs; i++) {
902: j = pa[i];
903: sbuf1[j][0] = 0;
904: ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
905: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
906: }
907:
908: /* Parse the isrow and copy data into outbuf */
909: for (i=0; i<ismax; i++) {
910: ierr = PetscMemzero(ctr,size*sizeof(int));
911: irow_i = irow[i];
912: jmax = nrow[i];
913: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
914: row = irow_i[j];
915: proc = rtable[row];
916: if (proc != rank) { /* copy to the outgoing buf*/
917: ctr[proc]++;
918: *ptr[proc] = row;
919: ptr[proc]++;
920: }
921: }
922: /* Update the headers for the current IS */
923: for (j=0; j<size; j++) { /* Can Optimise this loop too */
924: if ((ctr_j = ctr[j])) {
925: sbuf1_j = sbuf1[j];
926: k = ++sbuf1_j[0];
927: sbuf1_j[2*k] = ctr_j;
928: sbuf1_j[2*k-1] = i;
929: }
930: }
931: }
933: /* Now post the sends */
934: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
935: for (i=0; i<nrqs; ++i) {
936: j = pa[i];
937: MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);
938: }
940: /* Post Receives to capture the buffer size */
941: ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
942: ierr = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);
943: rbuf2[0] = tmp + msz;
944: for (i=1; i<nrqs; ++i) {
945: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
946: }
947: for (i=0; i<nrqs; ++i) {
948: j = pa[i];
949: MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);
950: }
952: /* Send to other procs the buf size they should allocate */
953:
955: /* Receive messages*/
956: ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
957: ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
958: len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
959: ierr = PetscMalloc(len,&sbuf2);
960: req_size = (int*)(sbuf2 + nrqr);
961: req_source = req_size + nrqr;
962:
963: {
964: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
965: int *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
966: int *sbuf2_i;
968: for (i=0; i<nrqr; ++i) {
969: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
970: req_size[idex] = 0;
971: rbuf1_i = rbuf1[idex];
972: start = 2*rbuf1_i[0] + 1;
973: ierr = MPI_Get_count(r_status1+i,MPI_INT,&end);
974: ierr = PetscMalloc((end+1)*sizeof(int),&sbuf2[idex]);
975: sbuf2_i = sbuf2[idex];
976: for (j=start; j<end; j++) {
977: id = rbuf1_i[j] - rstart;
978: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
979: sbuf2_i[j] = ncols;
980: req_size[idex] += ncols;
981: }
982: req_source[idex] = r_status1[i].MPI_SOURCE;
983: /* form the header */
984: sbuf2_i[0] = req_size[idex];
985: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
986: MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);
987: }
988: }
989: PetscFree(r_status1);
990: PetscFree(r_waits1);
992: /* recv buffer sizes */
993: /* Receive messages*/
994:
995: PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);
996: PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
997: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
998: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
999: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1001: for (i=0; i<nrqs; ++i) {
1002: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1003: PetscMalloc((rbuf2[idex][0]+1)*sizeof(int),&rbuf3[idex]);
1004: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);
1005: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1006: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1007: }
1008: PetscFree(r_status2);
1009: PetscFree(r_waits2);
1010:
1011: /* Wait on sends1 and sends2 */
1012: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1013: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1015: MPI_Waitall(nrqs,s_waits1,s_status1);
1016: MPI_Waitall(nrqr,s_waits2,s_status2);
1017: PetscFree(s_status1);
1018: PetscFree(s_status2);
1019: PetscFree(s_waits1);
1020: PetscFree(s_waits2);
1022: /* Now allocate buffers for a->j, and send them off */
1023: PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);
1024: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1025: PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);
1026: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1027:
1028: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1029: {
1030: int nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1031: int *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1032: int *a_j = a->j,*b_j = b->j,shift = a->indexshift,ctmp,*t_cols;
1034: for (i=0; i<nrqr; i++) {
1035: rbuf1_i = rbuf1[i];
1036: sbuf_aj_i = sbuf_aj[i];
1037: ct1 = 2*rbuf1_i[0] + 1;
1038: ct2 = 0;
1039: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1040: kmax = rbuf1[i][2*j];
1041: for (k=0; k<kmax; k++,ct1++) {
1042: row = rbuf1_i[ct1] - rstart;
1043: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1044: ncols = nzA + nzB;
1045: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1047: /* load the column indices for this row into cols*/
1048: cols = sbuf_aj_i + ct2;
1049: if (!shift) {
1050: for (l=0; l<nzB; l++) {
1051: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
1052: else break;
1053: }
1054: imark = l;
1055: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
1056: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1057: } else {
1058: MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0);
1059: PetscMemcpy(cols,t_cols,ncols*sizeof(int));
1060: MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0);
1061: }
1063: ct2 += ncols;
1064: }
1065: }
1066: MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);
1067: }
1068: }
1069: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1070: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1072: /* Allocate buffers for a->a, and send them off */
1073: PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1074: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1075: PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1076: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1077:
1078: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1079: {
1080: int nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1081: int cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1082: int *b_j = b->j,shift = a->indexshift;
1083: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*t_vals;
1084:
1085: for (i=0; i<nrqr; i++) {
1086: rbuf1_i = rbuf1[i];
1087: sbuf_aa_i = sbuf_aa[i];
1088: ct1 = 2*rbuf1_i[0]+1;
1089: ct2 = 0;
1090: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1091: kmax = rbuf1_i[2*j];
1092: for (k=0; k<kmax; k++,ct1++) {
1093: row = rbuf1_i[ct1] - rstart;
1094: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1095: ncols = nzA + nzB;
1096: cworkB = b_j + b_i[row];
1097: vworkA = a_a + a_i[row];
1098: vworkB = b_a + b_i[row];
1100: /* load the column values for this row into vals*/
1101: vals = sbuf_aa_i+ct2;
1102: if (!shift) {
1103: for (l=0; l<nzB; l++) {
1104: if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l];
1105: else break;
1106: }
1107: imark = l;
1108: for (l=0; l<nzA; l++) vals[imark+l] = vworkA[l];
1109: for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1110: } else {
1111: MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals);
1112: PetscMemcpy(vals,t_vals,ncols*sizeof(PetscScalar));
1113: MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals);
1114: }
1115: ct2 += ncols;
1116: }
1117: }
1118: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1119: }
1120: }
1121: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1122: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1123: PetscFree(rbuf1);
1125: /* Form the matrix */
1126: /* create col map */
1127: {
1128: int *icol_i;
1129:
1130: len = (1+ismax)*sizeof(int*)+ ismax*C->N*sizeof(int);
1131: ierr = PetscMalloc(len,&cmap);
1132: cmap[0] = (int *)(cmap + ismax);
1133: ierr = PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(int));
1134: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1135: for (i=0; i<ismax; i++) {
1136: jmax = ncol[i];
1137: icol_i = icol[i];
1138: cmap_i = cmap[i];
1139: for (j=0; j<jmax; j++) {
1140: cmap_i[icol_i[j]] = j+1;
1141: }
1142: }
1143: }
1145: /* Create lens which is required for MatCreate... */
1146: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1147: len = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1148: ierr = PetscMalloc(len,&lens);
1149: lens[0] = (int *)(lens + ismax);
1150: ierr = PetscMemzero(lens[0],j*sizeof(int));
1151: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1152:
1153: /* Update lens from local data */
1154: for (i=0; i<ismax; i++) {
1155: jmax = nrow[i];
1156: cmap_i = cmap[i];
1157: irow_i = irow[i];
1158: lens_i = lens[i];
1159: for (j=0; j<jmax; j++) {
1160: row = irow_i[j];
1161: proc = rtable[row];
1162: if (proc == rank) {
1163: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1164: for (k=0; k<ncols; k++) {
1165: if (cmap_i[cols[k]]) { lens_i[j]++;}
1166: }
1167: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1168: }
1169: }
1170: }
1171:
1172: /* Create row map*/
1173: len = (1+ismax)*sizeof(int*)+ ismax*C->M*sizeof(int);
1174: ierr = PetscMalloc(len,&rmap);
1175: rmap[0] = (int *)(rmap + ismax);
1176: ierr = PetscMemzero(rmap[0],ismax*C->M*sizeof(int));
1177: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1178: for (i=0; i<ismax; i++) {
1179: rmap_i = rmap[i];
1180: irow_i = irow[i];
1181: jmax = nrow[i];
1182: for (j=0; j<jmax; j++) {
1183: rmap_i[irow_i[j]] = j;
1184: }
1185: }
1186:
1187: /* Update lens from offproc data */
1188: {
1189: int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1191: for (tmp2=0; tmp2<nrqs; tmp2++) {
1192: MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);
1193: idex = pa[i];
1194: sbuf1_i = sbuf1[idex];
1195: jmax = sbuf1_i[0];
1196: ct1 = 2*jmax+1;
1197: ct2 = 0;
1198: rbuf2_i = rbuf2[i];
1199: rbuf3_i = rbuf3[i];
1200: for (j=1; j<=jmax; j++) {
1201: is_no = sbuf1_i[2*j-1];
1202: max1 = sbuf1_i[2*j];
1203: lens_i = lens[is_no];
1204: cmap_i = cmap[is_no];
1205: rmap_i = rmap[is_no];
1206: for (k=0; k<max1; k++,ct1++) {
1207: row = rmap_i[sbuf1_i[ct1]]; /* the row this becomes in the new matrix */
1208: max2 = rbuf2_i[ct1];
1209: for (l=0; l<max2; l++,ct2++) {
1210: if (cmap_i[rbuf3_i[ct2]]) {
1211: lens_i[row]++;
1212: }
1213: }
1214: }
1215: }
1216: }
1217: }
1218: PetscFree(r_status3);
1219: PetscFree(r_waits3);
1220: MPI_Waitall(nrqr,s_waits3,s_status3);
1221: PetscFree(s_status3);
1222: PetscFree(s_waits3);
1224: /* Create the submatrices */
1225: if (scall == MAT_REUSE_MATRIX) {
1226: PetscTruth flag;
1228: /*
1229: Assumes new rows are same length as the old rows, hence bug!
1230: */
1231: for (i=0; i<ismax; i++) {
1232: mat = (Mat_SeqAIJ *)(submats[i]->data);
1233: if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1234: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1235: }
1236: PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(int),&flag);
1237: if (flag == PETSC_FALSE) {
1238: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1239: }
1240: /* Initialize matrix as if empty */
1241: PetscMemzero(mat->ilen,submats[i]->m*sizeof(int));
1242: submats[i]->factor = C->factor;
1243: }
1244: } else {
1245: for (i=0; i<ismax; i++) {
1246: MatCreateSeqAIJ(PETSC_COMM_SELF,nrow[i],ncol[i],0,lens[i],submats+i);
1247: }
1248: }
1250: /* Assemble the matrices */
1251: /* First assemble the local rows */
1252: {
1253: int ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1254: PetscScalar *imat_a;
1255:
1256: for (i=0; i<ismax; i++) {
1257: mat = (Mat_SeqAIJ*)submats[i]->data;
1258: imat_ilen = mat->ilen;
1259: imat_j = mat->j;
1260: imat_i = mat->i;
1261: imat_a = mat->a;
1262: cmap_i = cmap[i];
1263: rmap_i = rmap[i];
1264: irow_i = irow[i];
1265: jmax = nrow[i];
1266: for (j=0; j<jmax; j++) {
1267: row = irow_i[j];
1268: proc = rtable[row];
1269: if (proc == rank) {
1270: old_row = row;
1271: row = rmap_i[row];
1272: ilen_row = imat_ilen[row];
1273: ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1274: mat_i = imat_i[row] + ashift;
1275: mat_a = imat_a + mat_i;
1276: mat_j = imat_j + mat_i;
1277: for (k=0; k<ncols; k++) {
1278: if ((tcol = cmap_i[cols[k]])) {
1279: *mat_j++ = tcol - (!ashift);
1280: *mat_a++ = vals[k];
1281: ilen_row++;
1282: }
1283: }
1284: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1285: imat_ilen[row] = ilen_row;
1286: }
1287: }
1288: }
1289: }
1291: /* Now assemble the off proc rows*/
1292: {
1293: int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1294: int *imat_j,*imat_i;
1295: PetscScalar *imat_a,*rbuf4_i;
1297: for (tmp2=0; tmp2<nrqs; tmp2++) {
1298: MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);
1299: idex = pa[i];
1300: sbuf1_i = sbuf1[idex];
1301: jmax = sbuf1_i[0];
1302: ct1 = 2*jmax + 1;
1303: ct2 = 0;
1304: rbuf2_i = rbuf2[i];
1305: rbuf3_i = rbuf3[i];
1306: rbuf4_i = rbuf4[i];
1307: for (j=1; j<=jmax; j++) {
1308: is_no = sbuf1_i[2*j-1];
1309: rmap_i = rmap[is_no];
1310: cmap_i = cmap[is_no];
1311: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1312: imat_ilen = mat->ilen;
1313: imat_j = mat->j;
1314: imat_i = mat->i;
1315: imat_a = mat->a;
1316: max1 = sbuf1_i[2*j];
1317: for (k=0; k<max1; k++,ct1++) {
1318: row = sbuf1_i[ct1];
1319: row = rmap_i[row];
1320: ilen = imat_ilen[row];
1321: mat_i = imat_i[row] + ashift;
1322: mat_a = imat_a + mat_i;
1323: mat_j = imat_j + mat_i;
1324: max2 = rbuf2_i[ct1];
1325: for (l=0; l<max2; l++,ct2++) {
1326: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1327: *mat_j++ = tcol - (!ashift);
1328: *mat_a++ = rbuf4_i[ct2];
1329: ilen++;
1330: }
1331: }
1332: imat_ilen[row] = ilen;
1333: }
1334: }
1335: }
1336: }
1337: PetscFree(r_status4);
1338: PetscFree(r_waits4);
1339: MPI_Waitall(nrqr,s_waits4,s_status4);
1340: PetscFree(s_waits4);
1341: PetscFree(s_status4);
1343: /* Restore the indices */
1344: for (i=0; i<ismax; i++) {
1345: ISRestoreIndices(isrow[i],irow+i);
1346: ISRestoreIndices(iscol[i],icol+i);
1347: }
1349: /* Destroy allocated memory */
1350: PetscFree(irow);
1351: PetscFree(w1);
1352: PetscFree(pa);
1354: PetscFree(sbuf1);
1355: PetscFree(rbuf2);
1356: for (i=0; i<nrqr; ++i) {
1357: PetscFree(sbuf2[i]);
1358: }
1359: for (i=0; i<nrqs; ++i) {
1360: PetscFree(rbuf3[i]);
1361: PetscFree(rbuf4[i]);
1362: }
1364: PetscFree(sbuf2);
1365: PetscFree(rbuf3);
1366: PetscFree(rbuf4);
1367: PetscFree(sbuf_aj[0]);
1368: PetscFree(sbuf_aj);
1369: PetscFree(sbuf_aa[0]);
1370: PetscFree(sbuf_aa);
1371:
1372: PetscFree(cmap);
1373: PetscFree(rmap);
1374: PetscFree(lens);
1376: for (i=0; i<ismax; i++) {
1377: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1378: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1379: }
1380: return(0);
1381: }