Actual source code: baijov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
3: and to find submatrices that were shared across processors.
4: */
5: #include "src/mat/impls/baij/mpi/mpibaij.h"
6: #include "petscbt.h"
8: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat,PetscInt,IS *);
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
11: EXTERN PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: EXTERN PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
17: {
19: PetscInt i,N=C->N, bs=C->bs;
20: IS *is_new;
23: PetscMalloc(imax*sizeof(IS),&is_new);
24: /* Convert the indices into block format */
25: ISCompressIndicesGeneral(N,bs,imax,is,is_new);
26: if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
27: for (i=0; i<ov; ++i) {
28: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
29: }
30: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
31: ISExpandIndicesGeneral(N,bs,imax,is_new,is);
32: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
33: PetscFree(is_new);
34: return(0);
35: }
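/*
   A minimal usage sketch, assuming the generic MatIncreaseOverlap()/ISCreateGeneral() interface
   of this PETSc version; n and indices are placeholders supplied by the caller:

      IS is[1];
      ISCreateGeneral(PETSC_COMM_SELF,n,indices,&is[0]);
      MatIncreaseOverlap(C,1,is,2);          increase the overlap of the subdomain by 2 levels

   C is the parallel MATMPIBAIJ matrix; the index sets are replaced in place by the enlarged ones.
*/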
37: /*
38: Sample message format:
39: If a processor A wants processor B to process some elements corresponding
40: to index sets is[1], is[5]
41: mesg [0] = 2 (no of index sets in the mesg)
42: -----------
43: mesg [1] = 1 => is[1]
44: mesg [2] = sizeof(is[1]);
45: -----------
46: mesg [3] = 5 => is[5]
47: mesg [4] = sizeof(is[5]);
48: -----------
49: mesg [5]
50: mesg [n] data(is[1])
51: -----------
52: mesg[n+1]
53: mesg[m] data(is[5])
54: -----------
55:
56: Notes:
57: nrqs - no of requests sent (or to be sent out)
58: nrqr - no of requests received (which have to be, or which have been, processed)
59: */
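/*
   Outline of MatIncreaseOverlap_MPIBAIJ_Once() below:
     1. count, per destination processor, how many indices of each IS it must process (w1-w4);
     2. exchange message counts and lengths, and post the receives (tag1);
     3. pack headers and row indices into outdat[] and send them;
     4. handle the locally owned rows with MatIncreaseOverlap_MPIBAIJ_Local();
     5. answer incoming requests with MatIncreaseOverlap_MPIBAIJ_Receive() and return the
        results (tag2);
     6. merge the returned rows into the local tables and recreate each IS.
*/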
62: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
63: {
64: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
65: PetscInt **idx,*n,*w3,*w4,*rtable,**data,len,*idx_i;
67: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
68: PetscInt Mbs,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
69: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
70: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
71: PetscBT *table;
72: MPI_Comm comm;
73: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
74: MPI_Status *s_status,*recv_status;
77: comm = C->comm;
78: size = c->size;
79: rank = c->rank;
80: Mbs = c->Mbs;
82: PetscObjectGetNewTag((PetscObject)C,&tag1);
83: PetscObjectGetNewTag((PetscObject)C,&tag2);
85: len = (imax+1)*sizeof(PetscInt*)+ (imax + Mbs)*sizeof(PetscInt);
86: PetscMalloc(len,&idx);
87: n = (PetscInt*)(idx + imax);
88: rtable = n + imax;
89:
90: for (i=0; i<imax; i++) {
91: ISGetIndices(is[i],&idx[i]);
92: ISGetLocalSize(is[i],&n[i]);
93: }
94:
95: /* Create hash table for the mapping :row -> proc*/
96: for (i=0,j=0; i<size; i++) {
97: len = c->rowners[i+1];
98: for (; j<len; j++) {
99: rtable[j] = i;
100: }
101: }
103: /* evaluate communication - mesg to whom, length of mesg, and buffer space
104: required. Based on this, buffers are allocated, and data copied into them*/
105: PetscMalloc(size*2*sizeof(PetscInt)+size*2*sizeof(PetscMPIInt),&w1);/* mesg size */
106: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
107: w3 = (PetscInt*) (w2 + size); /* no of IS that need to be sent to proc i */
108: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
109: PetscMemzero(w1,size*sizeof(PetscInt)+2*size*sizeof(PetscMPIInt)); /* initialise work vector*/
110: for (i=0; i<imax; i++) {
111: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
112: idx_i = idx[i];
113: len = n[i];
114: for (j=0; j<len; j++) {
115: row = idx_i[j];
116: if (row < 0) {
117: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
118: }
119: proc = rtable[row];
120: w4[proc]++;
121: }
122: for (j=0; j<size; j++){
123: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
124: }
125: }
127: nrqs = 0; /* no of outgoing messages */
128: msz = 0; /* total mesg length (for all procs) */
129: w1[rank] = 0; /* no mesg sent to itself */
130: w3[rank] = 0;
131: for (i=0; i<size; i++) {
132: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
133: }
134: /* pa - is list of processors to communicate with */
135: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
136: for (i=0,j=0; i<size; i++) {
137: if (w1[i]) {pa[j] = i; j++;}
138: }
140: /* Each message would have a header = 1 + 2*(no of IS) + data */
141: for (i=0; i<nrqs; i++) {
142: j = pa[i];
143: w1[j] += w2[j] + 2*w3[j];
144: msz += w1[j];
145: }
146:
147: /* Determine the number of messages to expect, their lengths, and the from-ids */
148: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
149: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
151: /* Now post the Irecvs corresponding to these messages */
152: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
153:
154: /* Allocate Memory for outgoing messages */
155: len = 2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt);
156: PetscMalloc(len,&outdat);
157: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
158: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
159: tmp = (PetscInt*)(outdat + 2*size);
160: ctr = tmp + msz;
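/* outdat, ptr, tmp and ctr share this single allocation: outdat[] and ptr[] hold size
   pointers each, tmp[] holds the msz packed message entries, and ctr[] holds one
   per-processor counter */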
162: {
163: PetscInt *iptr = tmp,ict = 0;
164: for (i=0; i<nrqs; i++) {
165: j = pa[i];
166: iptr += ict;
167: outdat[j] = iptr;
168: ict = w1[j];
169: }
170: }
172: /* Form the outgoing messages */
173: /*plug in the headers*/
174: for (i=0; i<nrqs; i++) {
175: j = pa[i];
176: outdat[j][0] = 0;
177: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
178: ptr[j] = outdat[j] + 2*w3[j] + 1;
179: }
180:
181: /* Memory for doing local proc's work*/
182: {
183: PetscInt *d_p;
184: char *t_p;
186: len = (imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
187: (Mbs)*imax*sizeof(PetscInt) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
188: PetscMalloc(len,&table);
189: PetscMemzero(table,len);
190: data = (PetscInt **)(table + imax);
191: isz = (PetscInt *)(data + imax);
192: d_p = (PetscInt *)(isz + imax);
193: t_p = (char *)(d_p + Mbs*imax);
194: for (i=0; i<imax; i++) {
195: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
196: data[i] = d_p + (Mbs)*i;
197: }
198: }
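/* For each IS i: table[i] is a bit table over the Mbs block rows (used to reject duplicates),
   data[i] can hold up to Mbs block-row indices, and isz[i] counts how many are stored so far */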
200: /* Parse the IS and update local tables and the outgoing buf with the data*/
201: {
202: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
203: PetscBT table_i;
205: for (i=0; i<imax; i++) {
206: PetscMemzero(ctr,size*sizeof(PetscInt));
207: n_i = n[i];
208: table_i = table[i];
209: idx_i = idx[i];
210: data_i = data[i];
211: isz_i = isz[i];
212: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
213: row = idx_i[j];
214: proc = rtable[row];
215: if (proc != rank) { /* copy to the outgoing buffer */
216: ctr[proc]++;
217: *ptr[proc] = row;
218: ptr[proc]++;
219: }
220: else { /* Update the local table */
221: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
222: }
223: }
224: /* Update the headers for the current IS */
225: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
226: if ((ctr_j = ctr[j])) {
227: outdat_j = outdat[j];
228: k = ++outdat_j[0];
229: outdat_j[2*k] = ctr_j;
230: outdat_j[2*k-1] = i;
231: }
232: }
233: isz[i] = isz_i;
234: }
235: }
236:
237: /* Now post the sends */
238: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
239: for (i=0; i<nrqs; ++i) {
240: j = pa[i];
241: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
242: }
243:
244: /* No longer need the original indices*/
245: for (i=0; i<imax; ++i) {
246: ISRestoreIndices(is[i],idx+i);
247: }
248: PetscFree(idx);
250: for (i=0; i<imax; ++i) {
251: ISDestroy(is[i]);
252: }
253:
254: /* Do Local work*/
255: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
257: /* Receive messages*/
258: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
259: MPI_Waitall(nrqr,r_waits1,recv_status);
260:
261: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
262: MPI_Waitall(nrqs,s_waits1,s_status);
264: /* Phase 1 sends are complete - deallocate buffers */
265: PetscFree(outdat);
266: PetscFree(w1);
268: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
269: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
270: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
271: PetscFree(rbuf);
273: /* Send the data back*/
274: /* Determine the buffer space required for the incoming messages */
275: {
276: PetscMPIInt *rw1;
277:
278: PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
279: PetscMemzero(rw1,size*sizeof(PetscMPIInt));
281: for (i=0; i<nrqr; ++i) {
282: proc = recv_status[i].MPI_SOURCE;
283: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
284: rw1[proc] = isz1[i];
285: }
286:
287: PetscFree(onodes1);
288: PetscFree(olengths1);
290: /* Determine the number of messages to expect, their lengths, and the from-ids */
291: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
292: PetscFree(rw1);
293: }
294: /* Now post the Irecvs corresponding to these messages */
295: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
296:
297: /* Now post the sends */
298: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
299: for (i=0; i<nrqr; ++i) {
300: j = recv_status[i].MPI_SOURCE;
301: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
302: }
304: /* receive work done on other processors*/
305: {
306: PetscMPIInt idex;
307: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
308: PetscBT table_i;
309: MPI_Status *status2;
310:
311: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
313: for (i=0; i<nrqs; ++i) {
314: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
315: /* Process the message*/
316: rbuf2_i = rbuf2[idex];
317: ct1 = 2*rbuf2_i[0]+1;
318: jmax = rbuf2[idex][0];
319: for (j=1; j<=jmax; j++) {
320: max = rbuf2_i[2*j];
321: is_no = rbuf2_i[2*j-1];
322: isz_i = isz[is_no];
323: data_i = data[is_no];
324: table_i = table[is_no];
325: for (k=0; k<max; k++,ct1++) {
326: row = rbuf2_i[ct1];
327: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
328: }
329: isz[is_no] = isz_i;
330: }
331: }
332: MPI_Waitall(nrqr,s_waits2,status2);
333: PetscFree(status2);
334: }
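/* All remote contributions are now merged; recreate each (enlarged) index set from data[i] */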
335:
336: for (i=0; i<imax; ++i) {
337: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
338: }
339:
340:
341: PetscFree(onodes2);
342: PetscFree(olengths2);
344: PetscFree(pa);
345: PetscFree(rbuf2);
346: PetscFree(s_waits1);
347: PetscFree(r_waits1);
348: PetscFree(s_waits2);
349: PetscFree(r_waits2);
350: PetscFree(table);
351: PetscFree(s_status);
352: PetscFree(recv_status);
353: PetscFree(xdata[0]);
354: PetscFree(xdata);
355: PetscFree(isz1);
356: return(0);
357: }
361: /*
362: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap_MPIBAIJ_Once(), to do
363: the work on the local processor.
365: Inputs:
366: C - MAT_MPIBAIJ;
367: imax - total no of index sets processed at a time;
368: table - an array of bit tables (PetscBT), one per index set, each Mbs bits long.
369:
370: Output:
371: isz - array containing the count of the solution elements corresponding
372: to each index set;
373: data - pointer to the solutions
374: */
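/*
   In outline: for every block row already present in index set i, the block-column indices of
   the diagonal part (A, shifted by cstart) and of the off-diagonal part (B, translated through
   garray) are appended to data[i], with table[i] used to skip entries that are already present.
*/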
375: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
376: {
377: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
378: Mat A = c->A,B = c->B;
379: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
380: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
381: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
382: PetscBT table_i;
385: rstart = c->rstart;
386: cstart = c->cstart;
387: ai = a->i;
388: aj = a->j;
389: bi = b->i;
390: bj = b->j;
391: garray = c->garray;
393:
394: for (i=0; i<imax; i++) {
395: data_i = data[i];
396: table_i = table[i];
397: isz_i = isz[i];
398: for (j=0,max=isz[i]; j<max; j++) {
399: row = data_i[j] - rstart;
400: start = ai[row];
401: end = ai[row+1];
402: for (k=start; k<end; k++) { /* Amat */
403: val = aj[k] + cstart;
404: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
405: }
406: start = bi[row];
407: end = bi[row+1];
408: for (k=start; k<end; k++) { /* Bmat */
409: val = garray[bj[k]];
410: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
411: }
412: }
413: isz[i] = isz_i;
414: }
415: return(0);
416: }
419: /*
420: MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
421: and return the output
423: Input:
424: C - the matrix
425: nrqr - no of messages being processed.
426: rbuf - an array of pointers to the received requests
427:
428: Output:
429: xdata - array of messages to be sent back
430: isz1 - size of each message
432: For better efficiency perhaps we should malloc each xdata[i] separately;
433: then if a reallocation is required we need only copy the data for that one row
434: rather than all previous rows, as is done now where a single large chunk of
435: memory is used.
437: */
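/*
   The replies xdata[0..nrqr-1] are carved out of one allocation whose size is only an estimate
   (roughly 3*max(total_sz, ct*(a->nz+b->nz)/Mbs) entries); whenever the estimate is exceeded,
   the whole block is grown by a factor of 1.5 and the xdata[] pointers are re-established,
   which is the copying the note above proposes to avoid.
*/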
438: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
439: {
440: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
441: Mat A = c->A,B = c->B;
442: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
444: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
445: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
446: PetscInt val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
447: PetscInt *rbuf_i,kmax,rbuf_0;
448: PetscBT xtable;
451: rank = c->rank;
452: Mbs = c->Mbs;
453: rstart = c->rstart;
454: cstart = c->cstart;
455: ai = a->i;
456: aj = a->j;
457: bi = b->i;
458: bj = b->j;
459: garray = c->garray;
460:
461:
462: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
463: rbuf_i = rbuf[i];
464: rbuf_0 = rbuf_i[0];
465: ct += rbuf_0;
466: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
467: }
468:
469: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
470: else max1 = 1;
471: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
472: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
473: ++no_malloc;
474: PetscBTCreate(Mbs,xtable);
475: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
476:
477: ct3 = 0;
478: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
479: rbuf_i = rbuf[i];
480: rbuf_0 = rbuf_i[0];
481: ct1 = 2*rbuf_0+1;
482: ct2 = ct1;
483: ct3 += ct1;
484: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
485: PetscBTMemzero(Mbs,xtable);
486: oct2 = ct2;
487: kmax = rbuf_i[2*j];
488: for (k=0; k<kmax; k++,ct1++) {
489: row = rbuf_i[ct1];
490: if (!PetscBTLookupSet(xtable,row)) {
491: if (!(ct3 < mem_estimate)) {
492: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
493: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
494: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
495: PetscFree(xdata[0]);
496: xdata[0] = tmp;
497: mem_estimate = new_estimate; ++no_malloc;
498: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
499: }
500: xdata[i][ct2++] = row;
501: ct3++;
502: }
503: }
504: for (k=oct2,max2=ct2; k<max2; k++) {
505: row = xdata[i][k] - rstart;
506: start = ai[row];
507: end = ai[row+1];
508: for (l=start; l<end; l++) {
509: val = aj[l] + cstart;
510: if (!PetscBTLookupSet(xtable,val)) {
511: if (!(ct3 < mem_estimate)) {
512: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
513: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
514: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
515: PetscFree(xdata[0]);
516: xdata[0] = tmp;
517: mem_estimate = new_estimate; ++no_malloc;
518: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
519: }
520: xdata[i][ct2++] = val;
521: ct3++;
522: }
523: }
524: start = bi[row];
525: end = bi[row+1];
526: for (l=start; l<end; l++) {
527: val = garray[bj[l]];
528: if (!PetscBTLookupSet(xtable,val)) {
529: if (!(ct3 < mem_estimate)) {
530: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
531: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
532: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
533: PetscFree(xdata[0]);
534: xdata[0] = tmp;
535: mem_estimate = new_estimate; ++no_malloc;
536: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
537: }
538: xdata[i][ct2++] = val;
539: ct3++;
540: }
541: }
542: }
543: /* Update the header*/
544: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
545: xdata[i][2*j-1] = rbuf_i[2*j-1];
546: }
547: xdata[i][0] = rbuf_0;
548: xdata[i+1] = xdata[i] + ct2;
549: isz1[i] = ct2; /* size of each message */
550: }
551: PetscBTDestroy(xtable);
552: PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %D bytes, required %D, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);
553: return(0);
554: }
556: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat *);
560: PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
561: {
562: IS *isrow_new,*iscol_new;
563: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
565: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->N,bs=C->bs;
568: /* The compression and expansion should be avoided. It does not flag
569: errors, and it might change the indices, hence it is potentially buggy */
571: PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
572: iscol_new = isrow_new + ismax;
573: ISCompressIndicesSorted(N,bs,ismax,isrow,isrow_new);
574: ISCompressIndicesSorted(N,bs,ismax,iscol,iscol_new);
576: /* Allocate memory to hold all the submatrices */
577: if (scall != MAT_REUSE_MATRIX) {
578: PetscMalloc((ismax+1)*sizeof(Mat),submat);
579: }
580: /* Determine the number of stages through which submatrices are done */
581: nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
582: if (!nmax) nmax = 1;
583: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
584:
585: /* Make sure every processor loops through the nstages */
586: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,C->comm);
587: for (i=0,pos=0; i<nstages; i++) {
588: if (pos+nmax <= ismax) max_no = nmax;
589: else if (pos == ismax) max_no = 0;
590: else max_no = ismax-pos;
591: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
592: pos += max_no;
593: }
594:
595: for (i=0; i<ismax; i++) {
596: ISDestroy(isrow_new[i]);
597: ISDestroy(iscol_new[i]);
598: }
599: PetscFree(isrow_new);
600: return(0);
601: }
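/*
   A minimal usage sketch, assuming the generic MatGetSubMatrices() interface of this PETSc
   version; n, isrow and iscol are placeholders for sorted, block-aligned index sets supplied
   by the caller:

      Mat *submat;
      MatGetSubMatrices(C,n,isrow,iscol,MAT_INITIAL_MATRIX,&submat);
      ...
      MatGetSubMatrices(C,n,isrow,iscol,MAT_REUSE_MATRIX,&submat);

   The first call allocates submat[]; later calls with MAT_REUSE_MATRIX fill the same matrices,
   which must have the same sizes and nonzero pattern (see the check in the code below).
*/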
603: #if defined (PETSC_USE_CTABLE)
606: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
607: {
608: PetscInt nGlobalNd = proc_gnode[size];
609: PetscMPIInt fproc = (PetscMPIInt) ((float)row * (float)size / (float)nGlobalNd + 0.5);
610:
612: if (fproc > size) fproc = size;
613: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
614: if (row < proc_gnode[fproc]) fproc--;
615: else fproc++;
616: }
617: *rank = fproc;
618: return(0);
619: }
620: #endif
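/* PetscGetProc() (CTABLE variant only) guesses the owner of a global block row by linear
   interpolation into proc_gnode[] and then walks the guess up or down until the row lies in
   the owner's [proc_gnode[p], proc_gnode[p+1]) range; it replaces the dense rtable[] lookup
   used in the non-CTABLE code path */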
622: /* -------------------------------------------------------------------------*/
623: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
626: static PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
627: {
628: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
629: Mat A = c->A;
630: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
631: PetscInt **irow,**icol,*nrow,*ncol,*w3,*w4,start;
633: PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
634: PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
635: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
636: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
637: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
638: PetscInt len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
639: PetscInt bs=C->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
640: PetscInt cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
641: PetscInt *bmap = c->garray,ctmp,rstart=c->rstart;
642: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
643: MPI_Request *r_waits4,*s_waits3,*s_waits4;
644: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
645: MPI_Status *r_status3,*r_status4,*s_status4;
646: MPI_Comm comm;
647: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
648: MatScalar *a_a=a->a,*b_a=b->a;
649: PetscTruth flag;
650: PetscMPIInt *onodes1,*olengths1;
652: #if defined (PETSC_USE_CTABLE)
653: PetscInt tt;
654: PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
655: #else
656: PetscInt **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
657: #endif
660: comm = C->comm;
661: tag0 = C->tag;
662: size = c->size;
663: rank = c->rank;
664:
665: /* Get some new tags to keep the communication clean */
666: PetscObjectGetNewTag((PetscObject)C,&tag1);
667: PetscObjectGetNewTag((PetscObject)C,&tag2);
668: PetscObjectGetNewTag((PetscObject)C,&tag3);
670: /* Check if the col indices are sorted */
671: for (i=0; i<ismax; i++) {
672: ISSorted(iscol[i],(PetscTruth*)&j);
673: if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
674: }
676: len = (2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt));
677: #if !defined (PETSC_USE_CTABLE)
678: len += (Mbs+1)*sizeof(PetscInt);
679: #endif
680: PetscMalloc(len,&irow);
681: icol = irow + ismax;
682: nrow = (PetscInt*)(icol + ismax);
683: ncol = nrow + ismax;
684: #if !defined (PETSC_USE_CTABLE)
685: rtable = ncol + ismax;
686: /* Create hash table for the mapping :row -> proc*/
687: for (i=0,j=0; i<size; i++) {
688: jmax = c->rowners[i+1];
689: for (; j<jmax; j++) {
690: rtable[j] = i;
691: }
692: }
693: #endif
694:
695: for (i=0; i<ismax; i++) {
696: ISGetIndices(isrow[i],&irow[i]);
697: ISGetIndices(iscol[i],&icol[i]);
698: ISGetLocalSize(isrow[i],&nrow[i]);
699: ISGetLocalSize(iscol[i],&ncol[i]);
700: }
702: /* evaluate communication - mesg to whom, length of mesg, and buffer space
703: required. Based on this, buffers are allocated, and data copied into them*/
704: PetscMalloc(size*2*sizeof(PetscInt) + size*2*sizeof(PetscMPIInt),&w1); /* mesg size */
705: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
706: w3 = (PetscInt*)(w2 + size); /* no of IS that need to be sent to proc i */
707: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
708: PetscMemzero(w1,size*sizeof(PetscInt)+2*size*sizeof(PetscMPIInt)); /* initialise work vector*/
709: for (i=0; i<ismax; i++) {
710: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
711: jmax = nrow[i];
712: irow_i = irow[i];
713: for (j=0; j<jmax; j++) {
714: row = irow_i[j];
715: #if defined (PETSC_USE_CTABLE)
716: PetscGetProc(row,size,c->rowners,&proc);
717: #else
718: proc = rtable[row];
719: #endif
720: w4[proc]++;
721: }
722: for (j=0; j<size; j++) {
723: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
724: }
725: }
727: nrqs = 0; /* no of outgoing messages */
728: msz = 0; /* total mesg length for all procs */
729: w1[rank] = 0; /* no mesg sent to itself */
730: w3[rank] = 0;
731: for (i=0; i<size; i++) {
732: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
733: }
734: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
735: for (i=0,j=0; i<size; i++) {
736: if (w1[i]) { pa[j] = i; j++; }
737: }
739: /* Each message would have a header = 1 + 2*(no of IS) + data */
740: for (i=0; i<nrqs; i++) {
741: j = pa[i];
742: w1[j] += w2[j] + 2* w3[j];
743: msz += w1[j];
744: }
746: /* Determine the number of messages to expect, their lengths, and the from-ids */
747: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
748: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
750: /* Now post the Irecvs corresponding to these messages */
751: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
752:
753: PetscFree(onodes1);
754: PetscFree(olengths1);
756: /* Allocate Memory for outgoing messages */
757: len = 2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt);
758: PetscMalloc(len,&sbuf1);
759: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
760: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
761: /* allocate memory for outgoing data + buf to receive the first reply */
762: tmp = (PetscInt*)(ptr + size);
763: ctr = tmp + 2*msz;
765: {
766: PetscInt *iptr = tmp,ict = 0;
767: for (i=0; i<nrqs; i++) {
768: j = pa[i];
769: iptr += ict;
770: sbuf1[j] = iptr;
771: ict = w1[j];
772: }
773: }
775: /* Form the outgoing messages */
776: /* Initialise the header space */
777: for (i=0; i<nrqs; i++) {
778: j = pa[i];
779: sbuf1[j][0] = 0;
780: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
781: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
782: }
783:
784: /* Parse the isrow and copy data into outbuf */
785: for (i=0; i<ismax; i++) {
786: PetscMemzero(ctr,size*sizeof(PetscInt));
787: irow_i = irow[i];
788: jmax = nrow[i];
789: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
790: row = irow_i[j];
791: #if defined (PETSC_USE_CTABLE)
792: PetscGetProc(row,size,c->rowners,&proc);
793: #else
794: proc = rtable[row];
795: #endif
796: if (proc != rank) { /* copy to the outgoing buf*/
797: ctr[proc]++;
798: *ptr[proc] = row;
799: ptr[proc]++;
800: }
801: }
802: /* Update the headers for the current IS */
803: for (j=0; j<size; j++) { /* Can Optimise this loop too */
804: if ((ctr_j = ctr[j])) {
805: sbuf1_j = sbuf1[j];
806: k = ++sbuf1_j[0];
807: sbuf1_j[2*k] = ctr_j;
808: sbuf1_j[2*k-1] = i;
809: }
810: }
811: }
813: /* Now post the sends */
814: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
815: for (i=0; i<nrqs; ++i) {
816: j = pa[i];
817: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
818: }
820: /* Post Receives to capture the buffer size */
821: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
822: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
823: rbuf2[0] = tmp + msz;
824: for (i=1; i<nrqs; ++i) {
825: j = pa[i];
826: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
827: }
828: for (i=0; i<nrqs; ++i) {
829: j = pa[i];
830: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
831: }
833: /* Send to other procs the buf size they should allocate */
835: /* Receive messages*/
836: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
837: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
838: len = 2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*);
839: PetscMalloc(len,&sbuf2);
840: req_size = (PetscInt*)(sbuf2 + nrqr);
841: req_source = req_size + nrqr;
842:
843: {
844: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
845: PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
847: for (i=0; i<nrqr; ++i) {
848: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
849: req_size[idex] = 0;
850: rbuf1_i = rbuf1[idex];
851: start = 2*rbuf1_i[0] + 1;
852: MPI_Get_count(r_status1+i,MPIU_INT,&end);
853: PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);
854: sbuf2_i = sbuf2[idex];
855: for (j=start; j<end; j++) {
856: id = rbuf1_i[j] - rstart;
857: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
858: sbuf2_i[j] = ncols;
859: req_size[idex] += ncols;
860: }
861: req_source[idex] = r_status1[i].MPI_SOURCE;
862: /* form the header */
863: sbuf2_i[0] = req_size[idex];
864: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
865: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
866: }
867: }
868: PetscFree(r_status1);
869: PetscFree(r_waits1);
871: /* recv buffer sizes */
872: /* Receive messages*/
874: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
875: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
876: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
877: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
878: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
880: for (i=0; i<nrqs; ++i) {
881: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
882: PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);
883: PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
884: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
885: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
886: }
887: PetscFree(r_status2);
888: PetscFree(r_waits2);
889:
890: /* Wait on sends1 and sends2 */
891: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
892: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
894: MPI_Waitall(nrqs,s_waits1,s_status1);
895: MPI_Waitall(nrqr,s_waits2,s_status2);
896: PetscFree(s_status1);
897: PetscFree(s_status2);
898: PetscFree(s_waits1);
899: PetscFree(s_waits2);
901: /* Now allocate buffers for a->j, and send them off */
902: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
903: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
904: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
905: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
906:
907: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
908: {
909: for (i=0; i<nrqr; i++) {
910: rbuf1_i = rbuf1[i];
911: sbuf_aj_i = sbuf_aj[i];
912: ct1 = 2*rbuf1_i[0] + 1;
913: ct2 = 0;
914: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
915: kmax = rbuf1[i][2*j];
916: for (k=0; k<kmax; k++,ct1++) {
917: row = rbuf1_i[ct1] - rstart;
918: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
919: ncols = nzA + nzB;
920: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
922: /* load the column indices for this row into cols*/
923: cols = sbuf_aj_i + ct2;
924: for (l=0; l<nzB; l++) {
925: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
926: else break;
927: }
928: imark = l;
929: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
930: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
931: ct2 += ncols;
932: }
933: }
934: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
935: }
936: }
937: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
938: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
940: /* Allocate buffers for a->a, and send them off */
941: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
942: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
943: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
944: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
945:
946: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
947: {
948: for (i=0; i<nrqr; i++) {
949: rbuf1_i = rbuf1[i];
950: sbuf_aa_i = sbuf_aa[i];
951: ct1 = 2*rbuf1_i[0]+1;
952: ct2 = 0;
953: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
954: kmax = rbuf1_i[2*j];
955: for (k=0; k<kmax; k++,ct1++) {
956: row = rbuf1_i[ct1] - rstart;
957: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
958: ncols = nzA + nzB;
959: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
960: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
962: /* load the column values for this row into vals*/
963: vals = sbuf_aa_i+ct2*bs2;
964: for (l=0; l<nzB; l++) {
965: if ((bmap[cworkB[l]]) < cstart) {
966: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
967: }
968: else break;
969: }
970: imark = l;
971: for (l=0; l<nzA; l++) {
972: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
973: }
974: for (l=imark; l<nzB; l++) {
975: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
976: }
977: ct2 += ncols;
978: }
979: }
980: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
981: }
982: }
983: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
984: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
985: PetscFree(rbuf1);
987: /* Form the matrix */
988: /* create col map */
989: {
990: PetscInt *icol_i;
991: #if defined (PETSC_USE_CTABLE)
992: /* Create col map*/
993: PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
994: for (i=0; i<ismax; i++) {
995: PetscTableCreate(ncol[i]+1,&colmaps[i]);
996: }
997: #else
998: len = (1+ismax)*sizeof(PetscInt*)+ ismax*c->Nbs*sizeof(PetscInt);
999: PetscMalloc(len,&cmap);
1000: cmap[0] = (PetscInt*)(cmap + ismax);
1001: PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(PetscInt));
1002: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1003: #endif
1004: for (i=0; i<ismax; i++) {
1005: jmax = ncol[i];
1006: icol_i = icol[i];
1007: #if defined (PETSC_USE_CTABLE)
1008: lcol1_gcol1 = colmaps[i];
1009: for (j=0; j<jmax; j++) {
1010: PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
1011: }
1012: #else
1013: cmap_i = cmap[i];
1014: for (j=0; j<jmax; j++) {
1015: cmap_i[icol_i[j]] = j+1;
1016: }
1017: #endif
1018: }
1019: }
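/* The column map stores localcol+1 for every global block column of iscol[i]
   (cmap[i][gcol] in the dense case, a PetscTable entry otherwise); a lookup that returns 0
   means the column is not in this submatrix, and users of the map subtract 1 to recover the
   local column */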
1021: /* Create lens which is required for MatCreate... */
1022: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1023: len = (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt);
1024: PetscMalloc(len,&lens);
1025: lens[0] = (PetscInt*)(lens + ismax);
1026: PetscMemzero(lens[0],j*sizeof(PetscInt));
1027: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1028:
1029: /* Update lens from local data */
1030: for (i=0; i<ismax; i++) {
1031: jmax = nrow[i];
1032: #if defined (PETSC_USE_CTABLE)
1033: lcol1_gcol1 = colmaps[i];
1034: #else
1035: cmap_i = cmap[i];
1036: #endif
1037: irow_i = irow[i];
1038: lens_i = lens[i];
1039: for (j=0; j<jmax; j++) {
1040: row = irow_i[j];
1041: #if defined (PETSC_USE_CTABLE)
1042: PetscGetProc(row,size,c->rowners,&proc);
1043: #else
1044: proc = rtable[row];
1045: #endif
1046: if (proc == rank) {
1047: /* Get indices from matA and then from matB */
1048: row = row - rstart;
1049: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1050: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1051: #if defined (PETSC_USE_CTABLE)
1052: for (k=0; k<nzA; k++) {
1053: PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1054: if (tt) { lens_i[j]++; }
1055: }
1056: for (k=0; k<nzB; k++) {
1057: PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1058: if (tt) { lens_i[j]++; }
1059: }
1060: #else
1061: for (k=0; k<nzA; k++) {
1062: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1063: }
1064: for (k=0; k<nzB; k++) {
1065: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1066: }
1067: #endif
1068: }
1069: }
1070: }
1071: #if defined (PETSC_USE_CTABLE)
1072: /* Create row map*/
1073: PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1074: for (i=0; i<ismax; i++){
1075: PetscTableCreate(nrow[i]+1,&rowmaps[i]);
1076: }
1077: #else
1078: /* Create row map*/
1079: len = (1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt);
1080: PetscMalloc(len,&rmap);
1081: rmap[0] = (PetscInt*)(rmap + ismax);
1082: PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));
1083: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1084: #endif
1085: for (i=0; i<ismax; i++) {
1086: irow_i = irow[i];
1087: jmax = nrow[i];
1088: #if defined (PETSC_USE_CTABLE)
1089: lrow1_grow1 = rowmaps[i];
1090: for (j=0; j<jmax; j++) {
1091: PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1092: }
1093: #else
1094: rmap_i = rmap[i];
1095: for (j=0; j<jmax; j++) {
1096: rmap_i[irow_i[j]] = j;
1097: }
1098: #endif
1099: }
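/* The row map translates a global block row into its local row in the submatrix: the dense
   rmap[i][grow] stores the local row directly, while the PetscTable variant stores localrow+1
   (0 means not found) and its lookups subtract 1 */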
1101: /* Update lens from offproc data */
1102: {
1103: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1104: PetscMPIInt ii;
1106: for (tmp2=0; tmp2<nrqs; tmp2++) {
1107: MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);
1108: idex = pa[ii];
1109: sbuf1_i = sbuf1[idex];
1110: jmax = sbuf1_i[0];
1111: ct1 = 2*jmax+1;
1112: ct2 = 0;
1113: rbuf2_i = rbuf2[ii];
1114: rbuf3_i = rbuf3[ii];
1115: for (j=1; j<=jmax; j++) {
1116: is_no = sbuf1_i[2*j-1];
1117: max1 = sbuf1_i[2*j];
1118: lens_i = lens[is_no];
1119: #if defined (PETSC_USE_CTABLE)
1120: lcol1_gcol1 = colmaps[is_no];
1121: lrow1_grow1 = rowmaps[is_no];
1122: #else
1123: cmap_i = cmap[is_no];
1124: rmap_i = rmap[is_no];
1125: #endif
1126: for (k=0; k<max1; k++,ct1++) {
1127: #if defined (PETSC_USE_CTABLE)
1128: PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1129: row--;
1130: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1131: #else
1132: row = rmap_i[sbuf1_i[ct1]]; /* the row number in the new submatrix */
1133: #endif
1134: max2 = rbuf2_i[ct1];
1135: for (l=0; l<max2; l++,ct2++) {
1136: #if defined (PETSC_USE_CTABLE)
1137: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1138: if (tt) {
1139: lens_i[row]++;
1140: }
1141: #else
1142: if (cmap_i[rbuf3_i[ct2]]) {
1143: lens_i[row]++;
1144: }
1145: #endif
1146: }
1147: }
1148: }
1149: }
1150: }
1151: PetscFree(r_status3);
1152: PetscFree(r_waits3);
1153: MPI_Waitall(nrqr,s_waits3,s_status3);
1154: PetscFree(s_status3);
1155: PetscFree(s_waits3);
1157: /* Create the submatrices */
1158: if (scall == MAT_REUSE_MATRIX) {
1159: /*
1160: Assumes new rows are same length as the old rows, hence bug!
1161: */
1162: for (i=0; i<ismax; i++) {
1163: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1164: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->bs != bs)) {
1165: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1166: }
1167: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);
1168: if (flag == PETSC_FALSE) {
1169: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1170: }
1172: /* Initialize the matrix as if it were empty */
1172: PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));
1173: submats[i]->factor = C->factor;
1174: }
1175: } else {
1176: for (i=0; i<ismax; i++) {
1177: MatCreate(PETSC_COMM_SELF,nrow[i]*bs,ncol[i]*bs,nrow[i]*bs,ncol[i]*bs,submats+i);
1178: MatSetType(submats[i],A->type_name);
1179: MatSeqBAIJSetPreallocation(submats[i],C->bs,0,lens[i]);
1180: #if !defined(PETSC_USE_64BIT_INT)
1181: MatSeqSBAIJSetPreallocation(submats[i],C->bs,0,lens[i]);
1182: #endif
1183: }
1184: }
1186: /* Assemble the matrices */
1187: /* First assemble the local rows */
1188: {
1189: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1190: MatScalar *imat_a;
1191:
1192: for (i=0; i<ismax; i++) {
1193: mat = (Mat_SeqBAIJ*)submats[i]->data;
1194: imat_ilen = mat->ilen;
1195: imat_j = mat->j;
1196: imat_i = mat->i;
1197: imat_a = mat->a;
1199: #if defined (PETSC_USE_CTABLE)
1200: lcol1_gcol1 = colmaps[i];
1201: lrow1_grow1 = rowmaps[i];
1202: #else
1203: cmap_i = cmap[i];
1204: rmap_i = rmap[i];
1205: #endif
1206: irow_i = irow[i];
1207: jmax = nrow[i];
1208: for (j=0; j<jmax; j++) {
1209: row = irow_i[j];
1210: #if defined (PETSC_USE_CTABLE)
1211: PetscGetProc(row,size,c->rowners,&proc);
1212: #else
1213: proc = rtable[row];
1214: #endif
1215: if (proc == rank) {
1216: row = row - rstart;
1217: nzA = a_i[row+1] - a_i[row];
1218: nzB = b_i[row+1] - b_i[row];
1219: cworkA = a_j + a_i[row];
1220: cworkB = b_j + b_i[row];
1221: vworkA = a_a + a_i[row]*bs2;
1222: vworkB = b_a + b_i[row]*bs2;
1223: #if defined (PETSC_USE_CTABLE)
1224: PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1225: row--;
1226: if (row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1227: #else
1228: row = rmap_i[row + rstart];
1229: #endif
1230: mat_i = imat_i[row];
1231: mat_a = imat_a + mat_i*bs2;
1232: mat_j = imat_j + mat_i;
1233: ilen_row = imat_ilen[row];
1235: /* load the column indices for this row into cols*/
1236: for (l=0; l<nzB; l++) {
1237: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1238: #if defined (PETSC_USE_CTABLE)
1239: PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1240: if (tcol) {
1241: #else
1242: if ((tcol = cmap_i[ctmp])) {
1243: #endif
1244: *mat_j++ = tcol - 1;
1245: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1246: mat_a += bs2;
1247: ilen_row++;
1248: }
1249: } else break;
1250: }
1251: imark = l;
1252: for (l=0; l<nzA; l++) {
1253: #if defined (PETSC_USE_CTABLE)
1254: PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1255: if (tcol) {
1256: #else
1257: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1258: #endif
1259: *mat_j++ = tcol - 1;
1260: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1261: mat_a += bs2;
1262: ilen_row++;
1263: }
1264: }
1265: for (l=imark; l<nzB; l++) {
1266: #if defined (PETSC_USE_CTABLE)
1267: PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1268: if (tcol) {
1269: #else
1270: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1271: #endif
1272: *mat_j++ = tcol - 1;
1273: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1274: mat_a += bs2;
1275: ilen_row++;
1276: }
1277: }
1278: imat_ilen[row] = ilen_row;
1279: }
1280: }
1281:
1282: }
1283: }
1285: /* Now assemble the off proc rows*/
1286: {
1287: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1288: PetscInt *imat_j,*imat_i;
1289: MatScalar *imat_a,*rbuf4_i;
1290: PetscMPIInt ii;
1292: for (tmp2=0; tmp2<nrqs; tmp2++) {
1293: MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);
1294: idex = pa[ii];
1295: sbuf1_i = sbuf1[idex];
1296: jmax = sbuf1_i[0];
1297: ct1 = 2*jmax + 1;
1298: ct2 = 0;
1299: rbuf2_i = rbuf2[ii];
1300: rbuf3_i = rbuf3[ii];
1301: rbuf4_i = rbuf4[ii];
1302: for (j=1; j<=jmax; j++) {
1303: is_no = sbuf1_i[2*j-1];
1304: #if defined (PETSC_USE_CTABLE)
1305: lrow1_grow1 = rowmaps[is_no];
1306: lcol1_gcol1 = colmaps[is_no];
1307: #else
1308: rmap_i = rmap[is_no];
1309: cmap_i = cmap[is_no];
1310: #endif
1311: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1312: imat_ilen = mat->ilen;
1313: imat_j = mat->j;
1314: imat_i = mat->i;
1315: imat_a = mat->a;
1316: max1 = sbuf1_i[2*j];
1317: for (k=0; k<max1; k++,ct1++) {
1318: row = sbuf1_i[ct1];
1319: #if defined (PETSC_USE_CTABLE)
1320: PetscTableFind(lrow1_grow1,row+1,&row);
1321: row--;
1322: if(row < 0) { SETERRQ(PETSC_ERR_PLIB,"row not found in table"); }
1323: #else
1324: row = rmap_i[row];
1325: #endif
1326: ilen = imat_ilen[row];
1327: mat_i = imat_i[row];
1328: mat_a = imat_a + mat_i*bs2;
1329: mat_j = imat_j + mat_i;
1330: max2 = rbuf2_i[ct1];
1331: for (l=0; l<max2; l++,ct2++) {
1332: #if defined (PETSC_USE_CTABLE)
1333: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1334: if (tcol) {
1335: #else
1336: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1337: #endif
1338: *mat_j++ = tcol - 1;
1339: /* *mat_a++= rbuf4_i[ct2]; */
1340: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1341: mat_a += bs2;
1342: ilen++;
1343: }
1344: }
1345: imat_ilen[row] = ilen;
1346: }
1347: }
1348: }
1349: }
1350: PetscFree(r_status4);
1351: PetscFree(r_waits4);
1352: MPI_Waitall(nrqr,s_waits4,s_status4);
1353: PetscFree(s_waits4);
1354: PetscFree(s_status4);
1356: /* Restore the indices */
1357: for (i=0; i<ismax; i++) {
1358: ISRestoreIndices(isrow[i],irow+i);
1359: ISRestoreIndices(iscol[i],icol+i);
1360: }
1362: /* Destroy allocated memory */
1363: PetscFree(irow);
1364: PetscFree(w1);
1365: PetscFree(pa);
1367: PetscFree(sbuf1);
1368: PetscFree(rbuf2);
1369: for (i=0; i<nrqr; ++i) {
1370: PetscFree(sbuf2[i]);
1371: }
1372: for (i=0; i<nrqs; ++i) {
1373: PetscFree(rbuf3[i]);
1374: PetscFree(rbuf4[i]);
1375: }
1377: PetscFree(sbuf2);
1378: PetscFree(rbuf3);
1379: PetscFree(rbuf4);
1380: PetscFree(sbuf_aj[0]);
1381: PetscFree(sbuf_aj);
1382: PetscFree(sbuf_aa[0]);
1383: PetscFree(sbuf_aa);
1385: #if defined (PETSC_USE_CTABLE)
1386: for (i=0; i<ismax; i++){
1387: PetscTableDelete(rowmaps[i]);
1388: PetscTableDelete(colmaps[i]);
1389: }
1390: PetscFree(colmaps);
1391: PetscFree(rowmaps);
1392: #else
1393: PetscFree(rmap);
1394: PetscFree(cmap);
1395: #endif
1396: PetscFree(lens);
1398: for (i=0; i<ismax; i++) {
1399: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1400: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1401: }
1403: return(0);
1404: }