Actual source code: baijov.c
1: /*$Id: baijov.c,v 1.65 2001/08/06 21:15:42 bsmith Exp $*/
3: /*
4: Routines to compute overlapping regions of a parallel MPI matrix
5: and to find submatrices that are shared across processors.
6: */
7: #include "src/mat/impls/baij/mpi/mpibaij.h"
8: #include "petscbt.h"
10: static int MatIncreaseOverlap_MPIBAIJ_Once(Mat,int,IS *);
11: static int MatIncreaseOverlap_MPIBAIJ_Local(Mat,int,char **,int*,int**);
12: static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat,int,int **,int**,int*);
13: EXTERN int MatGetRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
14: EXTERN int MatRestoreRow_MPIBAIJ(Mat,int,int*,int**,PetscScalar**);
15:
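/*
   Illustrative sketch: how the routines below are typically reached through the
   public interface, for example when building overlapping submatrices for an
   additive Schwarz preconditioner.  The parallel BAIJ matrix A, the local row
   list rows[] and its length nloc are assumed to exist already.
*/
#if 0
{
  IS  is;
  Mat *submat;

  ISCreateGeneral(PETSC_COMM_SELF,nloc,rows,&is);            /* the caller's local index set */
  MatIncreaseOverlap(A,1,&is,2);                             /* dispatches to MatIncreaseOverlap_MPIBAIJ below, overlap = 2 */
  ISSort(is);                                                /* MatGetSubMatrices_MPIBAIJ requires sorted column indices */
  MatGetSubMatrices(A,1,&is,&is,MAT_INITIAL_MATRIX,&submat); /* dispatches to MatGetSubMatrices_MPIBAIJ below */
}
#endif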
18: static int MatCompressIndicesGeneral_MPIBAIJ(Mat C,int imax,const IS is_in[],IS is_out[])
19: {
20: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
21: int ierr,isz,bs = baij->bs,n,i,j,*idx,ival;
22: #if defined (PETSC_USE_CTABLE)
23: PetscTable gid1_lid1;
24: int tt, gid1, *nidx;
25: PetscTablePosition tpos;
26: #else
27: int Nbs,*nidx;
28: PetscBT table;
29: #endif
32: #if defined (PETSC_USE_CTABLE)
33: PetscTableCreate(baij->mbs,&gid1_lid1);
34: #else
35: Nbs = baij->Nbs;
36: PetscMalloc((Nbs+1)*sizeof(int),&nidx);
37: PetscBTCreate(Nbs,table);
38: #endif
39: for (i=0; i<imax; i++) {
40: isz = 0;
41: #if defined (PETSC_USE_CTABLE)
42: PetscTableRemoveAll(gid1_lid1);
43: #else
44: PetscBTMemzero(Nbs,table);
45: #endif
46: ISGetIndices(is_in[i],&idx);
47: ISGetLocalSize(is_in[i],&n);
48: for (j=0; j<n ; j++) {
49: ival = idx[j]/bs; /* convert the indices into block indices */
50: #if defined (PETSC_USE_CTABLE)
51: PetscTableFind(gid1_lid1,ival+1,&tt);
52: if (!tt) {
53: PetscTableAdd(gid1_lid1,ival+1,isz+1);
54: isz++;
55: }
56: #else
57: if (ival>Nbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
58: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
59: #endif
60: }
61: ISRestoreIndices(is_in[i],&idx);
62: #if defined (PETSC_USE_CTABLE)
63: PetscMalloc((isz+1)*sizeof(int),&nidx);
64: PetscTableGetHeadPosition(gid1_lid1,&tpos);
65: j = 0;
66: while (tpos) {
67: PetscTableGetNext(gid1_lid1,&tpos,&gid1,&tt);
68: if (tt-- > isz) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than array-dim"); }
69: nidx[tt] = gid1 - 1;
70: j++;
71: }
72: if (j != isz) { SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"table error: jj != isz"); }
73: ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));
74: PetscFree(nidx);
75: #else
76: ISCreateGeneral(PETSC_COMM_SELF,isz,nidx,(is_out+i));
77: #endif
78: }
79: #if defined (PETSC_USE_CTABLE)
80: PetscTableDelete(gid1_lid1);
81: #else
82: PetscBTDestroy(table);
83: PetscFree(nidx);
84: #endif
85: return(0);
86: }
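/*
   Illustrative sketch (PETSc-free, with a linear search standing in for the
   PetscBT/PetscTable lookup used above): the compression maps point indices to
   block indices and keeps each block once.  For bs = 2 the point indices
   {0,1,4,5,8} compress to the block indices {0,2,4}.
*/
#if 0
static int CompressIndicesSketch(int bs,int n,const int idx[],int nidx[])
{
  int i,j,isz = 0,ival,found;
  for (i=0; i<n; i++) {
    ival = idx[i]/bs;                                    /* point index -> block index */
    for (j=0,found=0; j<isz; j++) {
      if (nidx[j] == ival) {found = 1; break;}
    }
    if (!found) nidx[isz++] = ival;                      /* record each block index once */
  }
  return isz;                                            /* number of distinct block indices */
}
#endif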
90: static int MatCompressIndicesSorted_MPIBAIJ(Mat C,int imax,const IS is_in[],IS is_out[])
91: {
92: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
93: int ierr,bs=baij->bs,i,j,k,val,n,*idx,*nidx,*idx_local;
94: PetscTruth flg;
95: #if defined (PETSC_USE_CTABLE)
96: int maxsz;
97: #else
98: int Nbs=baij->Nbs;
99: #endif
101: for (i=0; i<imax; i++) {
102: ISSorted(is_in[i],&flg);
103: if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Indices are not sorted");
104: }
105: #if defined (PETSC_USE_CTABLE)
106: /* Now check max size */
107: for (i=0,maxsz=0; i<imax; i++) {
108: ISGetIndices(is_in[i],&idx);
109: ISGetLocalSize(is_in[i],&n);
110: if (n%bs !=0) SETERRQ(1,"Indices are not block ordered");
111: n = n/bs; /* The reduced index size */
112: if (n > maxsz) maxsz = n;
113: }
114: PetscMalloc((maxsz+1)*sizeof(int),&nidx);
115: #else
116: PetscMalloc((Nbs+1)*sizeof(int),&nidx);
117: #endif
118: /* Now check if the indices are in block order */
119: for (i=0; i<imax; i++) {
120: ISGetIndices(is_in[i],&idx);
121: ISGetLocalSize(is_in[i],&n);
122: if (n%bs !=0) SETERRQ(1,"Indices are not block ordered");
124: n = n/bs; /* The reduced index size */
125: idx_local = idx;
126: for (j=0; j<n ; j++) {
127: val = idx_local[0];
128: if (val%bs != 0) SETERRQ(1,"Indices are not block ordered");
129: for (k=0; k<bs; k++) {
130: if (val+k != idx_local[k]) SETERRQ(1,"Indices are not block ordered");
131: }
132: nidx[j] = val/bs;
133: idx_local +=bs;
134: }
135: ISRestoreIndices(is_in[i],&idx);
136: ISCreateGeneral(PETSC_COMM_SELF,n,nidx,(is_out+i));
137: }
138: PetscFree(nidx);
140: return(0);
141: }
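/*
   Illustrative sketch of the "block ordered" requirement checked above: every block
   must appear completely and consecutively.  With bs = 3, {9,10,11,21,22,23} is block
   ordered (blocks {3,7}); {9,10,12,21,22,23} is not, since block 3 is incomplete.
*/
#if 0
static int IsBlockOrderedSketch(int bs,int n,const int idx[])
{
  int j,k;
  if (n % bs) return 0;                                  /* must consist of whole blocks */
  for (j=0; j<n; j+=bs) {
    if (idx[j] % bs) return 0;                           /* block must start at a multiple of bs */
    for (k=1; k<bs; k++) {
      if (idx[j+k] != idx[j]+k) return 0;                /* and list its bs entries consecutively */
    }
  }
  return 1;
}
#endif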
145: static int MatExpandIndices_MPIBAIJ(Mat C,int imax,const IS is_in[],IS is_out[])
146: {
147: Mat_MPIBAIJ *baij = (Mat_MPIBAIJ*)C->data;
148: int ierr,bs = baij->bs,n,i,j,k,*idx,*nidx;
149: #if defined (PETSC_USE_CTABLE)
150: int maxsz;
151: #else
152: int Nbs = baij->Nbs;
153: #endif
156:
157: #if defined (PETSC_USE_CTABLE)
158: /* Now check max size */
159: for (i=0,maxsz=0; i<imax; i++) {
160: ISGetIndices(is_in[i],&idx);
161: ISGetLocalSize(is_in[i],&n);
162: if (n*bs > maxsz) maxsz = n*bs;
163: }
164: PetscMalloc((maxsz+1)*sizeof(int),&nidx);
165: #else
166: PetscMalloc((Nbs*bs+1)*sizeof(int),&nidx);
167: #endif
169: for (i=0; i<imax; i++) {
170: ISGetIndices(is_in[i],&idx);
171: ISGetLocalSize(is_in[i],&n);
172: for (j=0; j<n ; ++j){
173: for (k=0; k<bs; k++)
174: nidx[j*bs+k] = idx[j]*bs+k;
175: }
176: ISRestoreIndices(is_in[i],&idx);
177: ISCreateGeneral(PETSC_COMM_SELF,n*bs,nidx,is_out+i);
178: }
179: PetscFree(nidx);
180: return(0);
181: }
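/*
   Illustrative example: the expansion above is the inverse of the compression; for
   bs = 2 the block indices {0,2,4} expand back to the point indices {0,1,4,5,8,9}.
*/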
186: int MatIncreaseOverlap_MPIBAIJ(Mat C,int imax,IS is[],int ov)
187: {
188: int i,ierr;
189: IS *is_new;
192: PetscMalloc(imax*sizeof(IS),&is_new);
193: /* Convert the indices into block format */
194: MatCompressIndicesGeneral_MPIBAIJ(C,imax,is,is_new);
195: if (ov < 0){ SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");}
196: for (i=0; i<ov; ++i) {
197: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
198: }
199: for (i=0; i<imax; i++) {ISDestroy(is[i]);}
200: MatExpandIndices_MPIBAIJ(C,imax,is_new,is);
201: for (i=0; i<imax; i++) {ISDestroy(is_new[i]);}
202: PetscFree(is_new);
203: return(0);
204: }
206: /*
207: Sample message format:
208: If a processor A wants processor B to process some elements corresponding
209: to index sets is[1], is[5]
210: mesg [0] = 2 (no of index sets in the mesg)
211: -----------
212: mesg [1] = 1 => is[1]
213: mesg [2] = sizeof(is[1]);
214: -----------
215: mesg [3] = 5 => is[5]
216: mesg [4] = sizeof(is[5]);
217: -----------
218: mesg [5]
219: mesg [n] data(is[1])
220: -----------
221: mesg[n+1]
222: mesg[m] data(is[5])
223: -----------
224:
225: Notes:
226: nrqs - no of requests sent (or to be sent out)
227: nrqr - no of requests received (which have to be or which have been processed)
228: */
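/*
   Illustrative example of the format described above (made-up numbers): if this
   process sends rows {3,7} on behalf of is[1] and rows {2,5,6} on behalf of is[5],
   the outgoing buffer has length 1 + 2*2 + 5 = 10 and reads

      {2,  1,2,  5,3,  3,7,  2,5,6}
       |   |     |     |     +----- data for is[5]
       |   |     |     +----------- data for is[1]
       |   |     +----------------- is[5] contributes 3 rows
       |   +----------------------- is[1] contributes 2 rows
       +--------------------------- two index sets in this message
*/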
231: static int MatIncreaseOverlap_MPIBAIJ_Once(Mat C,int imax,IS is[])
232: {
233: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
234: int **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
235: int size,rank,Mbs,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
236: int *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
237: int *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
238: PetscBT *table;
239: MPI_Comm comm;
240: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
241: MPI_Status *s_status,*recv_status;
244: comm = C->comm;
245: size = c->size;
246: rank = c->rank;
247: Mbs = c->Mbs;
249: PetscObjectGetNewTag((PetscObject)C,&tag1);
250: PetscObjectGetNewTag((PetscObject)C,&tag2);
252: len = (imax+1)*sizeof(int*)+ (imax + Mbs)*sizeof(int);
253: PetscMalloc(len,&idx);
254: n = (int*)(idx + imax);
255: rtable = n + imax;
256:
257: for (i=0; i<imax; i++) {
258: ISGetIndices(is[i],&idx[i]);
259: ISGetLocalSize(is[i],&n[i]);
260: }
261:
262: /* Create hash table for the mapping :row -> proc*/
263: for (i=0,j=0; i<size; i++) {
264: len = c->rowners[i+1];
265: for (; j<len; j++) {
266: rtable[j] = i;
267: }
268: }
270: /* evaluate communication - mesg to whom, length of mesg, and buffer space
271: required. Based on this, buffers are allocated, and data copied into them*/
272: PetscMalloc(size*4*sizeof(int),&w1);/* mesg size */
273: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
274: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
275: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
276: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
277: for (i=0; i<imax; i++) {
278: PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
279: idx_i = idx[i];
280: len = n[i];
281: for (j=0; j<len; j++) {
282: row = idx_i[j];
283: if (row < 0) {
284: SETERRQ(1,"Index set cannot have negative entries");
285: }
286: proc = rtable[row];
287: w4[proc]++;
288: }
289: for (j=0; j<size; j++){
290: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
291: }
292: }
294: nrqs = 0; /* no of outgoing messages */
295: msz = 0; /* total mesg length (for all procs) */
296: w1[rank] = 0; /* no mesg sent to itself */
297: w3[rank] = 0;
298: for (i=0; i<size; i++) {
299: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
300: }
301: /* pa - is list of processors to communicate with */
302: PetscMalloc((nrqs+1)*sizeof(int),&pa);
303: for (i=0,j=0; i<size; i++) {
304: if (w1[i]) {pa[j] = i; j++;}
305: }
307: /* Each message would have a header = 1 + 2*(no of IS) + data */
308: for (i=0; i<nrqs; i++) {
309: j = pa[i];
310: w1[j] += w2[j] + 2*w3[j];
311: msz += w1[j];
312: }
313:
314: /* Determine the number of messages to expect, their lengths, from from-ids */
315: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
316: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
318: /* Now post the Irecvs corresponding to these messages */
319: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
320:
321: /* Allocate Memory for outgoing messages */
322: len = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
323: PetscMalloc(len,&outdat);
324: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
325: PetscMemzero(outdat,2*size*sizeof(int*));
326: tmp = (int*)(outdat + 2*size);
327: ctr = tmp + msz;
329: {
330: int *iptr = tmp,ict = 0;
331: for (i=0; i<nrqs; i++) {
332: j = pa[i];
333: iptr += ict;
334: outdat[j] = iptr;
335: ict = w1[j];
336: }
337: }
339: /* Form the outgoing messages */
340: /*plug in the headers*/
341: for (i=0; i<nrqs; i++) {
342: j = pa[i];
343: outdat[j][0] = 0;
344: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));
345: ptr[j] = outdat[j] + 2*w3[j] + 1;
346: }
347:
348: /* Memory for doing local proc's work*/
349: {
350: int *d_p;
351: char *t_p;
353: len = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
354: (Mbs)*imax*sizeof(int) + (Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
355: PetscMalloc(len,&table);
356: PetscMemzero(table,len);
357: data = (int **)(table + imax);
358: isz = (int *)(data + imax);
359: d_p = (int *)(isz + imax);
360: t_p = (char *)(d_p + Mbs*imax);
361: for (i=0; i<imax; i++) {
362: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
363: data[i] = d_p + (Mbs)*i;
364: }
365: }
367: /* Parse the IS and update local tables and the outgoing buf with the data*/
368: {
369: int n_i,*data_i,isz_i,*outdat_j,ctr_j;
370: PetscBT table_i;
372: for (i=0; i<imax; i++) {
373: PetscMemzero(ctr,size*sizeof(int));
374: n_i = n[i];
375: table_i = table[i];
376: idx_i = idx[i];
377: data_i = data[i];
378: isz_i = isz[i];
379: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
380: row = idx_i[j];
381: proc = rtable[row];
382: if (proc != rank) { /* copy to the outgoing buffer */
383: ctr[proc]++;
384: *ptr[proc] = row;
385: ptr[proc]++;
386: }
387: else { /* Update the local table */
388: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
389: }
390: }
391: /* Update the headers for the current IS */
392: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
393: if ((ctr_j = ctr[j])) {
394: outdat_j = outdat[j];
395: k = ++outdat_j[0];
396: outdat_j[2*k] = ctr_j;
397: outdat_j[2*k-1] = i;
398: }
399: }
400: isz[i] = isz_i;
401: }
402: }
403:
404: /* Now post the sends */
405: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
406: for (i=0; i<nrqs; ++i) {
407: j = pa[i];
408: MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);
409: }
410:
411: /* No longer need the original indices*/
412: for (i=0; i<imax; ++i) {
413: ISRestoreIndices(is[i],idx+i);
414: }
415: PetscFree(idx);
417: for (i=0; i<imax; ++i) {
418: ISDestroy(is[i]);
419: }
420:
421: /* Do Local work*/
422: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
424: /* Receive messages*/
425: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
426: MPI_Waitall(nrqr,r_waits1,recv_status);
427:
428: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
429: MPI_Waitall(nrqs,s_waits1,s_status);
431: /* Phase 1 sends are complete - deallocate buffers */
432: PetscFree(outdat);
433: PetscFree(w1);
435: PetscMalloc((nrqr+1)*sizeof(int *),&xdata);
436: PetscMalloc((nrqr+1)*sizeof(int),&isz1);
437: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
438: PetscFree(rbuf);
440: /* Send the data back*/
441: /* Do a global reduction to know the buffer space req for incoming messages*/
442: {
443: int *rw1;
444:
445: PetscMalloc(size*sizeof(int),&rw1);
446: PetscMemzero(rw1,size*sizeof(int));
448: for (i=0; i<nrqr; ++i) {
449: proc = recv_status[i].MPI_SOURCE;
450: if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
451: rw1[proc] = isz1[i];
452: }
453:
454: PetscFree(onodes1);
455: PetscFree(olengths1);
457: /* Determine the number of messages to expect, their lengths, from from-ids */
458: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
459: PetscFree(rw1);
460: }
461: /* Now post the Irecvs corresponding to these messages */
462: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
463:
464: /* Now post the sends */
465: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
466: for (i=0; i<nrqr; ++i) {
467: j = recv_status[i].MPI_SOURCE;
468: MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);
469: }
471: /* receive work done on other processors*/
472: {
473: int idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
474: PetscBT table_i;
475: MPI_Status *status2;
476:
477: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
479: for (i=0; i<nrqs; ++i) {
480: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
481: /* Process the message*/
482: rbuf2_i = rbuf2[idex];
483: ct1 = 2*rbuf2_i[0]+1;
484: jmax = rbuf2[idex][0];
485: for (j=1; j<=jmax; j++) {
486: max = rbuf2_i[2*j];
487: is_no = rbuf2_i[2*j-1];
488: isz_i = isz[is_no];
489: data_i = data[is_no];
490: table_i = table[is_no];
491: for (k=0; k<max; k++,ct1++) {
492: row = rbuf2_i[ct1];
493: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
494: }
495: isz[is_no] = isz_i;
496: }
497: }
498: MPI_Waitall(nrqr,s_waits2,status2);
499: PetscFree(status2);
500: }
501:
502: for (i=0; i<imax; ++i) {
503: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
504: }
505:
506:
507: PetscFree(onodes2);
508: PetscFree(olengths2);
510: PetscFree(pa);
511: PetscFree(rbuf2);
512: PetscFree(s_waits1);
513: PetscFree(r_waits1);
514: PetscFree(s_waits2);
515: PetscFree(r_waits2);
516: PetscFree(table);
517: PetscFree(s_status);
518: PetscFree(recv_status);
519: PetscFree(xdata[0]);
520: PetscFree(xdata);
521: PetscFree(isz1);
522: return(0);
523: }
527: /*
528: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
529: the work on the local processor.
531: Inputs:
532: C - MAT_MPIBAIJ;
533: imax - total no of index sets processed at a time;
534: table - an array of char - size = Mbs bits.
535:
536: Output:
537: isz - array containing the count of the solution elements corresponding
538: to each index set;
539: data - pointer to the solutions
540: */
541: static int MatIncreaseOverlap_MPIBAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
542: {
543: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
544: Mat A = c->A,B = c->B;
545: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
546: int start,end,val,max,rstart,cstart,*ai,*aj;
547: int *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
548: PetscBT table_i;
551: rstart = c->rstart;
552: cstart = c->cstart;
553: ai = a->i;
554: aj = a->j;
555: bi = b->i;
556: bj = b->j;
557: garray = c->garray;
559:
560: for (i=0; i<imax; i++) {
561: data_i = data[i];
562: table_i = table[i];
563: isz_i = isz[i];
564: for (j=0,max=isz[i]; j<max; j++) {
565: row = data_i[j] - rstart;
566: start = ai[row];
567: end = ai[row+1];
568: for (k=start; k<end; k++) { /* Amat */
569: val = aj[k] + cstart;
570: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
571: }
572: start = bi[row];
573: end = bi[row+1];
574: for (k=start; k<end; k++) { /* Bmat */
575: val = garray[bj[k]];
576: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
577: }
578: }
579: isz[i] = isz_i;
580: }
581: return(0);
582: }
585: /*
586: MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
587: and return the output
589: Input:
590: C - the matrix
591: nrqr - no of messages being processed.
592: rbuf - an array of pointers to the received requests
593:
594: Output:
595: xdata - array of messages to be sent back
596: isz1 - size of each message
598: For better efficiency perhaps we should malloc separately each xdata[i],
599: then if a remalloc is required we need only copy the data for that one row
600: rather than all previous rows as it is now, where a single large chunk of
601: memory is used.
603: */
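/*
   Illustrative sketch of the growth step used repeatedly below: all xdata[i] point
   into the single chunk xdata[0], so when it fills up the chunk is grown by roughly
   1.5x, the old contents copied over, and the per-message pointers re-fixed.
*/
#if 0
if (!(ct3 < mem_estimate)) {
  new_estimate = (int)(1.5*mem_estimate)+1;
  PetscMalloc(new_estimate*sizeof(int),&tmp);            /* larger chunk */
  PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));    /* copy what has been packed so far */
  PetscFree(xdata[0]);
  xdata[0] = tmp;
  mem_estimate = new_estimate;
  for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];  /* re-fix pointers into the chunk */
}
#endif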
604: static int MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
605: {
606: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
607: Mat A = c->A,B = c->B;
608: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
609: int rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
610: int row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
611: int val,max1,max2,rank,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
612: int *rbuf_i,kmax,rbuf_0,ierr;
613: PetscBT xtable;
616: rank = c->rank;
617: Mbs = c->Mbs;
618: rstart = c->rstart;
619: cstart = c->cstart;
620: ai = a->i;
621: aj = a->j;
622: bi = b->i;
623: bj = b->j;
624: garray = c->garray;
625:
626:
627: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
628: rbuf_i = rbuf[i];
629: rbuf_0 = rbuf_i[0];
630: ct += rbuf_0;
631: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
632: }
633:
634: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
635: else max1 = 1;
636: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
637: PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);
638: ++no_malloc;
639: PetscBTCreate(Mbs,xtable);
640: PetscMemzero(isz1,nrqr*sizeof(int));
641:
642: ct3 = 0;
643: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
644: rbuf_i = rbuf[i];
645: rbuf_0 = rbuf_i[0];
646: ct1 = 2*rbuf_0+1;
647: ct2 = ct1;
648: ct3 += ct1;
649: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
650: PetscBTMemzero(Mbs,xtable);
651: oct2 = ct2;
652: kmax = rbuf_i[2*j];
653: for (k=0; k<kmax; k++,ct1++) {
654: row = rbuf_i[ct1];
655: if (!PetscBTLookupSet(xtable,row)) {
656: if (!(ct3 < mem_estimate)) {
657: new_estimate = (int)(1.5*mem_estimate)+1;
658: PetscMalloc(new_estimate * sizeof(int),&tmp);
659: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
660: PetscFree(xdata[0]);
661: xdata[0] = tmp;
662: mem_estimate = new_estimate; ++no_malloc;
663: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
664: }
665: xdata[i][ct2++] = row;
666: ct3++;
667: }
668: }
669: for (k=oct2,max2=ct2; k<max2; k++) {
670: row = xdata[i][k] - rstart;
671: start = ai[row];
672: end = ai[row+1];
673: for (l=start; l<end; l++) {
674: val = aj[l] + cstart;
675: if (!PetscBTLookupSet(xtable,val)) {
676: if (!(ct3 < mem_estimate)) {
677: new_estimate = (int)(1.5*mem_estimate)+1;
678: PetscMalloc(new_estimate * sizeof(int),&tmp);
679: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
680: PetscFree(xdata[0]);
681: xdata[0] = tmp;
682: mem_estimate = new_estimate; ++no_malloc;
683: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
684: }
685: xdata[i][ct2++] = val;
686: ct3++;
687: }
688: }
689: start = bi[row];
690: end = bi[row+1];
691: for (l=start; l<end; l++) {
692: val = garray[bj[l]];
693: if (!PetscBTLookupSet(xtable,val)) {
694: if (!(ct3 < mem_estimate)) {
695: new_estimate = (int)(1.5*mem_estimate)+1;
696: PetscMalloc(new_estimate * sizeof(int),&tmp);
697: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));
698: PetscFree(xdata[0]);
699: xdata[0] = tmp;
700: mem_estimate = new_estimate; ++no_malloc;
701: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
702: }
703: xdata[i][ct2++] = val;
704: ct3++;
705: }
706: }
707: }
708: /* Update the header*/
709: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
710: xdata[i][2*j-1] = rbuf_i[2*j-1];
711: }
712: xdata[i][0] = rbuf_0;
713: xdata[i+1] = xdata[i] + ct2;
714: isz1[i] = ct2; /* size of each message */
715: }
716: PetscBTDestroy(xtable);
717: PetscLogInfo(0,"MatIncreaseOverlap_MPIBAIJ:[%d] Allocated %d bytes, required %d, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
718: return(0);
719: }
721: static int MatGetSubMatrices_MPIBAIJ_local(Mat,int,const IS[],const IS[],MatReuse,Mat *);
725: int MatGetSubMatrices_MPIBAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
726: {
727: IS *isrow_new,*iscol_new;
728: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
729: int nmax,nstages_local,nstages,i,pos,max_no,ierr;
732: /* The compression and expansion should be avoided. It does not point
733: out errors and might change the indices, hence buggy */
735: PetscMalloc(2*(ismax+1)*sizeof(IS),&isrow_new);
736: iscol_new = isrow_new + ismax;
737: MatCompressIndicesSorted_MPIBAIJ(C,ismax,isrow,isrow_new);
738: MatCompressIndicesSorted_MPIBAIJ(C,ismax,iscol,iscol_new);
740: /* Allocate memory to hold all the submatrices */
741: if (scall != MAT_REUSE_MATRIX) {
742: PetscMalloc((ismax+1)*sizeof(Mat),submat);
743: }
744: /* Determine the number of stages through which submatrices are done */
745: nmax = 20*1000000 / (c->Nbs * sizeof(int));
746: if (!nmax) nmax = 1;
747: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
748:
749: /* Make sure every processor loops through the nstages */
750: MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);
751: for (i=0,pos=0; i<nstages; i++) {
752: if (pos+nmax <= ismax) max_no = nmax;
753: else if (pos == ismax) max_no = 0;
754: else max_no = ismax-pos;
755: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,*submat+pos);
756: pos += max_no;
757: }
758:
759: for (i=0; i<ismax; i++) {
760: ISDestroy(isrow_new[i]);
761: ISDestroy(iscol_new[i]);
762: }
763: PetscFree(isrow_new);
764: return(0);
765: }
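/*
   Illustrative example of the staging arithmetic above (made-up sizes): with
   Nbs = 1,000,000 block columns and 4-byte ints, nmax = 20,000,000/4,000,000 = 5
   submatrices per stage; requesting ismax = 12 submatrices gives nstages_local = 3
   (stages of 5, 5 and 2), and the MPI_Allreduce makes every process run the same
   number of stages even if its own ismax is smaller.
*/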
767: #if defined (PETSC_USE_CTABLE)
770: int PetscGetProc(const int gid, const int numprocs, const int proc_gnode[], int *proc)
771: {
772: int nGlobalNd = proc_gnode[numprocs];
773: int fproc = (int) ((float)gid * (float)numprocs / (float)nGlobalNd + 0.5);
774:
776: /* if(fproc < 0) SETERRQ(1,"fproc < 0");*/
777: if (fproc > numprocs) fproc = numprocs;
778: while (gid < proc_gnode[fproc] || gid >= proc_gnode[fproc+1]) {
779: if (gid < proc_gnode[fproc]) fproc--;
780: else fproc++;
781: }
782: /* if(fproc<0 || fproc>=numprocs) { SETERRQ(1,"fproc < 0 || fproc >= numprocs"); }*/
783: *proc = fproc;
784: return(0);
785: }
786: #endif
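/*
   Illustrative example of PetscGetProc with a made-up ownership array: for 3
   processes owning block rows [0,4), [4,9) and [9,12), proc_gnode = {0,4,9,12}.
   For gid = 10 the initial guess is fproc = (int)(10*3/12.0 + 0.5) = 3, which the
   loop walks down until proc_gnode[fproc] <= gid < proc_gnode[fproc+1], giving
   *proc = 2.
*/
#if 0
{
  int proc_gnode[] = {0,4,9,12},proc;
  PetscGetProc(10,3,proc_gnode,&proc);                   /* proc == 2 */
}
#endif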
788: /* -------------------------------------------------------------------------*/
791: static int MatGetSubMatrices_MPIBAIJ_local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
792: {
793: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
794: Mat A = c->A;
795: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
796: int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,start,end,size;
797: int **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
798: int nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
799: int **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
800: int **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
801: int len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
802: int bs=c->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
803: int cstart = c->cstart,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
804: int *bmap = c->garray,ctmp,rstart=c->rstart,tag0,tag1,tag2,tag3;
805: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
806: MPI_Request *r_waits4,*s_waits3,*s_waits4;
807: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
808: MPI_Status *r_status3,*r_status4,*s_status4;
809: MPI_Comm comm;
810: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i,*vworkA,*vworkB;
811: MatScalar *a_a=a->a,*b_a=b->a;
812: PetscTruth flag;
813: int *onodes1,*olengths1;
815: #if defined (PETSC_USE_CTABLE)
816: int tt;
817: PetscTable *rowmaps,*colmaps,lrow1_grow1,lcol1_gcol1;
818: #else
819: int **cmap,*cmap_i,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
820: #endif
823: comm = C->comm;
824: tag0 = C->tag;
825: size = c->size;
826: rank = c->rank;
827:
828: /* Get some new tags to keep the communication clean */
829: PetscObjectGetNewTag((PetscObject)C,&tag1);
830: PetscObjectGetNewTag((PetscObject)C,&tag2);
831: PetscObjectGetNewTag((PetscObject)C,&tag3);
833: /* Check if the col indices are sorted */
834: for (i=0; i<ismax; i++) {
835: ISSorted(iscol[i],(PetscTruth*)&j);
836: if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
837: }
839: len = (2*ismax+1)*(sizeof(int*)+ sizeof(int));
840: #if !defined (PETSC_USE_CTABLE)
841: len += (Mbs+1)*sizeof(int);
842: #endif
843: PetscMalloc(len,&irow);
844: icol = irow + ismax;
845: nrow = (int*)(icol + ismax);
846: ncol = nrow + ismax;
847: #if !defined (PETSC_USE_CTABLE)
848: rtable = ncol + ismax;
849: /* Create hash table for the mapping :row -> proc*/
850: for (i=0,j=0; i<size; i++) {
851: jmax = c->rowners[i+1];
852: for (; j<jmax; j++) {
853: rtable[j] = i;
854: }
855: }
856: #endif
857:
858: for (i=0; i<ismax; i++) {
859: ISGetIndices(isrow[i],&irow[i]);
860: ISGetIndices(iscol[i],&icol[i]);
861: ISGetLocalSize(isrow[i],&nrow[i]);
862: ISGetLocalSize(iscol[i],&ncol[i]);
863: }
865: /* evaluate communication - mesg to whom, length of mesg, and buffer space
866: required. Based on this, buffers are allocated, and data copied into them*/
867: PetscMalloc(size*4*sizeof(int),&w1); /* mesg size */
868: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
869: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
870: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
871: PetscMemzero(w1,size*3*sizeof(int)); /* initialise work vector*/
872: for (i=0; i<ismax; i++) {
873: PetscMemzero(w4,size*sizeof(int)); /* initialise work vector*/
874: jmax = nrow[i];
875: irow_i = irow[i];
876: for (j=0; j<jmax; j++) {
877: row = irow_i[j];
878: #if defined (PETSC_USE_CTABLE)
879: PetscGetProc(row,size,c->rowners,&proc);
880: #else
881: proc = rtable[row];
882: #endif
883: w4[proc]++;
884: }
885: for (j=0; j<size; j++) {
886: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
887: }
888: }
890: nrqs = 0; /* no of outgoing messages */
891: msz = 0; /* total mesg length for all proc */
892: w1[rank] = 0; /* no mesg sent to itself */
893: w3[rank] = 0;
894: for (i=0; i<size; i++) {
895: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
896: }
897: PetscMalloc((nrqs+1)*sizeof(int),&pa); /*(proc -array)*/
898: for (i=0,j=0; i<size; i++) {
899: if (w1[i]) { pa[j] = i; j++; }
900: }
902: /* Each message would have a header = 1 + 2*(no of IS) + data */
903: for (i=0; i<nrqs; i++) {
904: j = pa[i];
905: w1[j] += w2[j] + 2* w3[j];
906: msz += w1[j];
907: }
909: /* Determine the number of messages to expect, their lengths, from from-ids */
910: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
911: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
913: /* Now post the Irecvs corresponding to these messages */
914: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
915:
916: PetscFree(onodes1);
917: PetscFree(olengths1);
919: /* Allocate Memory for outgoing messages */
920: len = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
921: PetscMalloc(len,&sbuf1);
922: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
923: PetscMemzero(sbuf1,2*size*sizeof(int*));
924: /* allocate memory for outgoing data + buf to receive the first reply */
925: tmp = (int*)(ptr + size);
926: ctr = tmp + 2*msz;
928: {
929: int *iptr = tmp,ict = 0;
930: for (i=0; i<nrqs; i++) {
931: j = pa[i];
932: iptr += ict;
933: sbuf1[j] = iptr;
934: ict = w1[j];
935: }
936: }
938: /* Form the outgoing messages */
939: /* Initialise the header space */
940: for (i=0; i<nrqs; i++) {
941: j = pa[i];
942: sbuf1[j][0] = 0;
943: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));
944: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
945: }
946:
947: /* Parse the isrow and copy data into outbuf */
948: for (i=0; i<ismax; i++) {
949: PetscMemzero(ctr,size*sizeof(int));
950: irow_i = irow[i];
951: jmax = nrow[i];
952: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
953: row = irow_i[j];
954: #if defined (PETSC_USE_CTABLE)
955: PetscGetProc(row,size,c->rowners,&proc);
956: #else
957: proc = rtable[row];
958: #endif
959: if (proc != rank) { /* copy to the outgoing buf*/
960: ctr[proc]++;
961: *ptr[proc] = row;
962: ptr[proc]++;
963: }
964: }
965: /* Update the headers for the current IS */
966: for (j=0; j<size; j++) { /* Can Optimise this loop too */
967: if ((ctr_j = ctr[j])) {
968: sbuf1_j = sbuf1[j];
969: k = ++sbuf1_j[0];
970: sbuf1_j[2*k] = ctr_j;
971: sbuf1_j[2*k-1] = i;
972: }
973: }
974: }
976: /* Now post the sends */
977: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
978: for (i=0; i<nrqs; ++i) {
979: j = pa[i];
980: MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);
981: }
983: /* Post Receives to capture the buffer size */
984: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
985: PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);
986: rbuf2[0] = tmp + msz;
987: for (i=1; i<nrqs; ++i) {
988: j = pa[i];
989: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
990: }
991: for (i=0; i<nrqs; ++i) {
992: j = pa[i];
993: MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);
994: }
996: /* Send to other procs the buf size they should allocate */
998: /* Receive messages*/
999: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
1000: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
1001: len = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
1002: PetscMalloc(len,&sbuf2);
1003: req_size = (int*)(sbuf2 + nrqr);
1004: req_source = req_size + nrqr;
1005:
1006: {
1007: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
1008: int *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
1010: for (i=0; i<nrqr; ++i) {
1011: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
1012: req_size[idex] = 0;
1013: rbuf1_i = rbuf1[idex];
1014: start = 2*rbuf1_i[0] + 1;
1015: MPI_Get_count(r_status1+i,MPI_INT,&end);
1016: PetscMalloc(end*sizeof(int),&sbuf2[idex]);
1017: sbuf2_i = sbuf2[idex];
1018: for (j=start; j<end; j++) {
1019: id = rbuf1_i[j] - rstart;
1020: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
1021: sbuf2_i[j] = ncols;
1022: req_size[idex] += ncols;
1023: }
1024: req_source[idex] = r_status1[i].MPI_SOURCE;
1025: /* form the header */
1026: sbuf2_i[0] = req_size[idex];
1027: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
1028: MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);
1029: }
1030: }
1031: PetscFree(r_status1);
1032: PetscFree(r_waits1);
1034: /* recv buffer sizes */
1035: /* Receive messages*/
1037: PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);
1038: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
1039: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1040: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1041: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1043: for (i=0; i<nrqs; ++i) {
1044: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1045: PetscMalloc(rbuf2[idex][0]*sizeof(int),&rbuf3[idex]);
1046: PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
1047: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,
1048: r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1049: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,
1050: r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1051: }
1052: PetscFree(r_status2);
1053: PetscFree(r_waits2);
1054:
1055: /* Wait on sends1 and sends2 */
1056: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1057: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1059: MPI_Waitall(nrqs,s_waits1,s_status1);
1060: MPI_Waitall(nrqr,s_waits2,s_status2);
1061: PetscFree(s_status1);
1062: PetscFree(s_status2);
1063: PetscFree(s_waits1);
1064: PetscFree(s_waits2);
1066: /* Now allocate buffers for a->j, and send them off */
1067: PetscMalloc((nrqr+1)*sizeof(int *),&sbuf_aj);
1068: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1069: PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);
1070: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1071:
1072: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1073: {
1074: for (i=0; i<nrqr; i++) {
1075: rbuf1_i = rbuf1[i];
1076: sbuf_aj_i = sbuf_aj[i];
1077: ct1 = 2*rbuf1_i[0] + 1;
1078: ct2 = 0;
1079: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1080: kmax = rbuf1[i][2*j];
1081: for (k=0; k<kmax; k++,ct1++) {
1082: row = rbuf1_i[ct1] - rstart;
1083: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1084: ncols = nzA + nzB;
1085: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1087: /* load the column indices for this row into cols*/
1088: cols = sbuf_aj_i + ct2;
1089: for (l=0; l<nzB; l++) {
1090: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
1091: else break;
1092: }
1093: imark = l;
1094: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
1095: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1096: ct2 += ncols;
1097: }
1098: }
1099: MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);
1100: }
1101: }
1102: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1103: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1105: /* Allocate buffers for a->a, and send them off */
1106: PetscMalloc((nrqr+1)*sizeof(MatScalar *),&sbuf_aa);
1107: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1108: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
1109: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1110:
1111: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1112: {
1113: for (i=0; i<nrqr; i++) {
1114: rbuf1_i = rbuf1[i];
1115: sbuf_aa_i = sbuf_aa[i];
1116: ct1 = 2*rbuf1_i[0]+1;
1117: ct2 = 0;
1118: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1119: kmax = rbuf1_i[2*j];
1120: for (k=0; k<kmax; k++,ct1++) {
1121: row = rbuf1_i[ct1] - rstart;
1122: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1123: ncols = nzA + nzB;
1124: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1125: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
1127: /* load the column values for this row into vals*/
1128: vals = sbuf_aa_i+ct2*bs2;
1129: for (l=0; l<nzB; l++) {
1130: if ((bmap[cworkB[l]]) < cstart) {
1131: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1132: }
1133: else break;
1134: }
1135: imark = l;
1136: for (l=0; l<nzA; l++) {
1137: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
1138: }
1139: for (l=imark; l<nzB; l++) {
1140: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
1141: }
1142: ct2 += ncols;
1143: }
1144: }
1145: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
1146: }
1147: }
1148: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1149: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1150: PetscFree(rbuf1);
1152: /* Form the matrix */
1153: /* create col map */
1154: {
1155: int *icol_i;
1156: #if defined (PETSC_USE_CTABLE)
1157: /* Create row map*/
1158: PetscMalloc((1+ismax)*sizeof(PetscTable),&colmaps);
1159: for (i=0; i<ismax+1; i++) {
1160: PetscTableCreate(((i<ismax) ? ncol[i] : ncol[i-1])+1,&colmaps[i]);
1161: }
1162: #else
1163: len = (1+ismax)*sizeof(int*)+ ismax*c->Nbs*sizeof(int);
1164: PetscMalloc(len,&cmap);
1165: cmap[0] = (int *)(cmap + ismax);
1166: PetscMemzero(cmap[0],(1+ismax*c->Nbs)*sizeof(int));
1167: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->Nbs; }
1168: #endif
1169: for (i=0; i<ismax; i++) {
1170: jmax = ncol[i];
1171: icol_i = icol[i];
1172: #if defined (PETSC_USE_CTABLE)
1173: lcol1_gcol1 = colmaps[i];
1174: for (j=0; j<jmax; j++) {
1175: PetscTableAdd(lcol1_gcol1,icol_i[j]+1,j+1);
1176: }
1177: #else
1178: cmap_i = cmap[i];
1179: for (j=0; j<jmax; j++) {
1180: cmap_i[icol_i[j]] = j+1;
1181: }
1182: #endif
1183: }
1184: }
1186: /* Create lens which is required for MatCreate... */
1187: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1188: len = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1189: PetscMalloc(len,&lens);
1190: lens[0] = (int *)(lens + ismax);
1191: PetscMemzero(lens[0],j*sizeof(int));
1192: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1193:
1194: /* Update lens from local data */
1195: for (i=0; i<ismax; i++) {
1196: jmax = nrow[i];
1197: #if defined (PETSC_USE_CTABLE)
1198: lcol1_gcol1 = colmaps[i];
1199: #else
1200: cmap_i = cmap[i];
1201: #endif
1202: irow_i = irow[i];
1203: lens_i = lens[i];
1204: for (j=0; j<jmax; j++) {
1205: row = irow_i[j];
1206: #if defined (PETSC_USE_CTABLE)
1207: PetscGetProc(row,size,c->rowners,&proc);
1208: #else
1209: proc = rtable[row];
1210: #endif
1211: if (proc == rank) {
1212: /* Get indices from matA and then from matB */
1213: row = row - rstart;
1214: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1215: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1216: #if defined (PETSC_USE_CTABLE)
1217: for (k=0; k<nzA; k++) {
1218: PetscTableFind(lcol1_gcol1,cstart+cworkA[k]+1,&tt);
1219: if (tt) { lens_i[j]++; }
1220: }
1221: for (k=0; k<nzB; k++) {
1222: PetscTableFind(lcol1_gcol1,bmap[cworkB[k]]+1,&tt);
1223: if (tt) { lens_i[j]++; }
1224: }
1225: #else
1226: for (k=0; k<nzA; k++) {
1227: if (cmap_i[cstart + cworkA[k]]) { lens_i[j]++; }
1228: }
1229: for (k=0; k<nzB; k++) {
1230: if (cmap_i[bmap[cworkB[k]]]) { lens_i[j]++; }
1231: }
1232: #endif
1233: }
1234: }
1235: }
1236: #if defined (PETSC_USE_CTABLE)
1237: /* Create row map*/
1238: PetscMalloc((1+ismax)*sizeof(PetscTable),&rowmaps);
1239: for (i=0; i<ismax+1; i++){
1240: PetscTableCreate((i<ismax) ? nrow[i] : nrow[i-1],&rowmaps[i]);
1241: }
1242: #else
1243: /* Create row map*/
1244: len = (1+ismax)*sizeof(int*)+ ismax*Mbs*sizeof(int);
1245: PetscMalloc(len,&rmap);
1246: rmap[0] = (int *)(rmap + ismax);
1247: PetscMemzero(rmap[0],ismax*Mbs*sizeof(int));
1248: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + Mbs;}
1249: #endif
1250: for (i=0; i<ismax; i++) {
1251: irow_i = irow[i];
1252: jmax = nrow[i];
1253: #if defined (PETSC_USE_CTABLE)
1254: lrow1_grow1 = rowmaps[i];
1255: for (j=0; j<jmax; j++) {
1256: PetscTableAdd(lrow1_grow1,irow_i[j]+1,j+1);
1257: }
1258: #else
1259: rmap_i = rmap[i];
1260: for (j=0; j<jmax; j++) {
1261: rmap_i[irow_i[j]] = j;
1262: }
1263: #endif
1264: }
1266: /* Update lens from offproc data */
1267: {
1268: int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1270: for (tmp2=0; tmp2<nrqs; tmp2++) {
1271: MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);
1272: idex = pa[i];
1273: sbuf1_i = sbuf1[idex];
1274: jmax = sbuf1_i[0];
1275: ct1 = 2*jmax+1;
1276: ct2 = 0;
1277: rbuf2_i = rbuf2[i];
1278: rbuf3_i = rbuf3[i];
1279: for (j=1; j<=jmax; j++) {
1280: is_no = sbuf1_i[2*j-1];
1281: max1 = sbuf1_i[2*j];
1282: lens_i = lens[is_no];
1283: #if defined (PETSC_USE_CTABLE)
1284: lcol1_gcol1 = colmaps[is_no];
1285: lrow1_grow1 = rowmaps[is_no];
1286: #else
1287: cmap_i = cmap[is_no];
1288: rmap_i = rmap[is_no];
1289: #endif
1290: for (k=0; k<max1; k++,ct1++) {
1291: #if defined (PETSC_USE_CTABLE)
1292: PetscTableFind(lrow1_grow1,sbuf1_i[ct1]+1,&row);
1293: row--;
1294: if(row < 0) { SETERRQ(1,"row not found in table"); }
1295: #else
1296: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1297: #endif
1298: max2 = rbuf2_i[ct1];
1299: for (l=0; l<max2; l++,ct2++) {
1300: #if defined (PETSC_USE_CTABLE)
1301: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tt);
1302: if (tt) {
1303: lens_i[row]++;
1304: }
1305: #else
1306: if (cmap_i[rbuf3_i[ct2]]) {
1307: lens_i[row]++;
1308: }
1309: #endif
1310: }
1311: }
1312: }
1313: }
1314: }
1315: PetscFree(r_status3);
1316: PetscFree(r_waits3);
1317: MPI_Waitall(nrqr,s_waits3,s_status3);
1318: PetscFree(s_status3);
1319: PetscFree(s_waits3);
1321: /* Create the submatrices */
1322: if (scall == MAT_REUSE_MATRIX) {
1323: /*
1324: Assumes new rows are same length as the old rows, hence bug!
1325: */
1326: for (i=0; i<ismax; i++) {
1327: mat = (Mat_SeqBAIJ *)(submats[i]->data);
1328: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || mat->bs != bs)) {
1329: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1330: }
1331: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(int),&flag);
1332: if (flag == PETSC_FALSE) {
1333: SETERRQ(PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1334: }
1335: /* Initial matrix as if empty */
1336: PetscMemzero(mat->ilen,mat->mbs*sizeof(int));
1337: submats[i]->factor = C->factor;
1338: }
1339: } else {
1340: for (i=0; i<ismax; i++) {
1341: MatCreateSeqBAIJ(PETSC_COMM_SELF,a->bs,nrow[i]*bs,ncol[i]*bs,0,lens[i],submats+i);
1342: }
1343: }
1345: /* Assemble the matrices */
1346: /* First assemble the local rows */
1347: {
1348: int ilen_row,*imat_ilen,*imat_j,*imat_i;
1349: MatScalar *imat_a;
1350:
1351: for (i=0; i<ismax; i++) {
1352: mat = (Mat_SeqBAIJ*)submats[i]->data;
1353: imat_ilen = mat->ilen;
1354: imat_j = mat->j;
1355: imat_i = mat->i;
1356: imat_a = mat->a;
1358: #if defined (PETSC_USE_CTABLE)
1359: lcol1_gcol1 = colmaps[i];
1360: lrow1_grow1 = rowmaps[i];
1361: #else
1362: cmap_i = cmap[i];
1363: rmap_i = rmap[i];
1364: #endif
1365: irow_i = irow[i];
1366: jmax = nrow[i];
1367: for (j=0; j<jmax; j++) {
1368: row = irow_i[j];
1369: #if defined (PETSC_USE_CTABLE)
1370: PetscGetProc(row,size,c->rowners,&proc);
1371: #else
1372: proc = rtable[row];
1373: #endif
1374: if (proc == rank) {
1375: row = row - rstart;
1376: nzA = a_i[row+1] - a_i[row];
1377: nzB = b_i[row+1] - b_i[row];
1378: cworkA = a_j + a_i[row];
1379: cworkB = b_j + b_i[row];
1380: vworkA = a_a + a_i[row]*bs2;
1381: vworkB = b_a + b_i[row]*bs2;
1382: #if defined (PETSC_USE_CTABLE)
1383: PetscTableFind(lrow1_grow1,row+rstart+1,&row);
1384: row--;
1385: if (row < 0) { SETERRQ(1,"row not found in table"); }
1386: #else
1387: row = rmap_i[row + rstart];
1388: #endif
1389: mat_i = imat_i[row];
1390: mat_a = imat_a + mat_i*bs2;
1391: mat_j = imat_j + mat_i;
1392: ilen_row = imat_ilen[row];
1394: /* load the column indices for this row into cols*/
1395: for (l=0; l<nzB; l++) {
1396: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1397: #if defined (PETSC_USE_CTABLE)
1398: PetscTableFind(lcol1_gcol1,ctmp+1,&tcol);
1399: if (tcol) {
1400: #else
1401: if ((tcol = cmap_i[ctmp])) {
1402: #endif
1403: *mat_j++ = tcol - 1;
1404: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1405: mat_a += bs2;
1406: ilen_row++;
1407: }
1408: } else break;
1409: }
1410: imark = l;
1411: for (l=0; l<nzA; l++) {
1412: #if defined (PETSC_USE_CTABLE)
1413: PetscTableFind(lcol1_gcol1,cstart+cworkA[l]+1,&tcol);
1414: if (tcol) {
1415: #else
1416: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1417: #endif
1418: *mat_j++ = tcol - 1;
1419: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1420: mat_a += bs2;
1421: ilen_row++;
1422: }
1423: }
1424: for (l=imark; l<nzB; l++) {
1425: #if defined (PETSC_USE_CTABLE)
1426: PetscTableFind(lcol1_gcol1,bmap[cworkB[l]]+1,&tcol);
1427: if (tcol) {
1428: #else
1429: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1430: #endif
1431: *mat_j++ = tcol - 1;
1432: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1433: mat_a += bs2;
1434: ilen_row++;
1435: }
1436: }
1437: imat_ilen[row] = ilen_row;
1438: }
1439: }
1440:
1441: }
1442: }
1444: /* Now assemble the off proc rows*/
1445: {
1446: int *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1447: int *imat_j,*imat_i;
1448: MatScalar *imat_a,*rbuf4_i;
1450: for (tmp2=0; tmp2<nrqs; tmp2++) {
1451: MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);
1452: idex = pa[i];
1453: sbuf1_i = sbuf1[idex];
1454: jmax = sbuf1_i[0];
1455: ct1 = 2*jmax + 1;
1456: ct2 = 0;
1457: rbuf2_i = rbuf2[i];
1458: rbuf3_i = rbuf3[i];
1459: rbuf4_i = rbuf4[i];
1460: for (j=1; j<=jmax; j++) {
1461: is_no = sbuf1_i[2*j-1];
1462: #if defined (PETSC_USE_CTABLE)
1463: lrow1_grow1 = rowmaps[is_no];
1464: lcol1_gcol1 = colmaps[is_no];
1465: #else
1466: rmap_i = rmap[is_no];
1467: cmap_i = cmap[is_no];
1468: #endif
1469: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1470: imat_ilen = mat->ilen;
1471: imat_j = mat->j;
1472: imat_i = mat->i;
1473: imat_a = mat->a;
1474: max1 = sbuf1_i[2*j];
1475: for (k=0; k<max1; k++,ct1++) {
1476: row = sbuf1_i[ct1];
1477: #if defined (PETSC_USE_CTABLE)
1478: PetscTableFind(lrow1_grow1,row+1,&row);
1479: row--;
1480: if(row < 0) { SETERRQ(1,"row not found in table"); }
1481: #else
1482: row = rmap_i[row];
1483: #endif
1484: ilen = imat_ilen[row];
1485: mat_i = imat_i[row];
1486: mat_a = imat_a + mat_i*bs2;
1487: mat_j = imat_j + mat_i;
1488: max2 = rbuf2_i[ct1];
1489: for (l=0; l<max2; l++,ct2++) {
1490: #if defined (PETSC_USE_CTABLE)
1491: PetscTableFind(lcol1_gcol1,rbuf3_i[ct2]+1,&tcol);
1492: if (tcol) {
1493: #else
1494: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1495: #endif
1496: *mat_j++ = tcol - 1;
1497: /* *mat_a++= rbuf4_i[ct2]; */
1498: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1499: mat_a += bs2;
1500: ilen++;
1501: }
1502: }
1503: imat_ilen[row] = ilen;
1504: }
1505: }
1506: }
1507: }
1508: PetscFree(r_status4);
1509: PetscFree(r_waits4);
1510: MPI_Waitall(nrqr,s_waits4,s_status4);
1511: PetscFree(s_waits4);
1512: PetscFree(s_status4);
1514: /* Restore the indices */
1515: for (i=0; i<ismax; i++) {
1516: ISRestoreIndices(isrow[i],irow+i);
1517: ISRestoreIndices(iscol[i],icol+i);
1518: }
1520: /* Destroy allocated memory */
1521: PetscFree(irow);
1522: PetscFree(w1);
1523: PetscFree(pa);
1525: PetscFree(sbuf1);
1526: PetscFree(rbuf2);
1527: for (i=0; i<nrqr; ++i) {
1528: PetscFree(sbuf2[i]);
1529: }
1530: for (i=0; i<nrqs; ++i) {
1531: PetscFree(rbuf3[i]);
1532: PetscFree(rbuf4[i]);
1533: }
1535: PetscFree(sbuf2);
1536: PetscFree(rbuf3);
1537: PetscFree(rbuf4);
1538: PetscFree(sbuf_aj[0]);
1539: PetscFree(sbuf_aj);
1540: PetscFree(sbuf_aa[0]);
1541: PetscFree(sbuf_aa);
1543: #if defined (PETSC_USE_CTABLE)
1544: for (i=0; i<ismax+1; i++){
1545: PetscTableDelete(rowmaps[i]);
1546: PetscTableDelete(colmaps[i]);
1547: }
1548: PetscFree(colmaps);
1549: PetscFree(rowmaps);
1550: /* Mark Adams */
1551: #else
1552: PetscFree(rmap);
1553: PetscFree(cmap);
1554: #endif
1555: PetscFree(lens);
1557: for (i=0; i<ismax; i++) {
1558: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1559: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1560: }
1562: return(0);
1563: }