Actual source code: vpscat.c
1: /*$Id: vpscat.c,v 1.159 2001/04/10 19:34:51 bsmith Exp $*/
2: /*
3: Defines parallel vector scatters.
4: */
6: #include "petscsys.h"
7: #include "src/vec/is/isimpl.h"
8: #include "src/vec/vecimpl.h"
9: #include "src/vec/impls/dvecimpl.h"
10: #include "src/vec/impls/mpi/pvecimpl.h"
12: int VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
13: {
14: VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
15: VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
16: int i,rank,ierr;
17: PetscViewerFormat format;
18: PetscTruth isascii;
21: PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);
22: if (isascii) {
23: MPI_Comm_rank(ctx->comm,&rank);
24: PetscViewerGetFormat(viewer,&format);
25: if (format == PETSC_VIEWER_ASCII_INFO) {
26: int nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
28: MPI_Reduce(&to->n,&nsend_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
29: MPI_Reduce(&from->n,&nrecv_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
30: itmp = to->starts[to->n];
31: MPI_Reduce(&itmp,&lensend_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
32: itmp = from->starts[from->n];
33: MPI_Reduce(&itmp,&lenrecv_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
34: MPI_Reduce(&itmp,&alldata,1,MPI_INT,MPI_SUM,0,ctx->comm);
36: PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
37: PetscViewerASCIIPrintf(viewer," Maximum number sends %d\n",nsend_max);
38: PetscViewerASCIIPrintf(viewer," Maximum number receives %d\n",nrecv_max);
39: PetscViewerASCIIPrintf(viewer," Maximum data sent %d\n",lensend_max*to->bs*sizeof(Scalar));
40: PetscViewerASCIIPrintf(viewer," Maximum data received %d\n",lenrecv_max*to->bs*sizeof(Scalar));
41: PetscViewerASCIIPrintf(viewer," Total data sent %d\n",alldata*to->bs*sizeof(Scalar));
43: } else {
44: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %d; Number to self = %d\n",rank,to->n,to->local.n);
45: for (i=0; i<to->n; i++){
46: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d length = %d to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
47: }
49: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices\n");
50: for (i=0; i<to->starts[to->n]; i++){
51: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%d \n",rank,to->indices[i]);
52: }
54: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %d; Number from self = %d\n",rank,from->n,from->local.n);
55: for (i=0; i<from->n; i++){
56: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d length %d from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
57: }
59: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices\n");
60: for (i=0; i<from->starts[from->n]; i++){
61: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%d \n",rank,from->indices[i]);
62: }
64: PetscViewerFlush(viewer);
65: }
66: } else {
67: SETERRQ1(1,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
68: }
69: return(0);
70: }
72: /* -----------------------------------------------------------------------------------*/
73: /*
74: The next routine determines which part of the local scatter is an
75: exact copy of values into their current locations. We check this here so
76: that we know we need not perform that portion of the scatter.
77: */
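/*
   Illustrative example (hypothetical slot values, not taken from this file): with

      to_slots   = {3, 7, 9}
      from_slots = {3, 8, 9}

   only entry 1 differs, so the routine below would record n_nonmatching = 1 with
   slots_nonmatching arrays {7} and {8}; entries 0 and 2 would copy a value onto
   itself and can safely be skipped by the local scatter.
*/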
78: int VecScatterLocalOptimize_Private(VecScatter_Seq_General *gen_to,VecScatter_Seq_General *gen_from)
79: {
80: int n = gen_to->n,n_nonmatching = 0,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
81: int *nto_slots,*nfrom_slots,j = 0,ierr;
82:
84: for (i=0; i<n; i++) {
85: if (to_slots[i] != from_slots[i]) n_nonmatching++;
86: }
88: if (!n_nonmatching) {
89: gen_to->nonmatching_computed = PETSC_TRUE;
90: gen_to->n_nonmatching = gen_from->n_nonmatching = 0;
91: PetscLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to 0n");
92: } else if (n_nonmatching == n) {
93: gen_to->nonmatching_computed = PETSC_FALSE;
94: PetscLogInfo(0,"VecScatterLocalOptimize_Private:All values non-matching\n");
95: } else {
96: gen_to->nonmatching_computed= PETSC_TRUE;
97: gen_to->n_nonmatching = gen_from->n_nonmatching = n_nonmatching;
98: PetscMalloc(n_nonmatching*sizeof(int),&nto_slots);
99: gen_to->slots_nonmatching = nto_slots;
100: PetscMalloc(n_nonmatching*sizeof(int),&nfrom_slots);
101: gen_from->slots_nonmatching = nfrom_slots;
102: for (i=0; i<n; i++) {
103: if (to_slots[i] != from_slots[i]) {
104: nto_slots[j] = to_slots[i];
105: nfrom_slots[j] = from_slots[i];
106: j++;
107: }
108: }
109: PetscLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to %d\n",n,n_nonmatching);
110: }
111: return(0);
112: }
114: /* --------------------------------------------------------------------------------------*/
115: int VecScatterCopy_PtoP(VecScatter in,VecScatter out)
116: {
117: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
118: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
119: int len,ny,ierr;
122: out->postrecvs = in->postrecvs;
123: out->begin = in->begin;
124: out->end = in->end;
125: out->copy = in->copy;
126: out->destroy = in->destroy;
127: out->view = in->view;
129: /* allocate entire send scatter context */
130: PetscNew(VecScatter_MPI_General,&out_to);
131: PetscMemzero(out_to,sizeof(VecScatter_MPI_General));
132: PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
133: ny = in_to->starts[in_to->n];
134: len = ny*(sizeof(int) + sizeof(Scalar)) + (in_to->n+1)*sizeof(int) +
135: (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
136: out_to->n = in_to->n;
137: out_to->type = in_to->type;
138: out_to->sendfirst = in_to->sendfirst;
139: PetscMalloc(len,&out_to->values);
140: PetscLogObjectMemory(out,len);
141: out_to->requests = (MPI_Request*)(out_to->values + ny);
142: out_to->indices = (int*)(out_to->requests + out_to->n);
143: out_to->starts = (int*)(out_to->indices + ny);
144: out_to->procs = (int*)(out_to->starts + out_to->n + 1);
145: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
146: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
147: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
148: PetscMalloc(2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status),&out_to->sstatus);
149: out_to->rstatus = out_to->sstatus + PetscMax(in_to->n,in_from->n) + 1;
150: PetscLogObjectMemory(out,2*(PetscMax(in_to->n,in_from->n) + 1)*sizeof(MPI_Status));
151: out->todata = (void*)out_to;
152: out_to->local.n = in_to->local.n;
153: out_to->local.nonmatching_computed = PETSC_FALSE;
154: out_to->local.n_nonmatching = 0;
155: out_to->local.slots_nonmatching = 0;
156: if (in_to->local.n) {
157: PetscMalloc(in_to->local.n*sizeof(int),&out_to->local.slots);
158: PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
159: PetscLogObjectMemory(out,in_to->local.n*sizeof(int));
160: } else {
161: out_to->local.slots = 0;
162: }
164: /* allocate entire receive context */
165: PetscNew(VecScatter_MPI_General,&out_from);
166: PetscMemzero(out_from,sizeof(VecScatter_MPI_General));
167: out_from->type = in_from->type;
168: PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
169: ny = in_from->starts[in_from->n];
170: len = ny*(sizeof(int) + sizeof(Scalar)) + (in_from->n+1)*sizeof(int) +
171: (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
172: out_from->n = in_from->n;
173: out_from->sendfirst = in_from->sendfirst;
174: PetscMalloc(len,&out_from->values);
175: PetscLogObjectMemory(out,len);
176: out_from->requests = (MPI_Request*)(out_from->values + ny);
177: out_from->indices = (int*)(out_from->requests + out_from->n);
178: out_from->starts = (int*)(out_from->indices + ny);
179: out_from->procs = (int*)(out_from->starts + out_from->n + 1);
180: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
181: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
182: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
183: out->fromdata = (void*)out_from;
184: out_from->local.n = in_from->local.n;
185: out_from->local.nonmatching_computed = PETSC_FALSE;
186: out_from->local.n_nonmatching = 0;
187: out_from->local.slots_nonmatching = 0;
188: if (in_from->local.n) {
189: PetscMalloc(in_from->local.n*sizeof(int),&out_from->local.slots);
190: PetscLogObjectMemory(out,in_from->local.n*sizeof(int));
191: PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
192: } else {
193: out_from->local.slots = 0;
194: }
195: return(0);
196: }
198: /* -------------------------------------------------------------------------------------*/
199: int VecScatterDestroy_PtoP(VecScatter ctx)
200: {
201: VecScatter_MPI_General *gen_to = (VecScatter_MPI_General*)ctx->todata;
202: VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;
206: if (gen_to->local.slots) {PetscFree(gen_to->local.slots);}
207: if (gen_from->local.slots) {PetscFree(gen_from->local.slots);}
208: if (gen_to->local.slots_nonmatching) {PetscFree(gen_to->local.slots_nonmatching);}
209: if (gen_from->local.slots_nonmatching) {PetscFree(gen_from->local.slots_nonmatching);}
210: PetscFree(gen_to->sstatus);
211: PetscFree(gen_to->values);
212: PetscFree(gen_to);
213: PetscFree(gen_from->values);
214: PetscFree(gen_from);
215: PetscLogObjectDestroy(ctx);
216: PetscHeaderDestroy(ctx);
217: return(0);
218: }
220: /* --------------------------------------------------------------------------------------*/
221: /*
222: Even though the next routines are written for parallel
223: vectors, either xin or yin (but not both) may be Seq
224: vectors, one for each processor.
225: 
226: gen_from indices indicate where arriving values are stashed;
227: gen_to indices indicate where departing values came from.
228: The naming can be VERY confusing.
230: */
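/*
   A concrete way to read the naming (following the packing/unpacking loops in the
   routines below): in a forward scatter the sender packs svalues[i] = xv[gen_to->indices[i]],
   and the receiver stores each incoming value at yv[gen_from->indices[i]]; so the
   "to" indices address the source vector and the "from" indices address the destination.
*/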
231: int VecScatterBegin_PtoP(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
232: {
233: VecScatter_MPI_General *gen_to,*gen_from;
234: MPI_Comm comm = ctx->comm;
235: Scalar *xv,*yv,*val,*rvalues,*svalues;
236: MPI_Request *rwaits,*swaits;
237: int tag = ctx->tag,i,j,*indices,*rstarts,*sstarts,*rprocs,*sprocs;
238: int nrecvs,nsends,iend,ierr;
241: VecGetArray(xin,&xv);
242: if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
243: if (mode & SCATTER_REVERSE){
244: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
245: gen_from = (VecScatter_MPI_General*)ctx->todata;
246: } else {
247: gen_to = (VecScatter_MPI_General*)ctx->todata;
248: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
249: }
250: rvalues = gen_from->values;
251: svalues = gen_to->values;
252: nrecvs = gen_from->n;
253: nsends = gen_to->n;
254: rwaits = gen_from->requests;
255: swaits = gen_to->requests;
256: indices = gen_to->indices;
257: rstarts = gen_from->starts;
258: sstarts = gen_to->starts;
259: rprocs = gen_from->procs;
260: sprocs = gen_to->procs;
262: if (!(mode & SCATTER_LOCAL)) {
264: if (gen_to->sendfirst) {
265: /* do sends: */
266: for (i=0; i<nsends; i++) {
267: val = svalues + sstarts[i];
268: iend = sstarts[i+1]-sstarts[i];
269: /* pack the message */
270: for (j=0; j<iend; j++) {
271: val[j] = xv[*indices++];
272: }
273: MPI_Isend(val,iend,MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
274: }
275: }
276:
277: /* post receives: */
278: for (i=0; i<nrecvs; i++) {
279: MPI_Irecv(rvalues+rstarts[i],rstarts[i+1]-rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
280: }
282: if (!gen_to->sendfirst) {
283: /* do sends: */
284: for (i=0; i<nsends; i++) {
285: val = svalues + sstarts[i];
286: iend = sstarts[i+1]-sstarts[i];
287: /* pack the message */
288: for (j=0; j<iend; j++) {
289: val[j] = xv[*indices++];
290: }
291: MPI_Isend(val,iend,MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
292: }
293: }
294: }
296: /* take care of local scatters */
297: if (gen_to->local.n && addv == INSERT_VALUES) {
298: if (yv == xv && !gen_to->local.nonmatching_computed) {
299: VecScatterLocalOptimize_Private(&gen_to->local,&gen_from->local);
300: }
301: if (gen_to->local.is_copy) {
302: PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
303: } else if (yv != xv || gen_to->local.nonmatching_computed == -1) {
304: int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
305: int n = gen_to->local.n;
306: for (i=0; i<n; i++) {yv[fslots[i]] = xv[tslots[i]];}
307: } else {
308: /*
309: In this case, it is copying the values into their old locations, thus we can skip those
310: */
311: int *tslots = gen_to->local.slots_nonmatching,*fslots = gen_from->local.slots_nonmatching;
312: int n = gen_to->local.n_nonmatching;
313: for (i=0; i<n; i++) {yv[fslots[i]] = xv[tslots[i]];}
314: }
315: } else if (gen_to->local.n) {
316: int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
317: int n = gen_to->local.n;
318: if (addv == ADD_VALUES) {
319: for (i=0; i<n; i++) {yv[fslots[i]] += xv[tslots[i]];}
320: #if !defined(PETSC_USE_COMPLEX)
321: } else if (addv == MAX_VALUES) {
322: for (i=0; i<n; i++) {yv[fslots[i]] = PetscMax(yv[fslots[i]],xv[tslots[i]]);}
323: #endif
324: } else {SETERRQ(1,"Wrong insert option");}
325: }
327: VecRestoreArray(xin,&xv);
328: if (xin != yin) {VecRestoreArray(yin,&yv);}
329: return(0);
330: }
332: /* --------------------------------------------------------------------------------------*/
333: int VecScatterEnd_PtoP(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
334: {
335: VecScatter_MPI_General *gen_to,*gen_from;
336: Scalar *rvalues,*yv,*val;
337: int ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices;
338: MPI_Request *rwaits,*swaits;
339: MPI_Status rstatus,*sstatus;
342: if (mode & SCATTER_LOCAL) return(0);
343: VecGetArray(yin,&yv);
345: if (mode & SCATTER_REVERSE){
346: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
347: gen_from = (VecScatter_MPI_General*)ctx->todata;
348: sstatus = gen_from->sstatus;
349: } else {
350: gen_to = (VecScatter_MPI_General*)ctx->todata;
351: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
352: sstatus = gen_to->sstatus;
353: }
354: rvalues = gen_from->values;
355: nrecvs = gen_from->n;
356: nsends = gen_to->n;
357: rwaits = gen_from->requests;
358: swaits = gen_to->requests;
359: indices = gen_from->indices;
360: rstarts = gen_from->starts;
362: /* wait on receives */
363: count = nrecvs;
364: while (count) {
365: MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
366: /* unpack receives into our local space */
367: val = rvalues + rstarts[imdex];
368: n = rstarts[imdex+1]-rstarts[imdex];
369: lindices = indices + rstarts[imdex];
370: if (addv == INSERT_VALUES) {
371: for (i=0; i<n; i++) {
372: yv[lindices[i]] = *val++;
373: }
374: } else if (addv == ADD_VALUES) {
375: for (i=0; i<n; i++) {
376: yv[lindices[i]] += *val++;
377: }
378: #if !defined(PETSC_USE_COMPLEX)
379: } else if (addv == MAX_VALUES) {
380: for (i=0; i<n; i++) {
381: yv[lindices[i]] = PetscMax(yv[lindices[i]],*val); val++;
382: }
383: #endif
384: } else {SETERRQ(1,"Wrong insert option");}
385: count--;
386: }
388: /* wait on sends */
389: if (nsends) {
390: MPI_Waitall(nsends,swaits,sstatus);
391: }
392: VecRestoreArray(yin,&yv);
393: return(0);
394: }
395: /* ==========================================================================================*/
396: /*
397: Special scatters for fixed block sizes. These provide better performance
398: because the local copying, packing, and unpacking are done with loop
399: unrolling to the size of the block.
401: They also use MPI persistent sends and receives, which (at least in theory)
402: allow MPI to optimize repeated sends and receives of the same type.
403: */
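/*
   Minimal sketch of the MPI persistent-request pattern referred to above; the names
   buf, n, peer, tag, comm, niter are placeholders, not variables from this file:

      MPI_Request req;
      MPI_Status  status;
      MPI_Send_init(buf,n,MPI_DOUBLE,peer,tag,comm,&req);   register the send once
      for (iter=0; iter<niter; iter++) {
        MPI_Start(&req);                                    restart it for each scatter
        MPI_Wait(&req,&status);
      }
      MPI_Request_free(&req);

   The routines below do the same thing with MPI_Startall() over arrays of persistent
   requests built with MPI_Send_init()/MPI_Recv_init() in VecScatterCopy_PtoP_X().
*/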
405: /*
406: This is for use with the "ready-receiver" mode. In theory on some
407: machines it could lead to better performance. In practice we have never
408: seen it do so. It is enabled with the -vecscatter_rr flag.
409: */
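/*
   Background (standard MPI semantics, not specific to this file): a ready-mode send
   (MPI_Rsend/MPI_Rsend_init) is only correct if the matching receive has already been
   posted when the send starts; that is why this routine exists, so the persistent
   receives can be started before any forward scatter begins sending.
*/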
410: int VecScatterPostRecvs_PtoP_X(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
411: {
412: VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;
415: MPI_Startall_irecv(gen_from->starts[gen_from->n]*gen_from->bs,gen_from->n,gen_from->requests);
416: return(0);
417: }
419: /* --------------------------------------------------------------------------------------*/
420: /*
421: Special optimization to check whether the local part of the scatter is actually
422: a contiguous copy; if so, the scatter routines call PetscMemcpy() instead of an indexed loop.
423:
424: */
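/*
   Worked example (hypothetical values, for illustration only): with bs = 2 and

      to_slots   = {10, 12, 14}
      from_slots = {40, 42, 44}

   both index sets advance by exactly bs at every step, so the local scatter is one
   contiguous copy with copy_start 10 (resp. 40) and copy_length = 3*2*sizeof(Scalar) bytes.
*/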
425: int VecScatterLocalOptimizeCopy_Private(VecScatter_Seq_General *gen_to,VecScatter_Seq_General *gen_from,int bs)
426: {
427: int n = gen_to->n,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
428: int to_start,from_start;
429:
431: to_start = to_slots[0];
432: from_start = from_slots[0];
434: for (i=1; i<n; i++) {
435: to_start += bs;
436: from_start += bs;
437: if (to_slots[i] != to_start) return(0);
438: if (from_slots[i] != from_start) return(0);
439: }
440: gen_to->is_copy = PETSC_TRUE;
441: gen_to->copy_start = to_slots[0];
442: gen_to->copy_length = bs*sizeof(Scalar)*n;
443: gen_from->is_copy = PETSC_TRUE;
444: gen_from->copy_start = from_slots[0];
445: gen_from->copy_length = bs*sizeof(Scalar)*n;
447: PetscLogInfo(0,"VecScatterLocalOptimizeCopy_Private:Local scatter is a copy, optimizing for it\n");
449: return(0);
450: }
452: /* --------------------------------------------------------------------------------------*/
454: int VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
455: {
456: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
457: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
458: int len,ny,bs = in_from->bs,ierr;
461: out->postrecvs = in->postrecvs;
462: out->begin = in->begin;
463: out->end = in->end;
464: out->copy = in->copy;
465: out->destroy = in->destroy;
466: out->view = in->view;
468: /* allocate entire send scatter context */
469: PetscNew(VecScatter_MPI_General,&out_to);
470: PetscMemzero(out_to,sizeof(VecScatter_MPI_General));
471: PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
472: ny = in_to->starts[in_to->n];
473: len = ny*(sizeof(int) + bs*sizeof(Scalar)) + (in_to->n+1)*sizeof(int) +
474: (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
475: out_to->n = in_to->n;
476: out_to->type = in_to->type;
477: out_to->sendfirst = in_to->sendfirst;
479: PetscMalloc(len,&out_to->values);
480: PetscLogObjectMemory(out,len);
481: out_to->requests = (MPI_Request*)(out_to->values + bs*ny);
482: out_to->indices = (int*)(out_to->requests + out_to->n);
483: out_to->starts = (int*)(out_to->indices + ny);
484: out_to->procs = (int*)(out_to->starts + out_to->n + 1);
485: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
486: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
487: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
488: PetscMalloc(2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status),&out_to->sstatus);
489: out_to->rstatus = out_to->sstatus + PetscMax(in_to->n,in_from->n) + 1;
490:
491: PetscLogObjectMemory(out,2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status));
492: out->todata = (void*)out_to;
493: out_to->local.n = in_to->local.n;
494: out_to->local.nonmatching_computed = PETSC_FALSE;
495: out_to->local.n_nonmatching = 0;
496: out_to->local.slots_nonmatching = 0;
497: if (in_to->local.n) {
498: PetscMalloc(in_to->local.n*sizeof(int),&out_to->local.slots);
499: PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
500: PetscLogObjectMemory(out,in_to->local.n*sizeof(int));
501: } else {
502: out_to->local.slots = 0;
503: }
505: /* allocate entire receive context */
506: PetscNew(VecScatter_MPI_General,&out_from);
507: PetscMemzero(out_from,sizeof(VecScatter_MPI_General));
508: out_from->type = in_from->type;
509: PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
510: ny = in_from->starts[in_from->n];
511: len = ny*(sizeof(int) + bs*sizeof(Scalar)) + (in_from->n+1)*sizeof(int) +
512: (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
513: out_from->n = in_from->n;
514: out_from->sendfirst = in_from->sendfirst;
515: PetscMalloc(len,&out_from->values);
516: PetscLogObjectMemory(out,len);
517: out_from->requests = (MPI_Request*)(out_from->values + bs*ny);
518: out_from->indices = (int*)(out_from->requests + out_from->n);
519: out_from->starts = (int*)(out_from->indices + ny);
520: out_from->procs = (int*)(out_from->starts + out_from->n + 1);
521: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
522: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
523: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
524: out->fromdata = (void*)out_from;
525: out_from->local.n = in_from->local.n;
526: out_from->local.nonmatching_computed = PETSC_FALSE;
527: out_from->local.n_nonmatching = 0;
528: out_from->local.slots_nonmatching = 0;
529: if (in_from->local.n) {
530: PetscMalloc(in_from->local.n*sizeof(int),&out_from->local.slots);
531: PetscLogObjectMemory(out,in_from->local.n*sizeof(int));
532: PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
533: } else {
534: out_from->local.slots = 0;
535: }
537: /*
538: set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
539: */
540: {
541: MPI_Comm comm;
542: int *sstarts = out_to->starts, *rstarts = out_from->starts;
543: int *sprocs = out_to->procs, *rprocs = out_from->procs;
544: int tag,i;
545: PetscTruth flg;
546: MPI_Request *swaits = out_to->requests,*rwaits = out_from->requests;
547: MPI_Request *rev_swaits,*rev_rwaits;
548: Scalar *Ssvalues = out_to->values, *Srvalues = out_from->values;
550: PetscMalloc((in_to->n+1)*sizeof(MPI_Request),&out_to->rev_requests);
551: PetscMalloc((in_from->n+1)*sizeof(MPI_Request),&out_from->rev_requests);
552: PetscLogObjectMemory(out,(in_to->n+in_from->n+2)*sizeof(MPI_Request));
554: rev_rwaits = out_to->rev_requests;
555: rev_swaits = out_from->rev_requests;
557: out_from->bs = out_to->bs = bs;
558: tag = out->tag;
559: comm = out->comm;
561: /* Register the receives that you will use later (sends for scatter reverse) */
562: for (i=0; i<out_from->n; i++) {
563: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
564: comm,rwaits+i);
565: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
566: comm,rev_swaits+i);
567: }
569: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
570: if (flg) {
571: out->postrecvs = VecScatterPostRecvs_PtoP_X;
572: out_to->use_readyreceiver = PETSC_TRUE;
573: out_from->use_readyreceiver = PETSC_TRUE;
574: for (i=0; i<out_to->n; i++) {
575: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
576: comm,swaits+i);
577: }
578: PetscLogInfo(0,"VecScatterCopy_PtoP_X:Using VecScatter ready receiver mode\n");
579: } else {
580: out->postrecvs = 0;
581: out_to->use_readyreceiver = PETSC_FALSE;
582: out_from->use_readyreceiver = PETSC_FALSE;
583: flg = PETSC_FALSE;
584: ierr = PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
585: if (flg) {
586: PetscLogInfo(0,"VecScatterCopy_PtoP_X:Using VecScatter Ssend mode\n");
587: }
588: for (i=0; i<out_to->n; i++) {
589: if (!flg) {
590: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
591: comm,swaits+i);
592: } else {
593: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
594: comm,swaits+i);
595: }
596: }
597: }
598: /* Register receives for scatter reverse */
599: for (i=0; i<out_to->n; i++) {
600: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
601: comm,rev_rwaits+i);
602: }
603: }
605: return(0);
606: }
608: /* --------------------------------------------------------------------------------------*/
610: int VecScatterBegin_PtoP_12(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
611: {
612: VecScatter_MPI_General *gen_to,*gen_from;
613: Scalar *xv,*yv,*val,*svalues;
614: MPI_Request *rwaits,*swaits;
615: int *indices,*sstarts,iend,i,j,nrecvs,nsends,idx,ierr,len;
618: VecGetArray(xin,&xv);
619: if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
621: if (mode & SCATTER_REVERSE) {
622: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
623: gen_from = (VecScatter_MPI_General*)ctx->todata;
624: rwaits = gen_from->rev_requests;
625: swaits = gen_to->rev_requests;
626: } else {
627: gen_to = (VecScatter_MPI_General*)ctx->todata;
628: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
629: rwaits = gen_from->requests;
630: swaits = gen_to->requests;
631: }
632: svalues = gen_to->values;
633: nrecvs = gen_from->n;
634: nsends = gen_to->n;
635: indices = gen_to->indices;
636: sstarts = gen_to->starts;
638: if (!(mode & SCATTER_LOCAL)) {
640: if (!gen_from->use_readyreceiver && !gen_to->sendfirst) {
641: /* post receives since they were not posted in VecScatterPostRecvs() */
642: MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
643: }
644: if (ctx->packtogether) {
645: /* this version packs all the messages together and sends, when -vecscatter_packtogether used */
646: len = 12*sstarts[nsends];
647: val = svalues;
648: for (i=0; i<len; i += 12) {
649: idx = *indices++;
650: val[0] = xv[idx];
651: val[1] = xv[idx+1];
652: val[2] = xv[idx+2];
653: val[3] = xv[idx+3];
654: val[4] = xv[idx+4];
655: val[5] = xv[idx+5];
656: val[6] = xv[idx+6];
657: val[7] = xv[idx+7];
658: val[8] = xv[idx+8];
659: val[9] = xv[idx+9];
660: val[10] = xv[idx+10];
661: val[11] = xv[idx+11];
662: val += 12;
663: }
664: MPI_Startall_isend(len,nsends,swaits);
665: } else {
666: /* this version packs and sends one at a time */
667: val = svalues;
668: for (i=0; i<nsends; i++) {
669: iend = sstarts[i+1]-sstarts[i];
671: for (j=0; j<iend; j++) {
672: idx = *indices++;
673: val[0] = xv[idx];
674: val[1] = xv[idx+1];
675: val[2] = xv[idx+2];
676: val[3] = xv[idx+3];
677: val[4] = xv[idx+4];
678: val[5] = xv[idx+5];
679: val[6] = xv[idx+6];
680: val[7] = xv[idx+7];
681: val[8] = xv[idx+8];
682: val[9] = xv[idx+9];
683: val[10] = xv[idx+10];
684: val[11] = xv[idx+11];
685: val += 12;
686: }
687: MPI_Start_isend(12*iend,swaits+i);
688: }
689: }
691: if (!gen_from->use_readyreceiver && gen_to->sendfirst) {
692: /* post receives since they were not posted in VecScatterPostRecvs() */
693: MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
694: }
695: }
697: /* take care of local scatters */
698: if (gen_to->local.n) {
699: int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
700: int n = gen_to->local.n,il,ir;
701: if (addv == INSERT_VALUES) {
702: if (gen_to->local.is_copy) {
703: PetscMemcpy(yv+gen_from->local.copy_start,xv+gen_to->local.copy_start,gen_to->local.copy_length);
704: } else {
705: for (i=0; i<n; i++) {
706: il = fslots[i]; ir = tslots[i];
707: yv[il] = xv[ir];
708: yv[il+1] = xv[ir+1];
709: yv[il+2] = xv[ir+2];
710: yv[il+3] = xv[ir+3];
711: yv[il+4] = xv[ir+4];
712: yv[il+5] = xv[ir+5];
713: yv[il+6] = xv[ir+6];
714: yv[il+7] = xv[ir+7];
715: yv[il+8] = xv[ir+8];
716: yv[il+9] = xv[ir+9];
717: yv[il+10] = xv[ir+10];
718: yv[il+11] = xv[ir+11];
719: }
720: }
721: } else if (addv == ADD_VALUES) {
722: for (i=0; i<n; i++) {
723: il = fslots[i]; ir = tslots[i];
724: yv[il] += xv[ir];
725: yv[il+1] += xv[ir+1];
726: yv[il+2] += xv[ir+2];
727: yv[il+3] += xv[ir+3];
728: yv[il+4] += xv[ir+4];
729: yv[il+5] += xv[ir+5];
730: yv[il+6] += xv[ir+6];
731: yv[il+7] += xv[ir+7];
732: yv[il+8] += xv[ir+8];
733: yv[il+9] += xv[ir+9];
734: yv[il+10] += xv[ir+10];
735: yv[il+11] += xv[ir+11];
736: }
737: #if !defined(PETSC_USE_COMPLEX)
738: } else if (addv == MAX_VALUES) {
739: for (i=0; i<n; i++) {
740: il = fslots[i]; ir = tslots[i];
741: yv[il] = PetscMax(yv[il],xv[ir]);
742: yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
743: yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
744: yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
745: yv[il+4] = PetscMax(yv[il+4],xv[ir+4]);
746: yv[il+5] = PetscMax(yv[il+5],xv[ir+5]);
747: yv[il+6] = PetscMax(yv[il+6],xv[ir+6]);
748: yv[il+7] = PetscMax(yv[il+7],xv[ir+7]);
749: yv[il+8] = PetscMax(yv[il+8],xv[ir+8]);
750: yv[il+9] = PetscMax(yv[il+9],xv[ir+9]);
751: yv[il+10] = PetscMax(yv[il+10],xv[ir+10]);
752: yv[il+11] = PetscMax(yv[il+11],xv[ir+11]);
753: }
754: #endif
755: } else {SETERRQ(1,"Wrong insert option");}
756: }
757: VecRestoreArray(xin,&xv);
758: if (xin != yin) {VecRestoreArray(yin,&yv);}
759: return(0);
760: }
762: /* --------------------------------------------------------------------------------------*/
764: int VecScatterEnd_PtoP_12(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
765: {
766: VecScatter_MPI_General *gen_to,*gen_from;
767: Scalar *rvalues,*yv,*val;
768: int ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
769: MPI_Request *rwaits,*swaits;
770: MPI_Status *rstatus,*sstatus;
773: if (mode & SCATTER_LOCAL) return(0);
774: VecGetArray(yin,&yv);
776: if (mode & SCATTER_REVERSE) {
777: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
778: gen_from = (VecScatter_MPI_General*)ctx->todata;
779: rwaits = gen_from->rev_requests;
780: swaits = gen_to->rev_requests;
781: sstatus = gen_from->sstatus;
782: rstatus = gen_from->rstatus;
783: } else {
784: gen_to = (VecScatter_MPI_General*)ctx->todata;
785: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
786: rwaits = gen_from->requests;
787: swaits = gen_to->requests;
788: sstatus = gen_to->sstatus;
789: rstatus = gen_to->rstatus;
790: }
791: rvalues = gen_from->values;
792: nrecvs = gen_from->n;
793: nsends = gen_to->n;
794: indices = gen_from->indices;
795: rstarts = gen_from->starts;
797: /* wait on receives */
798: count = nrecvs;
799: if (ctx->packtogether) { /* receive all messages, then unpack all, when -vecscatter_packtogether used */
800: ierr = MPI_Waitall(nrecvs,rwaits,rstatus);
801: n = rstarts[count];
802: val = rvalues;
803: lindices = indices;
804: if (addv == INSERT_VALUES) {
805: for (i=0; i<n; i++) {
806: idx = lindices[i];
807: yv[idx] = val[0];
808: yv[idx+1] = val[1];
809: yv[idx+2] = val[2];
810: yv[idx+3] = val[3];
811: yv[idx+4] = val[4];
812: yv[idx+5] = val[5];
813: yv[idx+6] = val[6];
814: yv[idx+7] = val[7];
815: yv[idx+8] = val[8];
816: yv[idx+9] = val[9];
817: yv[idx+10] = val[10];
818: yv[idx+11] = val[11];
819: val += 12;
820: }
821: } else if (addv == ADD_VALUES) {
822: for (i=0; i<n; i++) {
823: idx = lindices[i];
824: yv[idx] += val[0];
825: yv[idx+1] += val[1];
826: yv[idx+2] += val[2];
827: yv[idx+3] += val[3];
828: yv[idx+4] += val[4];
829: yv[idx+5] += val[5];
830: yv[idx+6] += val[6];
831: yv[idx+7] += val[7];
832: yv[idx+8] += val[8];
833: yv[idx+9] += val[9];
834: yv[idx+10] += val[10];
835: yv[idx+11] += val[11];
836: val += 12;
837: }
838: #if !defined(PETSC_USE_COMPLEX)
839: } else if (addv == MAX_VALUES) {
840: for (i=0; i<n; i++) {
841: idx = lindices[i];
842: yv[idx] = PetscMax(yv[idx],val[0]);
843: yv[idx+1] = PetscMax(yv[idx+1],val[1]);
844: yv[idx+2] = PetscMax(yv[idx+2],val[2]);
845: yv[idx+3] = PetscMax(yv[idx+3],val[3]);
846: yv[idx+4] = PetscMax(yv[idx+4],val[4]);
847: yv[idx+5] = PetscMax(yv[idx+5],val[5]);
848: yv[idx+6] = PetscMax(yv[idx+6],val[6]);
849: yv[idx+7] = PetscMax(yv[idx+7],val[7]);
850: yv[idx+8] = PetscMax(yv[idx+8],val[8]);
851: yv[idx+9] = PetscMax(yv[idx+9],val[9]);
852: yv[idx+10] = PetscMax(yv[idx+10],val[10]);
853: yv[idx+11] = PetscMax(yv[idx+11],val[11]);
854: val += 12;
855: }
856: #endif
857: } else {SETERRQ(1,"Wrong insert option");}
858: } else { /* unpack each message as it arrives, default version */
859: while (count) {
860: MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus[0]);
861: /* unpack receives into our local space */
862: val = rvalues + 12*rstarts[imdex];
863: lindices = indices + rstarts[imdex];
864: n = rstarts[imdex+1] - rstarts[imdex];
865: if (addv == INSERT_VALUES) {
866: for (i=0; i<n; i++) {
867: idx = lindices[i];
868: yv[idx] = val[0];
869: yv[idx+1] = val[1];
870: yv[idx+2] = val[2];
871: yv[idx+3] = val[3];
872: yv[idx+4] = val[4];
873: yv[idx+5] = val[5];
874: yv[idx+6] = val[6];
875: yv[idx+7] = val[7];
876: yv[idx+8] = val[8];
877: yv[idx+9] = val[9];
878: yv[idx+10] = val[10];
879: yv[idx+11] = val[11];
880: val += 12;
881: }
882: } else if (addv == ADD_VALUES) {
883: for (i=0; i<n; i++) {
884: idx = lindices[i];
885: yv[idx] += val[0];
886: yv[idx+1] += val[1];
887: yv[idx+2] += val[2];
888: yv[idx+3] += val[3];
889: yv[idx+4] += val[4];
890: yv[idx+5] += val[5];
891: yv[idx+6] += val[6];
892: yv[idx+7] += val[7];
893: yv[idx+8] += val[8];
894: yv[idx+9] += val[9];
895: yv[idx+10] += val[10];
896: yv[idx+11] += val[11];
897: val += 12;
898: }
899: #if !defined(PETSC_USE_COMPLEX)
900: } else if (addv == MAX_VALUES) {
901: for (i=0; i<n; i++) {
902: idx = lindices[i];
903: yv[idx] = PetscMax(yv[idx],val[0]);
904: yv[idx+1] = PetscMax(yv[idx+1],val[1]);
905: yv[idx+2] = PetscMax(yv[idx+2],val[2]);
906: yv[idx+3] = PetscMax(yv[idx+3],val[3]);
907: yv[idx+4] = PetscMax(yv[idx+4],val[4]);
908: yv[idx+5] = PetscMax(yv[idx+5],val[5]);
909: yv[idx+6] = PetscMax(yv[idx+6],val[6]);
910: yv[idx+7] = PetscMax(yv[idx+7],val[7]);
911: yv[idx+8] = PetscMax(yv[idx+8],val[8]);
912: yv[idx+9] = PetscMax(yv[idx+9],val[9]);
913: yv[idx+10] = PetscMax(yv[idx+10],val[10]);
914: yv[idx+11] = PetscMax(yv[idx+11],val[11]);
915: val += 12;
916: }
917: #endif
918: } else {SETERRQ(1,"Wrong insert option");}
919: count--;
920: }
921: }
922: /* wait on sends */
923: if (nsends) {
924: MPI_Waitall(nsends,swaits,sstatus);
925: }
926: VecRestoreArray(yin,&yv);
927: return(0);
928: }
930: /* --------------------------------------------------------------------------------------*/
932: int VecScatterBegin_PtoP_5(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
933: {
934: VecScatter_MPI_General *gen_to,*gen_from;
935: Scalar *xv,*yv,*val,*svalues;
936: MPI_Request *rwaits,*swaits;
937: int ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;
940: VecGetArray(xin,&xv);
941: if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
942: if (mode & SCATTER_REVERSE) {
943: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
944: gen_from = (VecScatter_MPI_General*)ctx->todata;
945: rwaits = gen_from->rev_requests;
946: swaits = gen_to->rev_requests;
947: } else {
948: gen_to = (VecScatter_MPI_General*)ctx->todata;
949: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
950: rwaits = gen_from->requests;
951: swaits = gen_to->requests;
952: }
953: svalues = gen_to->values;
954: nrecvs = gen_from->n;
955: nsends = gen_to->n;
956: indices = gen_to->indices;
957: sstarts = gen_to->starts;
959: if (!(mode & SCATTER_LOCAL)) {
961: if (gen_to->sendfirst) {
962: /* this version packs and sends one at a time */
963: val = svalues;
964: for (i=0; i<nsends; i++) {
965: iend = sstarts[i+1]-sstarts[i];
967: for (j=0; j<iend; j++) {
968: idx = *indices++;
969: val[0] = xv[idx];
970: val[1] = xv[idx+1];
971: val[2] = xv[idx+2];
972: val[3] = xv[idx+3];
973: val[4] = xv[idx+4];
974: val += 5;
975: }
976: MPI_Start_isend(5*iend,swaits+i);
977: }
978: }
980: if (!gen_from->use_readyreceiver) {
981: /* post receives since they were not posted in VecScatterPostRecvs() */
982: MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
983: }
985: if (!gen_to->sendfirst) {
986: /* this version packs all the messages together and sends */
987: /*
988: len = 5*sstarts[nsends];
989: val = svalues;
990: for (i=0; i<len; i += 5) {
991: idx = *indices++;
992: val[0] = xv[idx];
993: val[1] = xv[idx+1];
994: val[2] = xv[idx+2];
995: val[3] = xv[idx+3];
996: val[4] = xv[idx+4];
997: val += 5;
998: }
999: MPI_Startall_isend(len,nsends,swaits);
1000: */
1002: /* this version packs and sends one at a time */
1003: val = svalues;
1004: for (i=0; i<nsends; i++) {
1005: iend = sstarts[i+1]-sstarts[i];
1007: for (j=0; j<iend; j++) {
1008: idx = *indices++;
1009: val[0] = xv[idx];
1010: val[1] = xv[idx+1];
1011: val[2] = xv[idx+2];
1012: val[3] = xv[idx+3];
1013: val[4] = xv[idx+4];
1014: val += 5;
1015: }
1016: MPI_Start_isend(5*iend,swaits+i);
1017: }
1018: }
1019: }
1021: /* take care of local scatters */
1022: if (gen_to->local.n) {
1023: int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1024: int n = gen_to->local.n,il,ir;
1025: if (addv == INSERT_VALUES) {
1026: if (gen_to->local.is_copy) {
1027: PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1028: } else {
1029: for (i=0; i<n; i++) {
1030: il = fslots[i]; ir = tslots[i];
1031: yv[il] = xv[ir];
1032: yv[il+1] = xv[ir+1];
1033: yv[il+2] = xv[ir+2];
1034: yv[il+3] = xv[ir+3];
1035: yv[il+4] = xv[ir+4];
1036: }
1037: }
1038: } else if (addv == ADD_VALUES) {
1039: for (i=0; i<n; i++) {
1040: il = fslots[i]; ir = tslots[i];
1041: yv[il] += xv[ir];
1042: yv[il+1] += xv[ir+1];
1043: yv[il+2] += xv[ir+2];
1044: yv[il+3] += xv[ir+3];
1045: yv[il+4] += xv[ir+4];
1046: }
1047: #if !defined(PETSC_USE_COMPLEX)
1048: } else if (addv == MAX_VALUES) {
1049: for (i=0; i<n; i++) {
1050: il = fslots[i]; ir = tslots[i];
1051: yv[il] = PetscMax(yv[il],xv[ir]);
1052: yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1053: yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1054: yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1055: yv[il+4] = PetscMax(yv[il+4],xv[ir+4]);
1056: }
1057: #endif
1058: } else {SETERRQ(1,"Wrong insert option");}
1059: }
1060: VecRestoreArray(xin,&xv);
1061: if (xin != yin) {VecRestoreArray(yin,&yv);}
1062: return(0);
1063: }
1065: /* --------------------------------------------------------------------------------------*/
1067: int VecScatterEnd_PtoP_5(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1068: {
1069: VecScatter_MPI_General *gen_to,*gen_from;
1070: Scalar *rvalues,*yv,*val;
1071: int ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1072: MPI_Request *rwaits,*swaits;
1073: MPI_Status rstatus,*sstatus;
1076: if (mode & SCATTER_LOCAL) return(0);
1077: VecGetArray(yin,&yv);
1079: if (mode & SCATTER_REVERSE) {
1080: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
1081: gen_from = (VecScatter_MPI_General*)ctx->todata;
1082: rwaits = gen_from->rev_requests;
1083: swaits = gen_to->rev_requests;
1084: sstatus = gen_from->sstatus;
1085: } else {
1086: gen_to = (VecScatter_MPI_General*)ctx->todata;
1087: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1088: rwaits = gen_from->requests;
1089: swaits = gen_to->requests;
1090: sstatus = gen_to->sstatus;
1091: }
1092: rvalues = gen_from->values;
1093: nrecvs = gen_from->n;
1094: nsends = gen_to->n;
1095: indices = gen_from->indices;
1096: rstarts = gen_from->starts;
1098: /* wait on receives */
1099: count = nrecvs;
1100: while (count) {
1101: MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1102: /* unpack receives into our local space */
1103: val = rvalues + 5*rstarts[imdex];
1104: lindices = indices + rstarts[imdex];
1105: n = rstarts[imdex+1] - rstarts[imdex];
1106: if (addv == INSERT_VALUES) {
1107: for (i=0; i<n; i++) {
1108: idx = lindices[i];
1109: yv[idx] = val[0];
1110: yv[idx+1] = val[1];
1111: yv[idx+2] = val[2];
1112: yv[idx+3] = val[3];
1113: yv[idx+4] = val[4];
1114: val += 5;
1115: }
1116: } else if (addv == ADD_VALUES) {
1117: for (i=0; i<n; i++) {
1118: idx = lindices[i];
1119: yv[idx] += val[0];
1120: yv[idx+1] += val[1];
1121: yv[idx+2] += val[2];
1122: yv[idx+3] += val[3];
1123: yv[idx+4] += val[4];
1124: val += 5;
1125: }
1126: #if !defined(PETSC_USE_COMPLEX)
1127: } else if (addv == MAX_VALUES) {
1128: for (i=0; i<n; i++) {
1129: idx = lindices[i];
1130: yv[idx] = PetscMax(yv[idx],val[0]);
1131: yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1132: yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1133: yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1134: yv[idx+4] = PetscMax(yv[idx+4],val[4]);
1135: val += 5;
1136: }
1137: #endif
1138: } else {SETERRQ(1,"Wrong insert option");}
1139: count--;
1140: }
1141: /* wait on sends */
1142: if (nsends) {
1143: MPI_Waitall(nsends,swaits,sstatus);
1144: }
1145: VecRestoreArray(yin,&yv);
1146: return(0);
1147: }
1149: /* --------------------------------------------------------------------------------------*/
1151: int VecScatterBegin_PtoP_4(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1152: {
1153: VecScatter_MPI_General *gen_to,*gen_from;
1154: Scalar *xv,*yv,*val,*svalues;
1155: MPI_Request *rwaits,*swaits;
1156: int *indices,*sstarts,iend,i,j,nrecvs,nsends,idx,ierr,len;
1159: VecGetArray(xin,&xv);
1160: if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
1162: if (mode & SCATTER_REVERSE) {
1163: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
1164: gen_from = (VecScatter_MPI_General*)ctx->todata;
1165: rwaits = gen_from->rev_requests;
1166: swaits = gen_to->rev_requests;
1167: } else {
1168: gen_to = (VecScatter_MPI_General*)ctx->todata;
1169: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1170: rwaits = gen_from->requests;
1171: swaits = gen_to->requests;
1172: }
1173: svalues = gen_to->values;
1174: nrecvs = gen_from->n;
1175: nsends = gen_to->n;
1176: indices = gen_to->indices;
1177: sstarts = gen_to->starts;
1179: if (!(mode & SCATTER_LOCAL)) {
1181: if (!gen_from->use_readyreceiver && !gen_to->sendfirst) {
1182: /* post receives since they were not posted in VecScatterPostRecvs() */
1183: MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1184: }
1186: if (ctx->packtogether) {
1187: /* this version packs all the messages together and sends, when -vecscatter_packtogether used */
1188: len = 4*sstarts[nsends];
1189: val = svalues;
1190: for (i=0; i<len; i += 4) {
1191: idx = *indices++;
1192: val[0] = xv[idx];
1193: val[1] = xv[idx+1];
1194: val[2] = xv[idx+2];
1195: val[3] = xv[idx+3];
1196: val += 4;
1197: }
1198: MPI_Startall_isend(len,nsends,swaits);
1199: } else {
1200: /* this version packs and sends one at a time, default */
1201: val = svalues;
1202: for (i=0; i<nsends; i++) {
1203: iend = sstarts[i+1]-sstarts[i];
1205: for (j=0; j<iend; j++) {
1206: idx = *indices++;
1207: val[0] = xv[idx];
1208: val[1] = xv[idx+1];
1209: val[2] = xv[idx+2];
1210: val[3] = xv[idx+3];
1211: val += 4;
1212: }
1213: MPI_Start_isend(4*iend,swaits+i);
1214: }
1215: }
1217: if (!gen_from->use_readyreceiver && gen_to->sendfirst) {
1218: /* post receives since they were not posted in VecScatterPostRecvs() */
1219: MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1220: }
1221: }
1223: /* take care of local scatters */
1224: if (gen_to->local.n) {
1225: int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1226: int n = gen_to->local.n,il,ir;
1227: if (addv == INSERT_VALUES) {
1228: if (gen_to->local.is_copy) {
1229: PetscMemcpy(yv+gen_from->local.copy_start,xv+gen_to->local.copy_start,gen_to->local.copy_length);
1230: } else {
1231: for (i=0; i<n; i++) {
1232: il = fslots[i]; ir = tslots[i];
1233: yv[il] = xv[ir];
1234: yv[il+1] = xv[ir+1];
1235: yv[il+2] = xv[ir+2];
1236: yv[il+3] = xv[ir+3];
1237: }
1238: }
1239: } else if (addv == ADD_VALUES) {
1240: for (i=0; i<n; i++) {
1241: il = fslots[i]; ir = tslots[i];
1242: yv[il] += xv[ir];
1243: yv[il+1] += xv[ir+1];
1244: yv[il+2] += xv[ir+2];
1245: yv[il+3] += xv[ir+3];
1246: }
1247: #if !defined(PETSC_USE_COMPLEX)
1248: } else if (addv == MAX_VALUES) {
1249: for (i=0; i<n; i++) {
1250: il = fslots[i]; ir = tslots[i];
1251: yv[il] = PetscMax(yv[il],xv[ir]);
1252: yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1253: yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1254: yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1255: }
1256: #endif
1257: } else {SETERRQ(1,"Wrong insert option");}
1258: }
1259: VecRestoreArray(xin,&xv);
1260: if (xin != yin) {VecRestoreArray(yin,&yv);}
1261: return(0);
1262: }
1264: /* --------------------------------------------------------------------------------------*/
1266: int VecScatterEnd_PtoP_4(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1267: {
1268: VecScatter_MPI_General *gen_to,*gen_from;
1269: Scalar *rvalues,*yv,*val;
1270: int ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1271: MPI_Request *rwaits,*swaits;
1272: MPI_Status *rstatus,*sstatus;
1275: if (mode & SCATTER_LOCAL) return(0);
1276: VecGetArray(yin,&yv);
1278: if (mode & SCATTER_REVERSE) {
1279: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
1280: gen_from = (VecScatter_MPI_General*)ctx->todata;
1281: rwaits = gen_from->rev_requests;
1282: swaits = gen_to->rev_requests;
1283: sstatus = gen_from->sstatus;
1284: rstatus = gen_from->rstatus;
1285: } else {
1286: gen_to = (VecScatter_MPI_General*)ctx->todata;
1287: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1288: rwaits = gen_from->requests;
1289: swaits = gen_to->requests;
1290: sstatus = gen_to->sstatus;
1291: rstatus = gen_to->rstatus;
1292: }
1293: rvalues = gen_from->values;
1294: nrecvs = gen_from->n;
1295: nsends = gen_to->n;
1296: indices = gen_from->indices;
1297: rstarts = gen_from->starts;
1299: /* wait on receives */
1300: count = nrecvs;
1301: if (ctx->packtogether) { /* receive all messages, then unpack all, when -vecscatter_packtogether used */
1302: ierr = MPI_Waitall(nrecvs,rwaits,rstatus);
1303: n = rstarts[count];
1304: val = rvalues;
1305: lindices = indices;
1306: if (addv == INSERT_VALUES) {
1307: for (i=0; i<n; i++) {
1308: idx = lindices[i];
1309: yv[idx] = val[0];
1310: yv[idx+1] = val[1];
1311: yv[idx+2] = val[2];
1312: yv[idx+3] = val[3];
1313: val += 4;
1314: }
1315: } else if (addv == ADD_VALUES) {
1316: for (i=0; i<n; i++) {
1317: idx = lindices[i];
1318: yv[idx] += val[0];
1319: yv[idx+1] += val[1];
1320: yv[idx+2] += val[2];
1321: yv[idx+3] += val[3];
1322: val += 4;
1323: }
1324: #if !defined(PETSC_USE_COMPLEX)
1325: } else if (addv == MAX_VALUES) {
1326: for (i=0; i<n; i++) {
1327: idx = lindices[i];
1328: yv[idx] = PetscMax(yv[idx],val[0]);
1329: yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1330: yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1331: yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1332: val += 4;
1333: }
1334: #endif
1335: } else {SETERRQ(1,"Wrong insert option");}
1336: } else { /* unpack each message as it arrives, default version */
1337: while (count) {
1338: MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus[0]);
1339: /* unpack receives into our local space */
1340: val = rvalues + 4*rstarts[imdex];
1341: lindices = indices + rstarts[imdex];
1342: n = rstarts[imdex+1] - rstarts[imdex];
1343: if (addv == INSERT_VALUES) {
1344: for (i=0; i<n; i++) {
1345: idx = lindices[i];
1346: yv[idx] = val[0];
1347: yv[idx+1] = val[1];
1348: yv[idx+2] = val[2];
1349: yv[idx+3] = val[3];
1350: val += 4;
1351: }
1352: } else if (addv == ADD_VALUES) {
1353: for (i=0; i<n; i++) {
1354: idx = lindices[i];
1355: yv[idx] += val[0];
1356: yv[idx+1] += val[1];
1357: yv[idx+2] += val[2];
1358: yv[idx+3] += val[3];
1359: val += 4;
1360: }
1361: #if !defined(PETSC_USE_COMPLEX)
1362: } else if (addv == MAX_VALUES) {
1363: for (i=0; i<n; i++) {
1364: idx = lindices[i];
1365: yv[idx] = PetscMax(yv[idx],val[0]);
1366: yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1367: yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1368: yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1369: val += 4;
1370: }
1371: #endif
1372: } else {SETERRQ(1,"Wrong insert option");}
1373: count--;
1374: }
1375: }
1377: /* wait on sends */
1378: if (nsends) {
1379: MPI_Waitall(nsends,swaits,sstatus);
1380: }
1381: VecRestoreArray(yin,&yv);
1382: return(0);
1383: }
1385: /* --------------------------------------------------------------------------------------*/
1387: int VecScatterBegin_PtoP_3(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1388: {
1389: VecScatter_MPI_General *gen_to,*gen_from;
1390: Scalar *xv,*yv,*val,*svalues;
1391: MPI_Request *rwaits,*swaits;
1392: int ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;
1395: VecGetArray(xin,&xv);
1396: if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
1398: if (mode & SCATTER_REVERSE) {
1399: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
1400: gen_from = (VecScatter_MPI_General*)ctx->todata;
1401: rwaits = gen_from->rev_requests;
1402: swaits = gen_to->rev_requests;
1403: } else {
1404: gen_to = (VecScatter_MPI_General*)ctx->todata;
1405: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1406: rwaits = gen_from->requests;
1407: swaits = gen_to->requests;
1408: }
1409: svalues = gen_to->values;
1410: nrecvs = gen_from->n;
1411: nsends = gen_to->n;
1412: indices = gen_to->indices;
1413: sstarts = gen_to->starts;
1415: if (!(mode & SCATTER_LOCAL)) {
1417: if (gen_to->sendfirst) {
1418: /* this version packs and sends one at a time */
1419: val = svalues;
1420: for (i=0; i<nsends; i++) {
1421: iend = sstarts[i+1]-sstarts[i];
1423: for (j=0; j<iend; j++) {
1424: idx = *indices++;
1425: val[0] = xv[idx];
1426: val[1] = xv[idx+1];
1427: val[2] = xv[idx+2];
1428: val += 3;
1429: }
1430: MPI_Start_isend(3*iend,swaits+i);
1431: }
1432: }
1434: if (!gen_from->use_readyreceiver) {
1435: /* post receives since they were not posted in VecScatterPostRecvs() */
1436: MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1437: }
1439: if (!gen_to->sendfirst) {
1440: /* this version packs all the messages together and sends */
1441: /*
1442: len = 3*sstarts[nsends];
1443: val = svalues;
1444: for (i=0; i<len; i += 3) {
1445: idx = *indices++;
1446: val[0] = xv[idx];
1447: val[1] = xv[idx+1];
1448: val[2] = xv[idx+2];
1449: val += 3;
1450: }
1451: MPI_Startall_isend(len,nsends,swaits);
1452: */
1454: /* this version packs and sends one at a time */
1455: val = svalues;
1456: for (i=0; i<nsends; i++) {
1457: iend = sstarts[i+1]-sstarts[i];
1459: for (j=0; j<iend; j++) {
1460: idx = *indices++;
1461: val[0] = xv[idx];
1462: val[1] = xv[idx+1];
1463: val[2] = xv[idx+2];
1464: val += 3;
1465: }
1466: MPI_Start_isend(3*iend,swaits+i);
1467: }
1468: }
1469: }
1471: /* take care of local scatters */
1472: if (gen_to->local.n) {
1473: int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1474: int n = gen_to->local.n,il,ir;
1475: if (addv == INSERT_VALUES) {
1476: if (gen_to->local.is_copy) {
1477: PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1478: } else {
1479: for (i=0; i<n; i++) {
1480: il = fslots[i]; ir = tslots[i];
1481: yv[il] = xv[ir];
1482: yv[il+1] = xv[ir+1];
1483: yv[il+2] = xv[ir+2];
1485: }
1486: }
1487: } else if (addv == ADD_VALUES) {
1488: for (i=0; i<n; i++) {
1489: il = fslots[i]; ir = tslots[i];
1490: yv[il] += xv[ir];
1491: yv[il+1] += xv[ir+1];
1492: yv[il+2] += xv[ir+2];
1494: }
1495: #if !defined(PETSC_USE_COMPLEX)
1496: } else if (addv == MAX_VALUES) {
1497: for (i=0; i<n; i++) {
1498: il = fslots[i]; ir = tslots[i];
1499: yv[il] = PetscMax(yv[il],xv[ir]);
1500: yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1501: yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1503: }
1504: #endif
1505: } else {SETERRQ(1,"Wrong insert option");}
1506: }
1507: VecRestoreArray(xin,&xv);
1508: if (xin != yin) {VecRestoreArray(yin,&yv);}
1509: return(0);
1510: }
1512: /* --------------------------------------------------------------------------------------*/
1514: int VecScatterEnd_PtoP_3(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1515: {
1516: VecScatter_MPI_General *gen_to,*gen_from;
1517: Scalar *rvalues,*yv,*val;
1518: int ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1519: MPI_Request *rwaits,*swaits;
1520: MPI_Status rstatus,*sstatus;
1523: if (mode & SCATTER_LOCAL) return(0);
1524: VecGetArray(yin,&yv);
1526: if (mode & SCATTER_REVERSE) {
1527: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
1528: gen_from = (VecScatter_MPI_General*)ctx->todata;
1529: rwaits = gen_from->rev_requests;
1530: swaits = gen_to->rev_requests;
1531: sstatus = gen_from->sstatus;
1532: } else {
1533: gen_to = (VecScatter_MPI_General*)ctx->todata;
1534: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1535: rwaits = gen_from->requests;
1536: swaits = gen_to->requests;
1537: sstatus = gen_to->sstatus;
1538: }
1539: rvalues = gen_from->values;
1540: nrecvs = gen_from->n;
1541: nsends = gen_to->n;
1542: indices = gen_from->indices;
1543: rstarts = gen_from->starts;
1545: /* wait on receives */
1546: count = nrecvs;
1547: while (count) {
1548: MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1549: /* unpack receives into our local space */
1550: val = rvalues + 3*rstarts[imdex];
1551: lindices = indices + rstarts[imdex];
1552: n = rstarts[imdex+1] - rstarts[imdex];
1553: if (addv == INSERT_VALUES) {
1554: for (i=0; i<n; i++) {
1555: idx = lindices[i];
1556: yv[idx] = val[0];
1557: yv[idx+1] = val[1];
1558: yv[idx+2] = val[2];
1559: val += 3;
1560: }
1561: } else if (addv == ADD_VALUES) {
1562: for (i=0; i<n; i++) {
1563: idx = lindices[i];
1564: yv[idx] += val[0];
1565: yv[idx+1] += val[1];
1566: yv[idx+2] += val[2];
1567: val += 3;
1568: }
1569: #if !defined(PETSC_USE_COMPLEX)
1570: } else if (addv == MAX_VALUES) {
1571: for (i=0; i<n; i++) {
1572: idx = lindices[i];
1573: yv[idx] = PetscMax(yv[idx],val[0]);
1574: yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1575: yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1576: val += 3;
1577: }
1578: #endif
1579: } else {SETERRQ(1,"Wrong insert option");}
1580: count--;
1581: }
1582: /* wait on sends */
1583: if (nsends) {
1584: MPI_Waitall(nsends,swaits,sstatus);
1585: }
1586: VecRestoreArray(yin,&yv);
1587: return(0);
1588: }
1590: /* --------------------------------------------------------------------------------------*/
1592: int VecScatterBegin_PtoP_2(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1593: {
1594: VecScatter_MPI_General *gen_to,*gen_from;
1595: Scalar *xv,*yv,*val,*svalues;
1596: MPI_Request *rwaits,*swaits;
1597: int ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;
1600: VecGetArray(xin,&xv);
1601: if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
1602: if (mode & SCATTER_REVERSE) {
1603: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
1604: gen_from = (VecScatter_MPI_General*)ctx->todata;
1605: rwaits = gen_from->rev_requests;
1606: swaits = gen_to->rev_requests;
1607: } else {
1608: gen_to = (VecScatter_MPI_General*)ctx->todata;
1609: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1610: rwaits = gen_from->requests;
1611: swaits = gen_to->requests;
1612: }
1613: svalues = gen_to->values;
1614: nrecvs = gen_from->n;
1615: nsends = gen_to->n;
1616: indices = gen_to->indices;
1617: sstarts = gen_to->starts;
1619: if (!(mode & SCATTER_LOCAL)) {
1621: if (gen_to->sendfirst) {
1622: /* this version packs and sends one at a time */
1623: val = svalues;
1624: for (i=0; i<nsends; i++) {
1625: iend = sstarts[i+1]-sstarts[i];
1627: for (j=0; j<iend; j++) {
1628: idx = *indices++;
1629: val[0] = xv[idx];
1630: val[1] = xv[idx+1];
1631: val += 2;
1632: }
1633: MPI_Start_isend(2*iend,swaits+i);
1634: }
1635: }
1637: if (!gen_from->use_readyreceiver) {
1638: /* post receives since they were not posted in VecScatterPostRecvs() */
1639: MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1640: }
1642: if (!gen_to->sendfirst) {
1643: /* this version packs all the messages together and sends */
1644: /*
1645: len = 2*sstarts[nsends];
1646: val = svalues;
1647: for (i=0; i<len; i += 2) {
1648: idx = *indices++;
1649: val[0] = xv[idx];
1650: val[1] = xv[idx+1];
1651: val += 2;
1652: }
1653: MPI_Startall_isend(len,nsends,swaits);
1654: */
1656: /* this version packs and sends one at a time */
1657: val = svalues;
1658: for (i=0; i<nsends; i++) {
1659: iend = sstarts[i+1]-sstarts[i];
1661: for (j=0; j<iend; j++) {
1662: idx = *indices++;
1663: val[0] = xv[idx];
1664: val[1] = xv[idx+1];
1665: val += 2;
1666: }
1667: MPI_Start_isend(2*iend,swaits+i);
1668: }
1669: }
1670: }
1672: /* take care of local scatters */
1673: if (gen_to->local.n) {
1674: int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1675: int n = gen_to->local.n,il,ir;
1676: if (addv == INSERT_VALUES) {
1677: if (gen_to->local.is_copy) {
1678: PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1679: } else {
1680: for (i=0; i<n; i++) {
1681: il = fslots[i]; ir = tslots[i];
1682: yv[il] = xv[ir];
1683: yv[il+1] = xv[ir+1];
1684: }
1685: }
1686: } else if (addv == ADD_VALUES) {
1687: for (i=0; i<n; i++) {
1688: il = fslots[i]; ir = tslots[i];
1689: yv[il] += xv[ir];
1690: yv[il+1] += xv[ir+1];
1691: }
1692: #if !defined(PETSC_USE_COMPLEX)
1693: } else if (addv == MAX_VALUES) {
1694: for (i=0; i<n; i++) {
1695: il = fslots[i]; ir = tslots[i];
1696: yv[il] = PetscMax(yv[il],xv[ir]);
1697: yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1698: }
1699: #endif
1700: } else {SETERRQ(1,"Wrong insert option");}
1701: }
1702: VecRestoreArray(xin,&xv);
1703: if (xin != yin) {VecRestoreArray(yin,&yv);}
1704: return(0);
1705: }
1707: /* --------------------------------------------------------------------------------------*/
1709: int VecScatterEnd_PtoP_2(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1710: {
1711: VecScatter_MPI_General *gen_to,*gen_from;
1712: Scalar *rvalues,*yv,*val;
1713: int ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1714: MPI_Request *rwaits,*swaits;
1715: MPI_Status rstatus,*sstatus;
1718: if (mode & SCATTER_LOCAL) return(0);
1719: VecGetArray(yin,&yv);
1721: if (mode & SCATTER_REVERSE) {
1722: gen_to = (VecScatter_MPI_General*)ctx->fromdata;
1723: gen_from = (VecScatter_MPI_General*)ctx->todata;
1724: rwaits = gen_from->rev_requests;
1725: swaits = gen_to->rev_requests;
1726: sstatus = gen_from->sstatus;
1727: } else {
1728: gen_to = (VecScatter_MPI_General*)ctx->todata;
1729: gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1730: rwaits = gen_from->requests;
1731: swaits = gen_to->requests;
1732: sstatus = gen_to->sstatus;
1733: }
1734: rvalues = gen_from->values;
1735: nrecvs = gen_from->n;
1736: nsends = gen_to->n;
1737: indices = gen_from->indices;
1738: rstarts = gen_from->starts;
1740: /* wait on receives */
1741: count = nrecvs;
1742: while (count) {
1743: MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1744: /* unpack receives into our local space */
1745: val = rvalues + 2*rstarts[imdex];
1746: lindices = indices + rstarts[imdex];
1747: n = rstarts[imdex+1] - rstarts[imdex];
1748: if (addv == INSERT_VALUES) {
1749: for (i=0; i<n; i++) {
1750: idx = lindices[i];
1751: yv[idx] = val[0];
1752: yv[idx+1] = val[1];
1753: val += 2;
1754: }
1755: } else if (addv == ADD_VALUES) {
1756: for (i=0; i<n; i++) {
1757: idx = lindices[i];
1758: yv[idx] += val[0];
1759: yv[idx+1] += val[1];
1760: val += 2;
1761: }
1762: #if !defined(PETSC_USE_COMPLEX)
1763: } else if (addv == MAX_VALUES) {
1764: for (i=0; i<n; i++) {
1765: idx = lindices[i];
1766: yv[idx] = PetscMax(yv[idx],val[0]);
1767: yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1768: val += 2;
1769: }
1770: #endif
1771: } else {SETERRQ(1,"Wrong insert option");}
1772: count--;
1773: }
1774: /* wait on sends */
1775: if (nsends) {
1776: MPI_Waitall(nsends,swaits,sstatus);
1777: }
1778: VecRestoreArray(yin,&yv);
1779: return(0);
1780: }
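/*
   The receive side above is the usual "consume in completion order" pattern:
   MPI_Waitany() returns whichever persistent receive finished first, and the
   returned index selects the matching slice of the receive buffer through
   rstarts[].  Stripped to its skeleton (unpack_block() is a hypothetical
   helper standing in for the INSERT/ADD/MAX loops):

      count = nrecvs;
      while (count--) {
        MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
        unpack_block(rvalues + 2*rstarts[imdex],         data that just arrived
                     indices + rstarts[imdex],           destinations in yv
                     rstarts[imdex+1]-rstarts[imdex],    number of blocks
                     addv);
      }
*/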
1782: /* ---------------------------------------------------------------------------------*/
1784: int VecScatterDestroy_PtoP_X(VecScatter ctx)
1785: {
1786: VecScatter_MPI_General *gen_to = (VecScatter_MPI_General*)ctx->todata;
1787: VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1788: int i,ierr;
1791: if (gen_to->use_readyreceiver) {
1792: /*
1793: Since the receives have already been posted (ready-receiver mode posts them
1794: eagerly), we must cancel them before freeing the requests
1795: */
1796: for (i=0; i<gen_from->n; i++) {
1797: MPI_Cancel(gen_from->requests+i);
1798: }
1799: }
1801: if (gen_to->local.slots) {PetscFree(gen_to->local.slots);}
1802: if (gen_from->local.slots) {PetscFree(gen_from->local.slots);}
1803: if (gen_to->local.slots_nonmatching) {PetscFree(gen_to->local.slots_nonmatching);}
1804: if (gen_from->local.slots_nonmatching) {PetscFree(gen_from->local.slots_nonmatching);}
1806: /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
1807: /*
1808: IBM's PE version of MPI has a bug where freeing these requests corrupts later
1809: message passing.
1810: */
1811: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
1812: for (i=0; i<gen_to->n; i++) {
1813: MPI_Request_free(gen_to->requests + i);
1814: MPI_Request_free(gen_to->rev_requests + i);
1815: }
1817: /*
1818: MPICH could not properly cancel requests, so with ready-receiver mode we
1819: cannot free the requests. That may be fixed by now; if not, wrap the following
1820: loop in an if (!gen_to->use_readyreceiver) { ... } block.
1821: */
1822: for (i=0; i<gen_from->n; i++) {
1823: MPI_Request_free(gen_from->requests + i);
1824: MPI_Request_free(gen_from->rev_requests + i);
1825: }
1826: #endif
1827:
1828: PetscFree(gen_to->sstatus);
1829: PetscFree(gen_to->values);
1830: PetscFree(gen_to->rev_requests);
1831: PetscFree(gen_to);
1832: PetscFree(gen_from->values);
1833: PetscFree(gen_from->rev_requests);
1834: PetscFree(gen_from);
1835: PetscLogObjectDestroy(ctx);
1836: PetscHeaderDestroy(ctx);
1837: return(0);
1838: }
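/*
   Teardown rule illustrated by the routine above: a persistent receive that
   has already been started (ready-receiver mode posts them eagerly) must be
   cancelled before its request is freed.  A minimal sketch in plain MPI,
   assuming buf, len, src, tag and comm are set up elsewhere:

      MPI_Request req;
      MPI_Recv_init(buf,len,MPIU_SCALAR,src,tag,comm,&req);
      MPI_Start(&req);                     posted, but may never be matched
      ...
      MPI_Cancel(&req);                    withdraw the posted receive
      MPI_Wait(&req,MPI_STATUS_IGNORE);    let the cancellation complete
      MPI_Request_free(&req);              now safe, barring the MPI bugs noted above
*/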
1840: /* ==========================================================================================*/
1842: /* create parallel to sequential scatter context */
1843: /*
1844: bs indicates how many elements there are in each block. Normally
1845: this would be 1.
1846: */
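/*
   With bs > 1 each scatter index is taken as the first entry of a block of bs
   consecutive vector entries; e.g. with bs = 2 a scatter index of 12 moves
   vector entries 12 and 13 (see the xv[idx], xv[idx+1] accesses in
   VecScatterBegin_PtoP_2() above).
*/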
1847: int VecScatterCreate_PtoS(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,int bs,VecScatter ctx)
1848: {
1849: VecScatter_MPI_General *from,*to;
1850: int *source,*lens,rank,*owners;
1851: int size,*lowner,*start,lengthy;
1852: int *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work;
1853: int *owner,*starts,count,tag,slen,ierr;
1854: int *rvalues,*svalues,base,imdex,nmax,*values,len,*indx,nprocslocal;
1855: MPI_Comm comm;
1856: MPI_Request *send_waits,*recv_waits;
1857: MPI_Status recv_status,*send_status;
1858: Map map;
1859: PetscTruth found;
1860:
1862: PetscObjectGetNewTag((PetscObject)ctx,&tag);
1863: PetscObjectGetComm((PetscObject)xin,&comm);
1864: MPI_Comm_rank(comm,&rank);
1865: MPI_Comm_size(comm,&size);
1866: VecGetMap(xin,&map);
1867: MapGetGlobalRange(map,&owners);
1869: VecGetSize(yin,&lengthy);
1871: /* first count number of contributors to each processor */
1872: PetscMalloc(2*size*sizeof(int),&nprocs);
1873: ierr = PetscMemzero(nprocs,2*size*sizeof(int));
1874: procs = nprocs + size;
1875: PetscMalloc((nx+1)*sizeof(int),&owner);
1876: for (i=0; i<nx; i++) {
1877: idx = inidx[i];
1878: found = PETSC_FALSE;
1879: for (j=0; j<size; j++) {
1880: if (idx >= owners[j] && idx < owners[j+1]) {
1881: nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
1882: }
1883: }
1884: if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
1885: }
1886: nprocslocal = nprocs[rank];
1887: nprocs[rank] = procs[rank] = 0;
1888: nsends = 0; for (i=0; i<size; i++) { nsends += procs[i];}
1890: /* inform other processors of number of messages and max length*/
1891: PetscMalloc(2*size*sizeof(int),&work);
1892: ierr = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
1893: nmax = work[rank];
1894: nrecvs = work[size+rank];
1895: ierr = PetscFree(work);
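/*
   What the PetscMaxSum_Op reduction above computes, as inferred from its use
   here: element-wise over the 2*size-long arrays, the first half is combined
   with max and the second half with sum, so afterwards
     work[rank]      = longest single message headed for this processor
                       (used to size each nmax-int receive slot below), and
     work[size+rank] = number of processors that will send to this one.
   Tiny example with size = 2, seen from rank 1:
     rank 0 contributes nprocs = {0, 3, 0, 1}   (3 indices destined for rank 1)
     rank 1 contributes nprocs = {2, 0, 1, 0}   (2 indices destined for rank 0)
     reduced     work          = {2, 3, 1, 1}   -> nmax = 3, nrecvs = 1 on rank 1
*/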
1897: /* post receives: */
1898: PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
1899: PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
1900: for (i=0; i<nrecvs; i++) {
1901: MPI_Irecv((rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1902: }
1904: /* do sends:
1905: 1) starts[i] gives the starting index in svalues for stuff going to
1906: the ith processor
1907: */
1908: PetscMalloc((nx+1)*sizeof(int),&svalues);
1909: PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
1910: PetscMalloc((size+1)*sizeof(int),&starts);
1911: starts[0] = 0;
1912: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1913: for (i=0; i<nx; i++) {
1914: if (owner[i] != rank) {
1915: svalues[starts[owner[i]]++] = inidx[i];
1916: }
1917: }
1919: starts[0] = 0;
1920: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1921: count = 0;
1922: for (i=0; i<size; i++) {
1923: if (procs[i]) {
1924: MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
1925: }
1926: }
1927: PetscFree(starts);
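/*
   The two passes over starts[] above amount to a counting sort: the first
   pass turns the per-destination counts nprocs[] into starting offsets, the
   packing loop advances starts[owner[i]] as each index is dropped into
   svalues[], and the second pass rebuilds the offsets so each send knows
   where its destination's block begins.  Worked example with size = 3 and
   nprocs = {2,0,1} (this rank, rank 1 here, sends nothing to itself):
       after first pass:   starts = {0,2,2}
       after packing loop: starts = {2,2,3}      each entry bumped past its block
       after second pass:  starts = {0,2,2,3}    block i is svalues[starts[i]..starts[i+1])
*/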
1929: base = owners[rank];
1931: /* wait on receives */
1932: ierr = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
1933: source = lens + nrecvs;
1934: count = nrecvs;
1935: slen = 0;
1936: while (count) {
1937: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1938: /* unpack receives into our local space */
1939: MPI_Get_count(&recv_status,MPI_INT,&n);
1940: source[imdex] = recv_status.MPI_SOURCE;
1941: lens[imdex] = n;
1942: slen += n;
1943: count--;
1944: }
1945: PetscFree(recv_waits);
1946:
1947: /* allocate entire send scatter context */
1948: PetscNew(VecScatter_MPI_General,&to);
1949: PetscMemzero(to,sizeof(VecScatter_MPI_General));
1950: PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
1951: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst); /* query after the memzero so the flag is not wiped out */
1952: len = slen*sizeof(int) + bs*slen*sizeof(Scalar) + (nrecvs+1)*sizeof(int) +
1953: nrecvs*(sizeof(int) + sizeof(MPI_Request));
1954: to->n = nrecvs;
1955: PetscMalloc(len,&to->values);
1956: PetscLogObjectMemory(ctx,len);
1957: to->requests = (MPI_Request*)(to->values + bs*slen);
1958: to->indices = (int*)(to->requests + nrecvs);
1959: to->starts = (int*)(to->indices + slen);
1960: to->procs = (int*)(to->starts + nrecvs + 1);
1961: ierr = PetscMalloc(2*(PetscMax(nrecvs,nsends)+1)*sizeof(MPI_Status),&to->sstatus);
1962: to->rstatus = to->sstatus + PetscMax(nrecvs,nsends) + 1;
1963: PetscLogObjectMemory(ctx,2*(PetscMax(nrecvs,nsends)+1)*sizeof(MPI_Status));
1964: ctx->todata = (void*)to;
1965: to->starts[0] = 0;
1967: if (nrecvs) {
1968: PetscMalloc(nrecvs*sizeof(int),&indx);
1969: for (i=0; i<nrecvs; i++) indx[i] = i;
1970: PetscSortIntWithPermutation(nrecvs,source,indx);
1972: /* move the data into the send scatter */
1973: for (i=0; i<nrecvs; i++) {
1974: to->starts[i+1] = to->starts[i] + lens[indx[i]];
1975: to->procs[i] = source[indx[i]];
1976: values = rvalues + indx[i]*nmax;
1977: for (j=0; j<lens[indx[i]]; j++) {
1978: to->indices[to->starts[i] + j] = values[j] - base;
1979: }
1980: }
1981: PetscFree(indx);
1982: }
1983: PetscFree(rvalues);
1984: PetscFree(lens);
1985:
1986: /* allocate entire receive scatter context */
1987: PetscNew(VecScatter_MPI_General,&from);
1988: PetscMemzero(from,sizeof(VecScatter_MPI_General));
1989: PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
1990: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst); /* query after the memzero so the flag is not wiped out */
1991: len = ny*sizeof(int) + ny*bs*sizeof(Scalar) + (nsends+1)*sizeof(int) +
1992: nsends*(sizeof(int) + sizeof(MPI_Request));
1993: from->n = nsends;
1994: PetscMalloc(len,&from->values);
1995: PetscLogObjectMemory(ctx,len);
1996: from->requests = (MPI_Request*)(from->values + bs*ny);
1997: from->indices = (int*)(from->requests + nsends);
1998: from->starts = (int*)(from->indices + ny);
1999: from->procs = (int*)(from->starts + nsends + 1);
2000: ctx->fromdata = (void*)from;
2002: /* move data into receive scatter */
2003: PetscMalloc((size+nsends+1)*sizeof(int),&lowner);
2004: start = lowner + size;
2005: count = 0; from->starts[0] = start[0] = 0;
2006: for (i=0; i<size; i++) {
2007: if (procs[i]) {
2008: lowner[i] = count;
2009: from->procs[count++] = i;
2010: from->starts[count] = start[count] = start[count-1] + nprocs[i];
2011: }
2012: }
2013: for (i=0; i<nx; i++) {
2014: if (owner[i] != rank) {
2015: from->indices[start[lowner[owner[i]]]++] = inidy[i];
2016: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2017: }
2018: }
2019: PetscFree(lowner);
2020: PetscFree(owner);
2021: PetscFree(nprocs);
2022:
2023: /* wait on sends */
2024: if (nsends) {
2025: PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2026: MPI_Waitall(nsends,send_waits,send_status);
2027: PetscFree(send_status);
2028: }
2029: PetscFree(send_waits);
2030: PetscFree(svalues);
2032: if (nprocslocal) {
2033: int nt;
2034: /* we have a scatter to ourselves */
2035: from->local.n = to->local.n = nt = nprocslocal;
2036: PetscMalloc(nt*sizeof(int),&from->local.slots);
2037: PetscMalloc(nt*sizeof(int),&to->local.slots);
2038: PetscLogObjectMemory(ctx,2*nt*sizeof(int));
2039: nt = 0;
2040: for (i=0; i<nx; i++) {
2041: idx = inidx[i];
2042: if (idx >= owners[rank] && idx < owners[rank+1]) {
2043: to->local.slots[nt] = idx - owners[rank];
2044: from->local.slots[nt++] = inidy[i];
2045: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2046: }
2047: }
2048: } else {
2049: from->local.n = 0;
2050: from->local.slots = 0;
2051: to->local.n = 0;
2052: to->local.slots = 0;
2053: }
2054: from->local.nonmatching_computed = PETSC_FALSE;
2055: from->local.n_nonmatching = 0;
2056: from->local.slots_nonmatching = 0;
2057: to->local.nonmatching_computed = PETSC_FALSE;
2058: to->local.n_nonmatching = 0;
2059: to->local.slots_nonmatching = 0;
2061: to->type = VEC_SCATTER_MPI_GENERAL;
2062: from->type = VEC_SCATTER_MPI_GENERAL;
2064: from->bs = bs;
2065: to->bs = bs;
2066: if (bs > 1) {
2067: PetscTruth flg,flgs = PETSC_FALSE;
2068: int *sstarts = to->starts, *rstarts = from->starts;
2069: int *sprocs = to->procs, *rprocs = from->procs;
2070: MPI_Request *swaits = to->requests,*rwaits = from->requests;
2071: MPI_Request *rev_swaits,*rev_rwaits;
2072: Scalar *Ssvalues = to->values, *Srvalues = from->values;
2074: ctx->destroy = VecScatterDestroy_PtoP_X;
2075: ctx->copy = VecScatterCopy_PtoP_X;
2076: ctx->view = VecScatterView_MPI;
2077:
2078: tag = ctx->tag;
2079: comm = ctx->comm;
2081: /* allocate additional wait variables for the "reverse" scatter */
2082: PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&rev_rwaits);
2083: to->rev_requests = rev_rwaits;
2084: PetscMalloc((nsends+1)*sizeof(MPI_Request),&rev_swaits);
2085: from->rev_requests = rev_swaits;
2086: PetscLogObjectMemory(ctx,(nsends+nrecvs+2)*sizeof(MPI_Request));
2088: /* Register the persistent receives used later (and the matching sends for the reverse scatter) */
2089: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flgs);
2090: if (flgs) {
2091: PetscLogInfo(0,"VecScatterCreate_PtoS:Using VecScatter Ssend mode\n");
2092: }
2093: for (i=0; i<from->n; i++) {
2094: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2095: comm,rwaits+i);
2096: if (!flgs) {
2097: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2098: comm,rev_swaits+i);
2099: } else {
2100: MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2101: comm,rev_swaits+i);
2102: }
2103: }
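/*
   The MPI_Recv_init()/MPI_Send_init()/MPI_Ssend_init() calls above only
   *register* persistent requests; no data moves until a later MPI_Start() or
   MPI_Startall() in the scatter begin/end routines.  Life cycle of one such
   request, as a sketch (buf, count, src and niter are placeholders; the real
   buffers are the slices of from->values selected by rstarts[]):

      MPI_Request r;
      MPI_Recv_init(buf,count,MPIU_SCALAR,src,tag,comm,&r);    once, at setup
      for (i=0; i<niter; i++) {
        MPI_Start(&r);                                         re-arm the receive
        ... post sends, overlap other work ...
        MPI_Wait(&r,MPI_STATUS_IGNORE);                        buf now holds the data
      }
      MPI_Request_free(&r);                                    once, at destroy
*/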
2105: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
2106: if (flg) {
2107: ctx->postrecvs = VecScatterPostRecvs_PtoP_X;
2108: to->use_readyreceiver = PETSC_TRUE;
2109: from->use_readyreceiver = PETSC_TRUE;
2110: for (i=0; i<to->n; i++) {
2111: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2112: comm,swaits+i);
2113: }
2114: PetscLogInfo(0,"VecScatterCreate_PtoS:Using VecScatter ready receiver mode\n");
2115: } else {
2116: ctx->postrecvs = 0;
2117: to->use_readyreceiver = PETSC_FALSE;
2118: from->use_readyreceiver = PETSC_FALSE;
2119: for (i=0; i<to->n; i++) {
2120: if (!flgs) {
2121: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2122: comm,swaits+i);
2123: } else {
2124: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2125: comm,swaits+i);
2126: }
2127: }
2128: }
2129: /* Register receives for scatter reverse */
2130: for (i=0; i<to->n; i++) {
2131: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2132: comm,rev_rwaits+i);
2133: }
2135: PetscLogInfo(0,"VecScatterCreate_PtoS:Using blocksize %d scatter\n",bs);
2136: switch (bs) {
2137: case 12:
2138: ctx->begin = VecScatterBegin_PtoP_12;
2139: ctx->end = VecScatterEnd_PtoP_12;
2140: break;
2141: case 5:
2142: ctx->begin = VecScatterBegin_PtoP_5;
2143: ctx->end = VecScatterEnd_PtoP_5;
2144: break;
2145: case 4:
2146: ctx->begin = VecScatterBegin_PtoP_4;
2147: ctx->end = VecScatterEnd_PtoP_4;
2148: break;
2149: case 3:
2150: ctx->begin = VecScatterBegin_PtoP_3;
2151: ctx->end = VecScatterEnd_PtoP_3;
2152: break;
2153: case 2:
2154: ctx->begin = VecScatterBegin_PtoP_2;
2155: ctx->end = VecScatterEnd_PtoP_2;
2156: break;
2157: default:
2158: SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
2159: }
2160: } else {
2161: ctx->postrecvs = 0;
2162: ctx->destroy = VecScatterDestroy_PtoP;
2163: ctx->begin = VecScatterBegin_PtoP;
2164: ctx->end = VecScatterEnd_PtoP;
2165: ctx->copy = VecScatterCopy_PtoP;
2166: ctx->view = VecScatterView_MPI;
2167: }
2169: /* Check if the local scatter is actually a copy; important special case */
2170: if (nprocslocal) {
2171: VecScatterLocalOptimizeCopy_Private(&to->local,&from->local,bs);
2172: }
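/*
   "Actually a copy" means both local slot lists are runs of consecutive
   locations, so VecScatterBegin_PtoP_*() can replace the slot-by-slot loop
   with the single PetscMemcpy() on its is_copy branch.  A condensed,
   hypothetical form of the test such an optimization has to make (the real
   check lives in VecScatterLocalOptimizeCopy_Private()):

      is_copy = PETSC_TRUE;
      for (i=1; i<n; i++) {
        if (to_slots[i]   != to_slots[i-1]   + bs ||
            from_slots[i] != from_slots[i-1] + bs) {is_copy = PETSC_FALSE; break;}
      }
      if (is_copy) {
        copy_start  = to_slots[0];                 offset of the first entry
        copy_length = bs*n*sizeof(Scalar);         bytes handed to PetscMemcpy()
      }
*/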
2174: return(0);
2175: }
2177: /* ------------------------------------------------------------------------------------*/
2178: /*
2179: Scatter from local Seq vectors to a parallel vector.
2180: */
2181: int VecScatterCreate_StoP(int nx,int *inidx,int ny,int *inidy,Vec yin,VecScatter ctx)
2182: {
2183: Vec_MPI *y = (Vec_MPI *)yin->data;
2184: VecScatter_MPI_General *from,*to;
2185: int *source,nprocslocal,*lens,rank = y->rank,*owners = yin->map->range;
2186: int ierr,size = y->size,*lowner,*start;
2187: int *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work;
2188: int *owner,*starts,count,tag,slen;
2189: int *rvalues,*svalues,base,imdex,nmax,*values,len;
2190: PetscTruth found;
2191: MPI_Comm comm = yin->comm;
2192: MPI_Request *send_waits,*recv_waits;
2193: MPI_Status recv_status,*send_status;
2196: PetscObjectGetNewTag((PetscObject)ctx,&tag);
2197: /* first count number of contributors to each processor */
2198: PetscMalloc(2*size*sizeof(int),&nprocs);
2199: ierr = PetscMemzero(nprocs,2*size*sizeof(int));
2200: procs = nprocs + size;
2201: PetscMalloc((nx+1)*sizeof(int),&owner);
2202: for (i=0; i<nx; i++) {
2203: idx = inidy[i];
2204: found = PETSC_FALSE;
2205: for (j=0; j<size; j++) {
2206: if (idx >= owners[j] && idx < owners[j+1]) {
2207: nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
2208: }
2209: }
2210: if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
2211: }
2212: nprocslocal = nprocs[rank];
2213: nprocs[rank] = procs[rank] = 0;
2214: nsends = 0; for (i=0; i<size; i++) { nsends += procs[i];}
2216: /* inform other processors of number of messages and max length*/
2217: ierr = PetscMalloc(2*size*sizeof(int),&work);
2218: ierr = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
2219: nmax = work[rank];
2220: nrecvs = work[size+rank];
2221: ierr = PetscFree(work);
2223: /* post receives: */
2224: PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
2225: PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
2226: for (i=0; i<nrecvs; i++) {
2227: MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2228: }
2230: /* do sends:
2231: 1) starts[i] gives the starting index in svalues for stuff going to
2232: the ith processor
2233: */
2234: PetscMalloc((nx+1)*sizeof(int),&svalues);
2235: PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
2236: PetscMalloc((size+1)*sizeof(int),&starts);
2237: starts[0] = 0;
2238: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2239: for (i=0; i<nx; i++) {
2240: if (owner[i] != rank) {
2241: svalues[starts[owner[i]]++] = inidy[i];
2242: }
2243: }
2245: starts[0] = 0;
2246: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2247: count = 0;
2248: for (i=0; i<size; i++) {
2249: if (procs[i]) {
2250: MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count);
2251: count++;
2252: }
2253: }
2254: PetscFree(starts);
2256: /* allocate entire send scatter context */
2257: PetscNew(VecScatter_MPI_General,&to);
2258: PetscMemzero(to,sizeof(VecScatter_MPI_General));
2259: PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2260: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
2261: len = ny*(sizeof(int) + sizeof(Scalar)) + (nsends+1)*sizeof(int) +
2262: nsends*(sizeof(int) + sizeof(MPI_Request));
2263: to->n = nsends;
2264: PetscMalloc(len,&to->values);
2265: PetscLogObjectMemory(ctx,len);
2266: to->requests = (MPI_Request*)(to->values + ny);
2267: to->indices = (int*)(to->requests + nsends);
2268: to->starts = (int*)(to->indices + ny);
2269: to->procs = (int*)(to->starts + nsends + 1);
2270: ierr = PetscMalloc(2*(PetscMax(nsends,nrecvs) + 1)*sizeof(MPI_Status),&to->sstatus); /* room for sstatus plus the rstatus block set below */
2271: to->rstatus = to->sstatus + PetscMax(nsends,nrecvs) + 1;
2272: PetscLogObjectMemory(ctx,2*(PetscMax(nsends,nrecvs) + 1)*sizeof(MPI_Status));
2273: ctx->todata = (void*)to;
2275: /* move data into send scatter context */
2276: PetscMalloc((size+nsends+1)*sizeof(int),&lowner);
2277: start = lowner + size;
2278: count = 0;
2279: to->starts[0] = start[0] = 0;
2280: for (i=0; i<size; i++) {
2281: if (procs[i]) {
2282: lowner[i] = count;
2283: to->procs[count++] = i;
2284: to->starts[count] = start[count] = start[count-1] + nprocs[i];
2285: }
2286: }
2287: for (i=0; i<nx; i++) {
2288: if (owner[i] != rank) {
2289: to->indices[start[lowner[owner[i]]]++] = inidx[i];
2290: }
2291: }
2292: PetscFree(lowner);
2293: PetscFree(owner);
2294: PetscFree(nprocs);
2296: base = owners[rank];
2298: /* wait on receives */
2299: ierr = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
2300: source = lens + nrecvs;
2301: count = nrecvs; slen = 0;
2302: while (count) {
2303: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2304: /* unpack receives into our local space */
2305: MPI_Get_count(&recv_status,MPI_INT,&n);
2306: source[imdex] = recv_status.MPI_SOURCE;
2307: lens[imdex] = n;
2308: slen += n;
2309: count--;
2310: }
2311: PetscFree(recv_waits);
2312:
2313: /* allocate entire receive scatter context */
2314: PetscNew(VecScatter_MPI_General,&from);
2315: PetscMemzero(from,sizeof(VecScatter_MPI_General));
2316: PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2317: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
2318: len = slen*(sizeof(int) + sizeof(Scalar)) + (nrecvs+1)*sizeof(int) +
2319: nrecvs*(sizeof(int) + sizeof(MPI_Request));
2320: from->n = nrecvs;
2321: PetscMalloc(len,&from->values);
2322: PetscLogObjectMemory(ctx,len);
2323: from->requests = (MPI_Request*)(from->values + slen);
2324: from->indices = (int*)(from->requests + nrecvs);
2325: from->starts = (int*)(from->indices + slen);
2326: from->procs = (int*)(from->starts + nrecvs + 1);
2327: ctx->fromdata = (void*)from;
2329: /* move the data into the receive scatter context*/
2330: from->starts[0] = 0;
2331: for (i=0; i<nrecvs; i++) {
2332: from->starts[i+1] = from->starts[i] + lens[i];
2333: from->procs[i] = source[i];
2334: values = rvalues + i*nmax;
2335: for (j=0; j<lens[i]; j++) {
2336: from->indices[from->starts[i] + j] = values[j] - base;
2337: }
2338: }
2339: PetscFree(rvalues);
2340: PetscFree(lens);
2341:
2342: /* wait on sends */
2343: if (nsends) {
2344: PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2345: MPI_Waitall(nsends,send_waits,send_status);
2346: PetscFree(send_status);
2347: }
2348: PetscFree(send_waits);
2349: PetscFree(svalues);
2351: if (nprocslocal) {
2352: int nt;
2353: /* we have a scatter to ourselves */
2354: from->local.n = to->local.n = nt = nprocslocal;
2355: PetscMalloc(nt*sizeof(int),&from->local.slots);
2356: PetscMalloc(nt*sizeof(int),&to->local.slots);
2357: PetscLogObjectMemory(ctx,2*nt*sizeof(int));
2358: nt = 0;
2359: for (i=0; i<ny; i++) {
2360: idx = inidy[i];
2361: if (idx >= owners[rank] && idx < owners[rank+1]) {
2362: from->local.slots[nt] = idx - owners[rank];
2363: to->local.slots[nt++] = inidx[i];
2364: }
2365: }
2366: } else {
2367: from->local.n = 0;
2368: from->local.slots = 0;
2369: to->local.n = 0;
2370: to->local.slots = 0;
2372: }
2373: from->local.nonmatching_computed = PETSC_FALSE;
2374: from->local.n_nonmatching = 0;
2375: from->local.slots_nonmatching = 0;
2376: to->local.nonmatching_computed = PETSC_FALSE;
2377: to->local.n_nonmatching = 0;
2378: to->local.slots_nonmatching = 0;
2380: to->type = VEC_SCATTER_MPI_GENERAL;
2381: from->type = VEC_SCATTER_MPI_GENERAL;
2383: ctx->destroy = VecScatterDestroy_PtoP;
2384: ctx->postrecvs = 0;
2385: ctx->begin = VecScatterBegin_PtoP;
2386: ctx->end = VecScatterEnd_PtoP;
2387: ctx->copy = 0;
2388: ctx->view = VecScatterView_MPI;
2390: to->bs = 1;
2391: from->bs = 1;
2392: return(0);
2393: }
2395: /* ---------------------------------------------------------------------------------*/
2396: int VecScatterCreate_PtoP(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,VecScatter ctx)
2397: {
2398: int *lens,rank,*owners = xin->map->range,size;
2399: int *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work,*local_inidx,*local_inidy;
2400: int *owner,*starts,count,tag,slen,ierr;
2401: int *rvalues,*svalues,base,imdex,nmax,*values;
2402: MPI_Comm comm;
2403: MPI_Request *send_waits,*recv_waits;
2404: MPI_Status recv_status;
2405: PetscTruth duplicate = PETSC_FALSE,found;
2408: PetscObjectGetNewTag((PetscObject)ctx,&tag);
2409: PetscObjectGetComm((PetscObject)xin,&comm);
2410: MPI_Comm_size(comm,&size);
2411: MPI_Comm_rank(comm,&rank);
2412: if (size == 1) {
2413: VecScatterCreate_StoP(nx,inidx,ny,inidy,yin,ctx);
2414: return(0);
2415: }
2417: /*
2418: Each processor ships off its inidx[j] and inidy[j] pairs to the processor that
2419: owns inidx[j]; each then calls VecScatterCreate_StoP() on what it received.
2420: */
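/*
   Tiny example of that exchange: with two processors owning entries [0,4)
   and [4,8) of xin, a request (inidx,inidy) = (2,5) generated on rank 1 is
   shipped to rank 0 as the pair {2,5}; rank 0 subtracts its base to get the
   local index 2 and feeds (2,5) into VecScatterCreate_StoP() at the end of
   this routine, which builds the actual communication pattern.
*/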
2421: /* first count number of contributors to each processor */
2422: ierr = PetscMalloc(2*size*sizeof(int),&nprocs);
2423: procs = nprocs + size;
2424: ierr = PetscMemzero(nprocs,2*size*sizeof(int));
2425: ierr = PetscMalloc((nx+1)*sizeof(int),&owner);
2426: for (i=0; i<nx; i++) {
2427: idx = inidx[i];
2428: found = PETSC_FALSE;
2429: for (j=0; j<size; j++) {
2430: if (idx >= owners[j] && idx < owners[j+1]) {
2431: nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
2432: }
2433: }
2434: if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
2435: }
2436: nsends = 0; for (i=0; i<size; i++) { nsends += procs[i];}
2438: /* inform other processors of number of messages and max length*/
2439: PetscMalloc(2*size*sizeof(int),&work);
2440: ierr = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
2441: nmax = work[rank];
2442: nrecvs = work[size+rank];
2443: ierr = PetscFree(work);
2445: /* post receives: */
2446: PetscMalloc(2*(nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
2447: PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
2448: for (i=0; i<nrecvs; i++) {
2449: MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2450: }
2452: /* do sends:
2453: 1) starts[i] gives the starting index in svalues for stuff going to
2454: the ith processor
2455: */
2456: PetscMalloc(2*(nx+1)*sizeof(int),&svalues);
2457: PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
2458: PetscMalloc((size+1)*sizeof(int),&starts);
2459: starts[0] = 0;
2460: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2461: for (i=0; i<nx; i++) {
2462: svalues[2*starts[owner[i]]] = inidx[i];
2463: svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2464: }
2465: PetscFree(owner);
2467: starts[0] = 0;
2468: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2469: count = 0;
2470: for (i=0; i<size; i++) {
2471: if (procs[i]) {
2472: MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPI_INT,i,tag,comm,send_waits+count);
2473: count++;
2474: }
2475: }
2476: PetscFree(starts);
2477: PetscFree(nprocs);
2479: base = owners[rank];
2481: /* wait on receives */
2482: PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
2483: count = nrecvs;
2484: slen = 0;
2485: while (count) {
2486: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2487: /* unpack receives into our local space */
2488: MPI_Get_count(&recv_status,MPI_INT,&n);
2489: lens[imdex] = n/2;
2490: slen += n/2;
2491: count--;
2492: }
2493: PetscFree(recv_waits);
2494:
2495: PetscMalloc(2*(slen+1)*sizeof(int),&local_inidx);
2496: local_inidy = local_inidx + slen;
2498: count = 0;
2499: for (i=0; i<nrecvs; i++) {
2500: values = rvalues + 2*i*nmax;
2501: for (j=0; j<lens[i]; j++) {
2502: local_inidx[count] = values[2*j] - base;
2503: local_inidy[count++] = values[2*j+1];
2504: }
2505: }
2506: PetscFree(rvalues);
2507: PetscFree(lens);
2509: /* wait on sends */
2510: if (nsends) {
2511: MPI_Status *send_status;
2512: PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2513: MPI_Waitall(nsends,send_waits,send_status);
2514: PetscFree(send_status);
2515: }
2516: PetscFree(send_waits);
2517: PetscFree(svalues);
2519: /*
2520: should sort and remove duplicates from local_inidx,local_inidy
2521: */
2523: #if defined(do_it_slow)
2524: /* sort on the from index */
2525: PetscSortIntWithArray(slen,local_inidx,local_inidy);
2526: start = 0;
2527: while (start < slen) {
2528: count = start+1;
2529: last = local_inidx[start];
2530: while (count < slen && last == local_inidx[count]) count++;
2531: if (count > start + 1) { /* found two or more identical local_inidx[] entries in a row */
2532: /* sort on to index */
2533: PetscSortInt(count-start,local_inidy+start);
2534: }
2535: /* remove duplicates; not the most efficient way, but probably good enough */
2536: i = start;
2537: while (i < count-1) {
2538: if (local_inidy[i] != local_inidy[i+1]) {
2539: i++;
2540: } else { /* found a duplicate */
2541: duplicate = PETSC_TRUE;
2542: for (j=i; j<slen-1; j++) {
2543: local_inidx[j] = local_inidx[j+1];
2544: local_inidy[j] = local_inidy[j+1];
2545: }
2546: slen--;
2547: count--;
2548: /* printf("found dup %d %d\n",local_inidx[i],local_inidy[i]);*/
2549: }
2550: }
2551: start = count;
2552: }
2553: #endif
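/*
   Note: unless do_it_slow is defined at compile time, the sort-and-unique
   block above is compiled out, duplicate stays PETSC_FALSE, and the log
   message below never fires; duplicate (from,to) pairs are then passed
   through unchanged to VecScatterCreate_StoP().
*/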
2554: if (duplicate) {
2555: PetscLogInfo(0,"VecScatterCreate_PtoP:Duplicate from-to indices passed in VecScatterCreate(); they are ignored\n");
2556: }
2557: VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,yin,ctx);
2558: PetscFree(local_inidx);
2559: return(0);
2560: }