Actual source code: vpscat.c
1: #define PETSCVEC_DLL
3: /*
4: Defines parallel vector scatters.
5: */
7: #include "include/private/isimpl.h"
8: #include "include/private/vecimpl.h"
9: #include "src/vec/vec/impls/dvecimpl.h"
10: #include "src/vec/vec/impls/mpi/pvecimpl.h"
11: #include "petscsys.h"
15: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
16: {
17: VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
18: VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
19: PetscErrorCode ierr;
20: PetscInt i;
21: PetscMPIInt rank;
22: PetscViewerFormat format;
23: PetscTruth iascii;
26: PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
27: if (iascii) {
28: MPI_Comm_rank(((PetscObject)ctx)->comm,&rank);
29: PetscViewerGetFormat(viewer,&format);
30: if (format == PETSC_VIEWER_ASCII_INFO) {
31: PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
33: MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
34: MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
35: itmp = to->starts[to->n];
36: MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
37: itmp = from->starts[from->n];
38: MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
39: MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,((PetscObject)ctx)->comm);
41: PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
42: PetscViewerASCIIPrintf(viewer," Maximum number sends %D\n",nsend_max);
43: PetscViewerASCIIPrintf(viewer," Maximum number receives %D\n",nrecv_max);
44: PetscViewerASCIIPrintf(viewer," Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));
45: PetscViewerASCIIPrintf(viewer," Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));
46: PetscViewerASCIIPrintf(viewer," Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));
48: } else {
49: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
50: if (to->n) {
51: for (i=0; i<to->n; i++){
52: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
53: }
54: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
55: for (i=0; i<to->starts[to->n]; i++){
56: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
57: }
58: }
60: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
61: if (from->n) {
62: for (i=0; i<from->n; i++){
63: PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
64: }
66: PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
67: for (i=0; i<from->starts[from->n]; i++){
68: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
69: }
70: }
71: if (to->local.n) {
72: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
73: for (i=0; i<to->local.n; i++){
74: PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
75: }
76: }
78: PetscViewerFlush(viewer);
79: }
80: } else {
81: SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
82: }
83: return(0);
84: }
86: /* -----------------------------------------------------------------------------------*/
87: /*
88: The next routine determines what part of the local part of the scatter is an
89: exact copy of values into their current location. We check this here and
90: then know that we need not perform that portion of the scatter when the vector is
91: scattering to itself with INSERT_VALUES.
93: This is currently not used but would speed up, for example, DALocalToLocalBegin/End().
95: */
98: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
99: {
100: PetscInt n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
102: PetscInt *nto_slots,*nfrom_slots,j = 0;
103:
105: for (i=0; i<n; i++) {
106: if (to_slots[i] != from_slots[i]) n_nonmatching++;
107: }
109: if (!n_nonmatching) {
110: to->nonmatching_computed = PETSC_TRUE;
111: to->n_nonmatching = from->n_nonmatching = 0;
112: PetscInfo1(scatter,"Reduced %D to 0\n", n);
113: } else if (n_nonmatching == n) {
114: to->nonmatching_computed = PETSC_FALSE;
115: PetscInfo(scatter,"All values non-matching\n");
116: } else {
117: to->nonmatching_computed= PETSC_TRUE;
118: to->n_nonmatching = from->n_nonmatching = n_nonmatching;
119: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
120: PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
121: to->slots_nonmatching = nto_slots;
122: from->slots_nonmatching = nfrom_slots;
123: for (i=0; i<n; i++) {
124: if (to_slots[i] != from_slots[i]) {
125: nto_slots[j] = to_slots[i];
126: nfrom_slots[j] = from_slots[i];
127: j++;
128: }
129: }
130: PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
131: }
132: return(0);
133: }
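/*
   Worked example (annotation, not part of the original vpscat.c): suppose the
   local part of the scatter has

     to->vslots   = {0, 1, 5, 3}
     from->vslots = {0, 1, 2, 3}

   Entries 0, 1 and 3 already sit in their final locations, so only one pair
   differs. The routine above records n_nonmatching = 1 and the compressed
   slot arrays slots_nonmatching = {5} (to) and {2} (from), reducing an
   INSERT_VALUES scatter-to-self from 4 copies to 1.
*/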
135: /* --------------------------------------------------------------------------------------*/
137: /* -------------------------------------------------------------------------------------*/
140: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
141: {
142: VecScatter_MPI_General *to = (VecScatter_MPI_General*)ctx->todata;
143: VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
144: PetscErrorCode ierr;
145: PetscInt i;
148: if (to->use_readyreceiver) {
149: /*
150: Since we have already posted sends we must cancel them before freeing
151: the requests
152: */
153: for (i=0; i<from->n; i++) {
154: MPI_Cancel(from->requests+i);
155: }
156: for (i=0; i<to->n; i++) {
157: MPI_Cancel(to->rev_requests+i);
158: }
159: }
161: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
162: if (to->use_alltoallw) {
163: PetscFree3(to->wcounts,to->wdispls,to->types);
164: PetscFree3(from->wcounts,from->wdispls,from->types);
165: }
166: #endif
168: #if defined(PETSC_HAVE_MPI_WINDOW)
169: if (to->use_window) {
170: MPI_Win_free(&from->window);
171: MPI_Win_free(&to->window);
172: }
173: #endif
175: if (to->use_alltoallv) {
176: PetscFree2(to->counts,to->displs);
177: PetscFree2(from->counts,from->displs);
178: }
180: /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
181: /*
182: IBM's PE version of MPI has a bug where freeing these requests
183: corrupts later message passing.
184: */
185: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
186: if (!to->use_alltoallv) { /* currently the to->requests etc are ALWAYS allocated even if not used */
187: if (to->requests) {
188: for (i=0; i<to->n; i++) {
189: MPI_Request_free(to->requests + i);
190: }
191: }
192: if (to->rev_requests) {
193: for (i=0; i<to->n; i++) {
194: MPI_Request_free(to->rev_requests + i);
195: }
196: }
197: }
198: /*
199: MPICH could not properly cancel requests; thus with ready receiver mode we
200: cannot free the requests. It may be fixed now; if not, put the following
201: code inside an if (!to->use_readyreceiver) { ... } block.
202: */
203: if (!to->use_alltoallv) { /* currently the from->requests etc are ALWAYS allocated even if not used */
204: if (from->requests) {
205: for (i=0; i<from->n; i++) {
206: MPI_Request_free(from->requests + i);
207: }
208: }
210: if (from->rev_requests) {
211: for (i=0; i<from->n; i++) {
212: MPI_Request_free(from->rev_requests + i);
213: }
214: }
215: }
216: #endif
218: PetscFree(to->local.vslots);
219: PetscFree(from->local.vslots);
220: PetscFree2(to->counts,to->displs);
221: PetscFree2(from->counts,from->displs);
222: PetscFree(to->local.slots_nonmatching);
223: PetscFree(from->local.slots_nonmatching);
224: PetscFree(to->rev_requests);
225: PetscFree(from->rev_requests);
226: PetscFree(to->requests);
227: PetscFree(from->requests);
228: PetscFree4(to->values,to->indices,to->starts,to->procs);
229: PetscFree2(to->sstatus,to->rstatus);
230: PetscFree4(from->values,from->indices,from->starts,from->procs);
231: PetscFree(from);
232: PetscFree(to);
233: PetscHeaderDestroy(ctx);
234: return(0);
235: }
239: /* --------------------------------------------------------------------------------------*/
240: /*
241: Special optimization to see if the local part of the scatter is actually
242: a contiguous copy; if so, the scatter routines call PetscMemcpy() instead.
243:
244: */
247: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
248: {
249: PetscInt n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
250: PetscInt to_start,from_start;
254: to_start = to_slots[0];
255: from_start = from_slots[0];
257: for (i=1; i<n; i++) {
258: to_start += bs;
259: from_start += bs;
260: if (to_slots[i] != to_start) return(0);
261: if (from_slots[i] != from_start) return(0);
262: }
263: to->is_copy = PETSC_TRUE;
264: to->copy_start = to_slots[0];
265: to->copy_length = bs*sizeof(PetscScalar)*n;
266: from->is_copy = PETSC_TRUE;
267: from->copy_start = from_slots[0];
268: from->copy_length = bs*sizeof(PetscScalar)*n;
269: PetscInfo(scatter,"Local scatter is a copy, optimizing for it\n");
270: return(0);
271: }
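/*
   Worked example (annotation, not part of the original vpscat.c): with bs = 2 and

     to->vslots   = {10, 12, 14}
     from->vslots = { 0,  2,  4}

   both index sets advance by exactly bs, so the whole local scatter is a single
   contiguous block move: copy_length = 3*2*sizeof(PetscScalar) bytes, starting
   at from entry 0 and landing at to entry 10, which the scatter kernels then
   perform with one PetscMemcpy().
*/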
273: /* --------------------------------------------------------------------------------------*/
277: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
278: {
279: VecScatter_MPI_General *in_to = (VecScatter_MPI_General*)in->todata;
280: VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
281: PetscErrorCode ierr;
282: PetscInt ny,bs = in_from->bs;
285: out->begin = in->begin;
286: out->end = in->end;
287: out->copy = in->copy;
288: out->destroy = in->destroy;
289: out->view = in->view;
291: /* allocate entire send scatter context */
292: PetscNewLog(out,VecScatter_MPI_General,&out_to);
293: PetscNewLog(out,VecScatter_MPI_General,&out_from);
295: ny = in_to->starts[in_to->n];
296: out_to->n = in_to->n;
297: out_to->type = in_to->type;
298: out_to->sendfirst = in_to->sendfirst;
300: PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
301: PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,
302: out_to->n,PetscMPIInt,&out_to->procs);
303: PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,
304: &out_to->rstatus);
305: PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
306: PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
307: PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
308:
309: out->todata = (void*)out_to;
310: out_to->local.n = in_to->local.n;
311: out_to->local.nonmatching_computed = PETSC_FALSE;
312: out_to->local.n_nonmatching = 0;
313: out_to->local.slots_nonmatching = 0;
314: if (in_to->local.n) {
315: PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
316: PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
317: PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
318: PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
319: } else {
320: out_to->local.vslots = 0;
321: out_from->local.vslots = 0;
322: }
324: /* allocate entire receive context */
325: out_from->type = in_from->type;
326: ny = in_from->starts[in_from->n];
327: out_from->n = in_from->n;
328: out_from->sendfirst = in_from->sendfirst;
330: PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
331: PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,
332: out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
333: PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
334: PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
335: PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
336: out->fromdata = (void*)out_from;
337: out_from->local.n = in_from->local.n;
338: out_from->local.nonmatching_computed = PETSC_FALSE;
339: out_from->local.n_nonmatching = 0;
340: out_from->local.slots_nonmatching = 0;
342: /*
343: set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
344: */
345: {
346: PetscMPIInt tag;
347: MPI_Comm comm;
348: PetscInt *sstarts = out_to->starts, *rstarts = out_from->starts;
349: PetscMPIInt *sprocs = out_to->procs, *rprocs = out_from->procs;
350: PetscInt i;
351: PetscTruth flg;
352: MPI_Request *swaits = out_to->requests,*rwaits = out_from->requests;
353: MPI_Request *rev_swaits,*rev_rwaits;
354: PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;
356: PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
357: PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);
359: rev_rwaits = out_to->rev_requests;
360: rev_swaits = out_from->rev_requests;
362: out_from->bs = out_to->bs = bs;
363: tag = ((PetscObject)out)->tag;
364: comm = ((PetscObject)out)->comm;
366: /* Register the receives that you will use later (sends for scatter reverse) */
367: for (i=0; i<out_from->n; i++) {
368: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
369: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
370: }
372: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&flg);
373: if (flg) {
374: out_to->use_readyreceiver = PETSC_TRUE;
375: out_from->use_readyreceiver = PETSC_TRUE;
376: for (i=0; i<out_to->n; i++) {
377: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
378: }
379: if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
380: MPI_Barrier(comm);
381: PetscInfo(in,"Using VecScatter ready receiver mode\n");
382: } else {
383: out_to->use_readyreceiver = PETSC_FALSE;
384: out_from->use_readyreceiver = PETSC_FALSE;
385: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
386: if (flg) {
387: PetscInfo(in,"Using VecScatter Ssend mode\n");
388: }
389: for (i=0; i<out_to->n; i++) {
390: if (!flg) {
391: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
392: } else {
393: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
394: }
395: }
396: }
397: /* Register receives for scatter reverse */
398: for (i=0; i<out_to->n; i++) {
399: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
400: }
401: }
403: return(0);
404: }
405: /* --------------------------------------------------------------------------------------------------
406: Packs the message data into send buffers and unpacks it from receive buffers.
408: These routines could be generated automatically.
410: Fortran kernels etc. could also be used.
411: */
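/*
   Generic form (illustrative sketch, not part of the original file; the name
   Pack_bs_sketch is hypothetical): every Pack_<bs> routine below is the
   hand-unrolled specialization of this loop for a fixed block size bs, with
   matching UnPack_<bs>/Scatter_<bs> variants for the InsertMode cases.
*/
PETSC_STATIC_INLINE void Pack_bs_sketch(PetscInt n,PetscInt bs,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i,k,idx;
  for (i=0; i<n; i++) {
    idx = *indicesx++;                      /* start of the i-th block in x */
    for (k=0; k<bs; k++) y[k] = x[idx+k];   /* copy one block */
    y += bs;                                /* send buffer is filled contiguously */
  }
}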
412: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
413: {
414: PetscInt i;
415: for (i=0; i<n; i++) {
416: y[i] = x[indicesx[i]];
417: }
418: }
419: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
420: {
421: PetscInt i;
422: switch (addv) {
423: case INSERT_VALUES:
424: for (i=0; i<n; i++) {
425: y[indicesy[i]] = x[i];
426: }
427: break;
428: case ADD_VALUES:
429: for (i=0; i<n; i++) {
430: y[indicesy[i]] += x[i];
431: }
432: break;
433: #if !defined(PETSC_USE_COMPLEX)
434: case MAX_VALUES:
435: for (i=0; i<n; i++) {
436: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
437: }
438: #else
439: case MAX_VALUES:
440: #endif
441: case NOT_SET_VALUES:
442: break;
443: }
444: }
446: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
447: {
448: PetscInt i;
449: switch (addv) {
450: case INSERT_VALUES:
451: for (i=0; i<n; i++) {
452: y[indicesy[i]] = x[indicesx[i]];
453: }
454: break;
455: case ADD_VALUES:
456: for (i=0; i<n; i++) {
457: y[indicesy[i]] += x[indicesx[i]];
458: }
459: break;
460: #if !defined(PETSC_USE_COMPLEX)
461: case MAX_VALUES:
462: for (i=0; i<n; i++) {
463: y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
464: }
465: #else
466: case MAX_VALUES:
467: #endif
468: case NOT_SET_VALUES:
469: break;
470: }
471: }
473: /* ----------------------------------------------------------------------------------------------- */
474: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
475: {
476: PetscInt i,idx;
478: for (i=0; i<n; i++) {
479: idx = *indicesx++;
480: y[0] = x[idx];
481: y[1] = x[idx+1];
482: y += 2;
483: }
484: }
485: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
486: {
487: PetscInt i,idy;
489: switch (addv) {
490: case INSERT_VALUES:
491: for (i=0; i<n; i++) {
492: idy = *indicesy++;
493: y[idy] = x[0];
494: y[idy+1] = x[1];
495: x += 2;
496: }
497: break;
498: case ADD_VALUES:
499: for (i=0; i<n; i++) {
500: idy = *indicesy++;
501: y[idy] += x[0];
502: y[idy+1] += x[1];
503: x += 2;
504: }
505: break;
506: #if !defined(PETSC_USE_COMPLEX)
507: case MAX_VALUES:
508: for (i=0; i<n; i++) {
509: idy = *indicesy++;
510: y[idy] = PetscMax(y[idy],x[0]);
511: y[idy+1] = PetscMax(y[idy+1],x[1]);
512: x += 2;
513: }
514: #else
515: case MAX_VALUES:
516: #endif
517: case NOT_SET_VALUES:
518: break;
519: }
520: }
522: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
523: {
524: PetscInt i,idx,idy;
526: switch (addv) {
527: case INSERT_VALUES:
528: for (i=0; i<n; i++) {
529: idx = *indicesx++;
530: idy = *indicesy++;
531: y[idy] = x[idx];
532: y[idy+1] = x[idx+1];
533: }
534: break;
535: case ADD_VALUES:
536: for (i=0; i<n; i++) {
537: idx = *indicesx++;
538: idy = *indicesy++;
539: y[idy] += x[idx];
540: y[idy+1] += x[idx+1];
541: }
542: break;
543: #if !defined(PETSC_USE_COMPLEX)
544: case MAX_VALUES:
545: for (i=0; i<n; i++) {
546: idx = *indicesx++;
547: idy = *indicesy++;
548: y[idy] = PetscMax(y[idy],x[idx]);
549: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
550: }
551: #else
552: case MAX_VALUES:
553: #endif
554: case NOT_SET_VALUES:
555: break;
556: }
557: }
558: /* ----------------------------------------------------------------------------------------------- */
559: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
560: {
561: PetscInt i,idx;
563: for (i=0; i<n; i++) {
564: idx = *indicesx++;
565: y[0] = x[idx];
566: y[1] = x[idx+1];
567: y[2] = x[idx+2];
568: y += 3;
569: }
570: }
571: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
572: {
573: PetscInt i,idy;
575: switch (addv) {
576: case INSERT_VALUES:
577: for (i=0; i<n; i++) {
578: idy = *indicesy++;
579: y[idy] = x[0];
580: y[idy+1] = x[1];
581: y[idy+2] = x[2];
582: x += 3;
583: }
584: break;
585: case ADD_VALUES:
586: for (i=0; i<n; i++) {
587: idy = *indicesy++;
588: y[idy] += x[0];
589: y[idy+1] += x[1];
590: y[idy+2] += x[2];
591: x += 3;
592: }
593: break;
594: #if !defined(PETSC_USE_COMPLEX)
595: case MAX_VALUES:
596: for (i=0; i<n; i++) {
597: idy = *indicesy++;
598: y[idy] = PetscMax(y[idy],x[0]);
599: y[idy+1] = PetscMax(y[idy+1],x[1]);
600: y[idy+2] = PetscMax(y[idy+2],x[2]);
601: x += 3;
602: }
603: #else
604: case MAX_VALUES:
605: #endif
606: case NOT_SET_VALUES:
607: break;
608: }
609: }
611: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
612: {
613: PetscInt i,idx,idy;
615: switch (addv) {
616: case INSERT_VALUES:
617: for (i=0; i<n; i++) {
618: idx = *indicesx++;
619: idy = *indicesy++;
620: y[idy] = x[idx];
621: y[idy+1] = x[idx+1];
622: y[idy+2] = x[idx+2];
623: }
624: break;
625: case ADD_VALUES:
626: for (i=0; i<n; i++) {
627: idx = *indicesx++;
628: idy = *indicesy++;
629: y[idy] += x[idx];
630: y[idy+1] += x[idx+1];
631: y[idy+2] += x[idx+2];
632: }
633: break;
634: #if !defined(PETSC_USE_COMPLEX)
635: case MAX_VALUES:
636: for (i=0; i<n; i++) {
637: idx = *indicesx++;
638: idy = *indicesy++;
639: y[idy] = PetscMax(y[idy],x[idx]);
640: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
641: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
642: }
643: #else
644: case MAX_VALUES:
645: #endif
646: case NOT_SET_VALUES:
647: break;
648: }
649: }
650: /* ----------------------------------------------------------------------------------------------- */
651: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
652: {
653: PetscInt i,idx;
655: for (i=0; i<n; i++) {
656: idx = *indicesx++;
657: y[0] = x[idx];
658: y[1] = x[idx+1];
659: y[2] = x[idx+2];
660: y[3] = x[idx+3];
661: y += 4;
662: }
663: }
664: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
665: {
666: PetscInt i,idy;
668: switch (addv) {
669: case INSERT_VALUES:
670: for (i=0; i<n; i++) {
671: idy = *indicesy++;
672: y[idy] = x[0];
673: y[idy+1] = x[1];
674: y[idy+2] = x[2];
675: y[idy+3] = x[3];
676: x += 4;
677: }
678: break;
679: case ADD_VALUES:
680: for (i=0; i<n; i++) {
681: idy = *indicesy++;
682: y[idy] += x[0];
683: y[idy+1] += x[1];
684: y[idy+2] += x[2];
685: y[idy+3] += x[3];
686: x += 4;
687: }
688: break;
689: #if !defined(PETSC_USE_COMPLEX)
690: case MAX_VALUES:
691: for (i=0; i<n; i++) {
692: idy = *indicesy++;
693: y[idy] = PetscMax(y[idy],x[0]);
694: y[idy+1] = PetscMax(y[idy+1],x[1]);
695: y[idy+2] = PetscMax(y[idy+2],x[2]);
696: y[idy+3] = PetscMax(y[idy+3],x[3]);
697: x += 4;
698: }
699: #else
700: case MAX_VALUES:
701: #endif
702: case NOT_SET_VALUES:
703: break;
704: }
705: }
707: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
708: {
709: PetscInt i,idx,idy;
711: switch (addv) {
712: case INSERT_VALUES:
713: for (i=0; i<n; i++) {
714: idx = *indicesx++;
715: idy = *indicesy++;
716: y[idy] = x[idx];
717: y[idy+1] = x[idx+1];
718: y[idy+2] = x[idx+2];
719: y[idy+3] = x[idx+3];
720: }
721: break;
722: case ADD_VALUES:
723: for (i=0; i<n; i++) {
724: idx = *indicesx++;
725: idy = *indicesy++;
726: y[idy] += x[idx];
727: y[idy+1] += x[idx+1];
728: y[idy+2] += x[idx+2];
729: y[idy+3] += x[idx+3];
730: }
731: break;
732: #if !defined(PETSC_USE_COMPLEX)
733: case MAX_VALUES:
734: for (i=0; i<n; i++) {
735: idx = *indicesx++;
736: idy = *indicesy++;
737: y[idy] = PetscMax(y[idy],x[idx]);
738: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
739: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
740: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
741: }
742: #else
743: case MAX_VALUES:
744: #endif
745: case NOT_SET_VALUES:
746: break;
747: }
748: }
749: /* ----------------------------------------------------------------------------------------------- */
750: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
751: {
752: PetscInt i,idx;
754: for (i=0; i<n; i++) {
755: idx = *indicesx++;
756: y[0] = x[idx];
757: y[1] = x[idx+1];
758: y[2] = x[idx+2];
759: y[3] = x[idx+3];
760: y[4] = x[idx+4];
761: y += 5;
762: }
763: }
764: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
765: {
766: PetscInt i,idy;
768: switch (addv) {
769: case INSERT_VALUES:
770: for (i=0; i<n; i++) {
771: idy = *indicesy++;
772: y[idy] = x[0];
773: y[idy+1] = x[1];
774: y[idy+2] = x[2];
775: y[idy+3] = x[3];
776: y[idy+4] = x[4];
777: x += 5;
778: }
779: break;
780: case ADD_VALUES:
781: for (i=0; i<n; i++) {
782: idy = *indicesy++;
783: y[idy] += x[0];
784: y[idy+1] += x[1];
785: y[idy+2] += x[2];
786: y[idy+3] += x[3];
787: y[idy+4] += x[4];
788: x += 5;
789: }
790: break;
791: #if !defined(PETSC_USE_COMPLEX)
792: case MAX_VALUES:
793: for (i=0; i<n; i++) {
794: idy = *indicesy++;
795: y[idy] = PetscMax(y[idy],x[0]);
796: y[idy+1] = PetscMax(y[idy+1],x[1]);
797: y[idy+2] = PetscMax(y[idy+2],x[2]);
798: y[idy+3] = PetscMax(y[idy+3],x[3]);
799: y[idy+4] = PetscMax(y[idy+4],x[4]);
800: x += 5;
801: }
802: #else
803: case MAX_VALUES:
804: #endif
805: case NOT_SET_VALUES:
806: break;
807: }
808: }
810: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
811: {
812: PetscInt i,idx,idy;
814: switch (addv) {
815: case INSERT_VALUES:
816: for (i=0; i<n; i++) {
817: idx = *indicesx++;
818: idy = *indicesy++;
819: y[idy] = x[idx];
820: y[idy+1] = x[idx+1];
821: y[idy+2] = x[idx+2];
822: y[idy+3] = x[idx+3];
823: y[idy+4] = x[idx+4];
824: }
825: break;
826: case ADD_VALUES:
827: for (i=0; i<n; i++) {
828: idx = *indicesx++;
829: idy = *indicesy++;
830: y[idy] += x[idx];
831: y[idy+1] += x[idx+1];
832: y[idy+2] += x[idx+2];
833: y[idy+3] += x[idx+3];
834: y[idy+4] += x[idx+4];
835: }
836: break;
837: #if !defined(PETSC_USE_COMPLEX)
838: case MAX_VALUES:
839: for (i=0; i<n; i++) {
840: idx = *indicesx++;
841: idy = *indicesy++;
842: y[idy] = PetscMax(y[idy],x[idx]);
843: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
844: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
845: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
846: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
847: }
848: #else
849: case MAX_VALUES:
850: #endif
851: case NOT_SET_VALUES:
852: break;
853: }
854: }
855: /* ----------------------------------------------------------------------------------------------- */
856: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
857: {
858: PetscInt i,idx;
860: for (i=0; i<n; i++) {
861: idx = *indicesx++;
862: y[0] = x[idx];
863: y[1] = x[idx+1];
864: y[2] = x[idx+2];
865: y[3] = x[idx+3];
866: y[4] = x[idx+4];
867: y[5] = x[idx+5];
868: y += 6;
869: }
870: }
871: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
872: {
873: PetscInt i,idy;
875: switch (addv) {
876: case INSERT_VALUES:
877: for (i=0; i<n; i++) {
878: idy = *indicesy++;
879: y[idy] = x[0];
880: y[idy+1] = x[1];
881: y[idy+2] = x[2];
882: y[idy+3] = x[3];
883: y[idy+4] = x[4];
884: y[idy+5] = x[5];
885: x += 6;
886: }
887: break;
888: case ADD_VALUES:
889: for (i=0; i<n; i++) {
890: idy = *indicesy++;
891: y[idy] += x[0];
892: y[idy+1] += x[1];
893: y[idy+2] += x[2];
894: y[idy+3] += x[3];
895: y[idy+4] += x[4];
896: y[idy+5] += x[5];
897: x += 6;
898: }
899: break;
900: #if !defined(PETSC_USE_COMPLEX)
901: case MAX_VALUES:
902: for (i=0; i<n; i++) {
903: idy = *indicesy++;
904: y[idy] = PetscMax(y[idy],x[0]);
905: y[idy+1] = PetscMax(y[idy+1],x[1]);
906: y[idy+2] = PetscMax(y[idy+2],x[2]);
907: y[idy+3] = PetscMax(y[idy+3],x[3]);
908: y[idy+4] = PetscMax(y[idy+4],x[4]);
909: y[idy+5] = PetscMax(y[idy+5],x[5]);
910: x += 6;
911: }
912: #else
913: case MAX_VALUES:
914: #endif
915: case NOT_SET_VALUES:
916: break;
917: }
918: }
920: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
921: {
922: PetscInt i,idx,idy;
924: switch (addv) {
925: case INSERT_VALUES:
926: for (i=0; i<n; i++) {
927: idx = *indicesx++;
928: idy = *indicesy++;
929: y[idy] = x[idx];
930: y[idy+1] = x[idx+1];
931: y[idy+2] = x[idx+2];
932: y[idy+3] = x[idx+3];
933: y[idy+4] = x[idx+4];
934: y[idy+5] = x[idx+5];
935: }
936: break;
937: case ADD_VALUES:
938: for (i=0; i<n; i++) {
939: idx = *indicesx++;
940: idy = *indicesy++;
941: y[idy] += x[idx];
942: y[idy+1] += x[idx+1];
943: y[idy+2] += x[idx+2];
944: y[idy+3] += x[idx+3];
945: y[idy+4] += x[idx+4];
946: y[idy+5] += x[idx+5];
947: }
948: break;
949: #if !defined(PETSC_USE_COMPLEX)
950: case MAX_VALUES:
951: for (i=0; i<n; i++) {
952: idx = *indicesx++;
953: idy = *indicesy++;
954: y[idy] = PetscMax(y[idy],x[idx]);
955: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
956: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
957: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
958: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
959: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
960: }
961: #else
962: case MAX_VALUES:
963: #endif
964: case NOT_SET_VALUES:
965: break;
966: }
967: }
968: /* ----------------------------------------------------------------------------------------------- */
969: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
970: {
971: PetscInt i,idx;
973: for (i=0; i<n; i++) {
974: idx = *indicesx++;
975: y[0] = x[idx];
976: y[1] = x[idx+1];
977: y[2] = x[idx+2];
978: y[3] = x[idx+3];
979: y[4] = x[idx+4];
980: y[5] = x[idx+5];
981: y[6] = x[idx+6];
982: y += 7;
983: }
984: }
985: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
986: {
987: PetscInt i,idy;
989: switch (addv) {
990: case INSERT_VALUES:
991: for (i=0; i<n; i++) {
992: idy = *indicesy++;
993: y[idy] = x[0];
994: y[idy+1] = x[1];
995: y[idy+2] = x[2];
996: y[idy+3] = x[3];
997: y[idy+4] = x[4];
998: y[idy+5] = x[5];
999: y[idy+6] = x[6];
1000: x += 7;
1001: }
1002: break;
1003: case ADD_VALUES:
1004: for (i=0; i<n; i++) {
1005: idy = *indicesy++;
1006: y[idy] += x[0];
1007: y[idy+1] += x[1];
1008: y[idy+2] += x[2];
1009: y[idy+3] += x[3];
1010: y[idy+4] += x[4];
1011: y[idy+5] += x[5];
1012: y[idy+6] += x[6];
1013: x += 7;
1014: }
1015: break;
1016: #if !defined(PETSC_USE_COMPLEX)
1017: case MAX_VALUES:
1018: for (i=0; i<n; i++) {
1019: idy = *indicesy++;
1020: y[idy] = PetscMax(y[idy],x[0]);
1021: y[idy+1] = PetscMax(y[idy+1],x[1]);
1022: y[idy+2] = PetscMax(y[idy+2],x[2]);
1023: y[idy+3] = PetscMax(y[idy+3],x[3]);
1024: y[idy+4] = PetscMax(y[idy+4],x[4]);
1025: y[idy+5] = PetscMax(y[idy+5],x[5]);
1026: y[idy+6] = PetscMax(y[idy+6],x[6]);
1027: x += 7;
1028: }
1029: #else
1030: case MAX_VALUES:
1031: #endif
1032: case NOT_SET_VALUES:
1033: break;
1034: }
1035: }
1037: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1038: {
1039: PetscInt i,idx,idy;
1041: switch (addv) {
1042: case INSERT_VALUES:
1043: for (i=0; i<n; i++) {
1044: idx = *indicesx++;
1045: idy = *indicesy++;
1046: y[idy] = x[idx];
1047: y[idy+1] = x[idx+1];
1048: y[idy+2] = x[idx+2];
1049: y[idy+3] = x[idx+3];
1050: y[idy+4] = x[idx+4];
1051: y[idy+5] = x[idx+5];
1052: y[idy+6] = x[idx+6];
1053: }
1054: break;
1055: case ADD_VALUES:
1056: for (i=0; i<n; i++) {
1057: idx = *indicesx++;
1058: idy = *indicesy++;
1059: y[idy] += x[idx];
1060: y[idy+1] += x[idx+1];
1061: y[idy+2] += x[idx+2];
1062: y[idy+3] += x[idx+3];
1063: y[idy+4] += x[idx+4];
1064: y[idy+5] += x[idx+5];
1065: y[idy+6] += x[idx+6];
1066: }
1067: break;
1068: #if !defined(PETSC_USE_COMPLEX)
1069: case MAX_VALUES:
1070: for (i=0; i<n; i++) {
1071: idx = *indicesx++;
1072: idy = *indicesy++;
1073: y[idy] = PetscMax(y[idy],x[idx]);
1074: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1075: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1076: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1077: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1078: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1079: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1080: }
1081: #else
1082: case MAX_VALUES:
1083: #endif
1084: case NOT_SET_VALUES:
1085: break;
1086: }
1087: }
1088: /* ----------------------------------------------------------------------------------------------- */
1089: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1090: {
1091: PetscInt i,idx;
1093: for (i=0; i<n; i++) {
1094: idx = *indicesx++;
1095: y[0] = x[idx];
1096: y[1] = x[idx+1];
1097: y[2] = x[idx+2];
1098: y[3] = x[idx+3];
1099: y[4] = x[idx+4];
1100: y[5] = x[idx+5];
1101: y[6] = x[idx+6];
1102: y[7] = x[idx+7];
1103: y += 8;
1104: }
1105: }
1106: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1107: {
1108: PetscInt i,idy;
1110: switch (addv) {
1111: case INSERT_VALUES:
1112: for (i=0; i<n; i++) {
1113: idy = *indicesy++;
1114: y[idy] = x[0];
1115: y[idy+1] = x[1];
1116: y[idy+2] = x[2];
1117: y[idy+3] = x[3];
1118: y[idy+4] = x[4];
1119: y[idy+5] = x[5];
1120: y[idy+6] = x[6];
1121: y[idy+7] = x[7];
1122: x += 8;
1123: }
1124: break;
1125: case ADD_VALUES:
1126: for (i=0; i<n; i++) {
1127: idy = *indicesy++;
1128: y[idy] += x[0];
1129: y[idy+1] += x[1];
1130: y[idy+2] += x[2];
1131: y[idy+3] += x[3];
1132: y[idy+4] += x[4];
1133: y[idy+5] += x[5];
1134: y[idy+6] += x[6];
1135: y[idy+7] += x[7];
1136: x += 8;
1137: }
1138: break;
1139: #if !defined(PETSC_USE_COMPLEX)
1140: case MAX_VALUES:
1141: for (i=0; i<n; i++) {
1142: idy = *indicesy++;
1143: y[idy] = PetscMax(y[idy],x[0]);
1144: y[idy+1] = PetscMax(y[idy+1],x[1]);
1145: y[idy+2] = PetscMax(y[idy+2],x[2]);
1146: y[idy+3] = PetscMax(y[idy+3],x[3]);
1147: y[idy+4] = PetscMax(y[idy+4],x[4]);
1148: y[idy+5] = PetscMax(y[idy+5],x[5]);
1149: y[idy+6] = PetscMax(y[idy+6],x[6]);
1150: y[idy+7] = PetscMax(y[idy+7],x[7]);
1151: x += 8;
1152: }
1153: #else
1154: case MAX_VALUES:
1155: #endif
1156: case NOT_SET_VALUES:
1157: break;
1158: }
1159: }
1161: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1162: {
1163: PetscInt i,idx,idy;
1165: switch (addv) {
1166: case INSERT_VALUES:
1167: for (i=0; i<n; i++) {
1168: idx = *indicesx++;
1169: idy = *indicesy++;
1170: y[idy] = x[idx];
1171: y[idy+1] = x[idx+1];
1172: y[idy+2] = x[idx+2];
1173: y[idy+3] = x[idx+3];
1174: y[idy+4] = x[idx+4];
1175: y[idy+5] = x[idx+5];
1176: y[idy+6] = x[idx+6];
1177: y[idy+7] = x[idx+7];
1178: }
1179: break;
1180: case ADD_VALUES:
1181: for (i=0; i<n; i++) {
1182: idx = *indicesx++;
1183: idy = *indicesy++;
1184: y[idy] += x[idx];
1185: y[idy+1] += x[idx+1];
1186: y[idy+2] += x[idx+2];
1187: y[idy+3] += x[idx+3];
1188: y[idy+4] += x[idx+4];
1189: y[idy+5] += x[idx+5];
1190: y[idy+6] += x[idx+6];
1191: y[idy+7] += x[idx+7];
1192: }
1193: break;
1194: #if !defined(PETSC_USE_COMPLEX)
1195: case MAX_VALUES:
1196: for (i=0; i<n; i++) {
1197: idx = *indicesx++;
1198: idy = *indicesy++;
1199: y[idy] = PetscMax(y[idy],x[idx]);
1200: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1201: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1202: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1203: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1204: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1205: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1206: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1207: }
1208: #else
1209: case MAX_VALUES:
1210: #endif
1211: case NOT_SET_VALUES:
1212: break;
1213: }
1214: }
1216: /* ----------------------------------------------------------------------------------------------- */
1217: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1218: {
1219: PetscInt i,idx;
1221: for (i=0; i<n; i++) {
1222: idx = *indicesx++;
1223: y[0] = x[idx];
1224: y[1] = x[idx+1];
1225: y[2] = x[idx+2];
1226: y[3] = x[idx+3];
1227: y[4] = x[idx+4];
1228: y[5] = x[idx+5];
1229: y[6] = x[idx+6];
1230: y[7] = x[idx+7];
1231: y[8] = x[idx+8];
1232: y[9] = x[idx+9];
1233: y[10] = x[idx+10];
1234: y[11] = x[idx+11];
1235: y += 12;
1236: }
1237: }
1238: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1239: {
1240: PetscInt i,idy;
1242: switch (addv) {
1243: case INSERT_VALUES:
1244: for (i=0; i<n; i++) {
1245: idy = *indicesy++;
1246: y[idy] = x[0];
1247: y[idy+1] = x[1];
1248: y[idy+2] = x[2];
1249: y[idy+3] = x[3];
1250: y[idy+4] = x[4];
1251: y[idy+5] = x[5];
1252: y[idy+6] = x[6];
1253: y[idy+7] = x[7];
1254: y[idy+8] = x[8];
1255: y[idy+9] = x[9];
1256: y[idy+10] = x[10];
1257: y[idy+11] = x[11];
1258: x += 12;
1259: }
1260: break;
1261: case ADD_VALUES:
1262: for (i=0; i<n; i++) {
1263: idy = *indicesy++;
1264: y[idy] += x[0];
1265: y[idy+1] += x[1];
1266: y[idy+2] += x[2];
1267: y[idy+3] += x[3];
1268: y[idy+4] += x[4];
1269: y[idy+5] += x[5];
1270: y[idy+6] += x[6];
1271: y[idy+7] += x[7];
1272: y[idy+8] += x[8];
1273: y[idy+9] += x[9];
1274: y[idy+10] += x[10];
1275: y[idy+11] += x[11];
1276: x += 12;
1277: }
1278: break;
1279: #if !defined(PETSC_USE_COMPLEX)
1280: case MAX_VALUES:
1281: for (i=0; i<n; i++) {
1282: idy = *indicesy++;
1283: y[idy] = PetscMax(y[idy],x[0]);
1284: y[idy+1] = PetscMax(y[idy+1],x[1]);
1285: y[idy+2] = PetscMax(y[idy+2],x[2]);
1286: y[idy+3] = PetscMax(y[idy+3],x[3]);
1287: y[idy+4] = PetscMax(y[idy+4],x[4]);
1288: y[idy+5] = PetscMax(y[idy+5],x[5]);
1289: y[idy+6] = PetscMax(y[idy+6],x[6]);
1290: y[idy+7] = PetscMax(y[idy+7],x[7]);
1291: y[idy+8] = PetscMax(y[idy+8],x[8]);
1292: y[idy+9] = PetscMax(y[idy+9],x[9]);
1293: y[idy+10] = PetscMax(y[idy+10],x[10]);
1294: y[idy+11] = PetscMax(y[idy+11],x[11]);
1295: x += 12;
1296: }
1297: #else
1298: case MAX_VALUES:
1299: #endif
1300: case NOT_SET_VALUES:
1301: break;
1302: }
1303: }
1305: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1306: {
1307: PetscInt i,idx,idy;
1309: switch (addv) {
1310: case INSERT_VALUES:
1311: for (i=0; i<n; i++) {
1312: idx = *indicesx++;
1313: idy = *indicesy++;
1314: y[idy] = x[idx];
1315: y[idy+1] = x[idx+1];
1316: y[idy+2] = x[idx+2];
1317: y[idy+3] = x[idx+3];
1318: y[idy+4] = x[idx+4];
1319: y[idy+5] = x[idx+5];
1320: y[idy+6] = x[idx+6];
1321: y[idy+7] = x[idx+7];
1322: y[idy+8] = x[idx+8];
1323: y[idy+9] = x[idx+9];
1324: y[idy+10] = x[idx+10];
1325: y[idy+11] = x[idx+11];
1326: }
1327: break;
1328: case ADD_VALUES:
1329: for (i=0; i<n; i++) {
1330: idx = *indicesx++;
1331: idy = *indicesy++;
1332: y[idy] += x[idx];
1333: y[idy+1] += x[idx+1];
1334: y[idy+2] += x[idx+2];
1335: y[idy+3] += x[idx+3];
1336: y[idy+4] += x[idx+4];
1337: y[idy+5] += x[idx+5];
1338: y[idy+6] += x[idx+6];
1339: y[idy+7] += x[idx+7];
1340: y[idy+8] += x[idx+8];
1341: y[idy+9] += x[idx+9];
1342: y[idy+10] += x[idx+10];
1343: y[idy+11] += x[idx+11];
1344: }
1345: break;
1346: #if !defined(PETSC_USE_COMPLEX)
1347: case MAX_VALUES:
1348: for (i=0; i<n; i++) {
1349: idx = *indicesx++;
1350: idy = *indicesy++;
1351: y[idy] = PetscMax(y[idy],x[idx]);
1352: y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1353: y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1354: y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1355: y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1356: y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1357: y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1358: y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1359: y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
1360: y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
1361: y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1362: y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1363: }
1364: #else
1365: case MAX_VALUES:
1366: #endif
1367: case NOT_SET_VALUES:
1368: break;
1369: }
1370: }
1372: /* Create the VecScatterBegin/End_P for our chosen block sizes */
1373: #define BS 1
1374: #include "src/vec/vec/utils/vpscat.h"
1375: #define BS 2
1376: #include "src/vec/vec/utils/vpscat.h"
1377: #define BS 3
1378: #include "src/vec/vec/utils/vpscat.h"
1379: #define BS 4
1380: #include "src/vec/vec/utils/vpscat.h"
1381: #define BS 5
1382: #include "src/vec/vec/utils/vpscat.h"
1383: #define BS 6
1384: #include "src/vec/vec/utils/vpscat.h"
1385: #define BS 7
1386: #include "src/vec/vec/utils/vpscat.h"
1387: #define BS 8
1388: #include "src/vec/vec/utils/vpscat.h"
1389: #define BS 12
1390: #include "src/vec/vec/utils/vpscat.h"
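/*
   Annotation (not part of the original source): each #include of vpscat.h above
   acts as a "poor man's template". With BS defined, the header is expected to
   generate VecScatterBegin_<BS> and VecScatterEnd_<BS> built on the
   Pack_<BS>/UnPack_<BS>/Scatter_<BS> kernels defined earlier; those are the
   routines installed in the switch (bs) at the end of VecScatterCreateCommon_PtoS().
   A minimal sketch of the token-pasting idea, with hypothetical macro names:

     #define PASTE_(a,b) a##_##b
     #define PASTE(a,b)  PASTE_(a,b)
     PetscErrorCode PASTE(VecScatterBegin,BS)(...);
*/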
1392: /* ==========================================================================================*/
1394: /* create parallel to sequential scatter context */
1396: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);
1400: /*@
1401: VecScatterCreateLocal - Creates a VecScatter from a list of messages it must send and receive.
1403: Collective on VecScatter
1405: Input Parameters:
1406: + ctx - VecScatter object obtained with VecScatterCreateEmpty()
1407: . nsends - number of messages this process will send
1408: . sendSizes - number of entries in each message sent
1409: . sendProcs - ranks of the processes the messages are sent to
1410: . sendIdx - indices where the sent entries are obtained from (in local, on process numbering), this is one long array of size \sum_{i=0,i<nsends} sendSizes[i]
1411: . nrecvs - number of receives to expect
1412: . recvSizes - number of entries in each message received
1413: . recvProcs - processes who are sending to me
1414: . recvIdx - indices of where received entries are to be put, (in local, on process numbering), this is one long array of size \sum_{i=0,i<nrecvs} recvSizes[i]
1415: - bs - size of block
1417: Notes: sendSizes[] and recvSizes[] cannot have any 0 entries. If you want to support having 0 entries you need
1418: to change the code below to "compress out" the sendProcs[] and recvProcs[] entries that have 0 entries.
1420: Probably does not handle sends to self properly; those should be removed from the counts that are used
1421: in allocating space inside of the from struct.
1423: Level: intermediate
1425: @*/
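/*
   Usage sketch (annotation, not part of the original source; all sizes, ranks
   and indices below are made up for illustration): two messages sent, one
   received, block size 1.

     PetscInt sendSizes[2] = {2,1},  sendProcs[2] = {1,2};
     PetscInt sendIdx[3]   = {0,4,7};                  one long array of 2+1 local indices
     PetscInt recvSizes[1] = {3},    recvProcs[1] = {3};
     PetscInt recvIdx[3]   = {1,2,5};                  where the received entries are placed
     ierr = VecScatterCreateLocal(ctx,2,sendSizes,sendProcs,sendIdx,
                                  1,recvSizes,recvProcs,recvIdx,1);CHKERRQ(ierr);
*/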
1426: PetscErrorCode VecScatterCreateLocal(VecScatter ctx,PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs)
1427: {
1428: VecScatter_MPI_General *from, *to;
1429: PetscInt sendSize, recvSize;
1430: PetscInt n, i;
1431: PetscErrorCode ierr;
1433: /* allocate entire send scatter context */
1434: PetscNewLog(ctx,VecScatter_MPI_General,&to);
1435: to->n = nsends;
1436: for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1437: PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1438: PetscMalloc4(bs*sendSize,PetscScalar,&to->values,sendSize,PetscInt,&to->indices,to->n+1,PetscInt,&to->starts,to->n,PetscMPIInt,&to->procs);
1439: PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1440: to->starts[0] = 0;
1441: for(n = 0; n < to->n; n++) {
1442: if (sendSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1443: to->starts[n+1] = to->starts[n] + sendSizes[n];
1444: to->procs[n] = sendProcs[n];
1445: for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1446: to->indices[i] = sendIdx[i];
1447: }
1448: }
1449: ctx->todata = (void *) to;
1451: /* allocate entire receive scatter context */
1452: PetscNewLog(ctx,VecScatter_MPI_General,&from);
1453: from->n = nrecvs;
1454: for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1455: PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1456: PetscMalloc4(bs*recvSize,PetscScalar,&from->values,recvSize,PetscInt,&from->indices,from->n+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1457: from->starts[0] = 0;
1458: for(n = 0; n < from->n; n++) {
1459: if (recvSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1460: from->starts[n+1] = from->starts[n] + recvSizes[n];
1461: from->procs[n] = recvProcs[n];
1462: for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1463: from->indices[i] = recvIdx[i];
1464: }
1465: }
1466: ctx->fromdata = (void *)from;
1468: /* No local scatter optimization */
1469: from->local.n = 0;
1470: from->local.vslots = 0;
1471: to->local.n = 0;
1472: to->local.vslots = 0;
1473: from->local.nonmatching_computed = PETSC_FALSE;
1474: from->local.n_nonmatching = 0;
1475: from->local.slots_nonmatching = 0;
1476: to->local.nonmatching_computed = PETSC_FALSE;
1477: to->local.n_nonmatching = 0;
1478: to->local.slots_nonmatching = 0;
1480: from->type = VEC_SCATTER_MPI_GENERAL;
1481: to->type = VEC_SCATTER_MPI_GENERAL;
1482: from->bs = bs;
1483: to->bs = bs;
1485: VecScatterCreateCommon_PtoS(from, to, ctx);
1486: return(0);
1487: }
1489: /*
1490: bs indicates how many elements there are in each block. Normally this would be 1.
1491: */
1494: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1495: {
1496: VecScatter_MPI_General *from,*to;
1497: PetscMPIInt size,rank,imdex,tag,n;
1498: PetscInt *source = PETSC_NULL,*lens = PETSC_NULL,*owners = PETSC_NULL;
1499: PetscInt *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1500: PetscInt *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs;
1501: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1502: PetscInt *rvalues,*svalues,base,nmax,*values,*indx,nprocslocal;
1503: MPI_Comm comm;
1504: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1505: MPI_Status recv_status,*send_status;
1506: PetscErrorCode ierr;
1509: PetscObjectGetNewTag((PetscObject)ctx,&tag);
1510: PetscObjectGetComm((PetscObject)xin,&comm);
1511: MPI_Comm_rank(comm,&rank);
1512: MPI_Comm_size(comm,&size);
1513: owners = xin->map.range;
1514: VecGetSize(yin,&lengthy);
1515: VecGetSize(xin,&lengthx);
1517: /* first count number of contributors to each processor */
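  /* Annotation: nprocs[] is used in interleaved pairs: nprocs[2*j] counts the entries owned by
     process j and nprocs[2*j+1] is a 0/1 flag marking whether a message to j is needed at all */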
1518: PetscMalloc2(2*size,PetscInt,&nprocs,nx,PetscInt,&owner);
1519: PetscMemzero(nprocs,2*size*sizeof(PetscInt));
1520: j = 0;
1521: nsends = 0;
1522: for (i=0; i<nx; i++) {
1523: idx = inidx[i];
1524: if (idx < owners[j]) j = 0;
1525: for (; j<size; j++) {
1526: if (idx < owners[j+1]) {
1527: if (!nprocs[2*j]++) nsends++;
1528: nprocs[2*j+1] = 1;
1529: owner[i] = j;
1530: break;
1531: }
1532: }
1533: }
1534: nprocslocal = nprocs[2*rank];
1535: nprocs[2*rank] = nprocs[2*rank+1] = 0;
1536: if (nprocslocal) nsends--;
1538: /* inform other processors of number of messages and max length*/
1539: PetscMaxSum(comm,nprocs,&nmax,&nrecvs);
1541: /* post receives: */
1542: PetscMalloc4(nrecvs*nmax,PetscInt,&rvalues,nrecvs,PetscInt,&lens,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1543: for (i=0; i<nrecvs; i++) {
1544: MPI_Irecv((rvalues+nmax*i),nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1545: }
1547: /* do sends:
1548: 1) starts[i] gives the starting index in svalues for stuff going to
1549: the ith processor
1550: */
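  /* Illustration (annotation): with size = 3 and off-process counts {2,0,3}, starts = {0,2,2},
     so indices bound for process 0 land in svalues[0..1] and those for process 2 in svalues[2..4] */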
1551: PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1552: starts[0] = 0;
1553: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1554: for (i=0; i<nx; i++) {
1555: if (owner[i] != rank) {
1556: svalues[starts[owner[i]]++] = inidx[i];
1557: }
1558: }
1560: starts[0] = 0;
1561: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1562: count = 0;
1563: for (i=0; i<size; i++) {
1564: if (nprocs[2*i+1]) {
1565: MPI_Isend(svalues+starts[i],nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count++);
1566: }
1567: }
1569: /* wait on receives */
1570: count = nrecvs;
1571: slen = 0;
1572: while (count) {
1573: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1574: /* unpack receives into our local space */
1575: MPI_Get_count(&recv_status,MPIU_INT,&n);
1576: source[imdex] = recv_status.MPI_SOURCE;
1577: lens[imdex] = n;
1578: slen += n;
1579: count--;
1580: }
1581:
1582: /* allocate entire send scatter context */
1583: PetscNewLog(ctx,VecScatter_MPI_General,&to);
1584: to->n = nrecvs;
1585: PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1586: PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,
1587: nrecvs,PetscMPIInt,&to->procs);
1588: PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,
1589: &to->rstatus);
1590: ctx->todata = (void*)to;
1591: to->starts[0] = 0;
1593: if (nrecvs) {
1594: PetscMalloc(nrecvs*sizeof(PetscInt),&indx);
1595: for (i=0; i<nrecvs; i++) indx[i] = i;
1596: PetscSortIntWithPermutation(nrecvs,source,indx);
1598: /* move the data into the send scatter */
1599: base = owners[rank];
1600: for (i=0; i<nrecvs; i++) {
1601: to->starts[i+1] = to->starts[i] + lens[indx[i]];
1602: to->procs[i] = source[indx[i]];
1603: values = rvalues + indx[i]*nmax;
1604: for (j=0; j<lens[indx[i]]; j++) {
1605: to->indices[to->starts[i] + j] = values[j] - base;
1606: }
1607: }
1608: PetscFree(indx);
1609: }
1610: PetscFree4(rvalues,lens,source,recv_waits);
1611:
1612: /* allocate entire receive scatter context */
1613: PetscNewLog(ctx,VecScatter_MPI_General,&from);
1614: from->n = nsends;
1616: PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1617: PetscMalloc4((ny-nprocslocal)*bs,PetscScalar,&from->values,ny-nprocslocal,PetscInt,&from->indices,
1618: nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1619: ctx->fromdata = (void*)from;
1621: /* move data into receive scatter */
1622: PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1623: count = 0; from->starts[0] = start[0] = 0;
1624: for (i=0; i<size; i++) {
1625: if (nprocs[2*i+1]) {
1626: lowner[i] = count;
1627: from->procs[count++] = i;
1628: from->starts[count] = start[count] = start[count-1] + nprocs[2*i];
1629: }
1630: }
1631: for (i=0; i<nx; i++) {
1632: if (owner[i] != rank) {
1633: from->indices[start[lowner[owner[i]]]++] = inidy[i];
1634: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1635: }
1636: }
1637: PetscFree2(lowner,start);
1638: PetscFree2(nprocs,owner);
1639:
1640: /* wait on sends */
1641: if (nsends) {
1642: PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1643: MPI_Waitall(nsends,send_waits,send_status);
1644: PetscFree(send_status);
1645: }
1646: PetscFree3(svalues,send_waits,starts);
1648: if (nprocslocal) {
1649: PetscInt nt = from->local.n = to->local.n = nprocslocal;
1650: /* we have a scatter to ourselves */
1651: PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1652: PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1653: nt = 0;
1654: for (i=0; i<nx; i++) {
1655: idx = inidx[i];
1656: if (idx >= owners[rank] && idx < owners[rank+1]) {
1657: to->local.vslots[nt] = idx - owners[rank];
1658: from->local.vslots[nt++] = inidy[i];
1659: if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1660: }
1661: }
1662: } else {
1663: from->local.n = 0;
1664: from->local.vslots = 0;
1665: to->local.n = 0;
1666: to->local.vslots = 0;
1667: }
1668: from->local.nonmatching_computed = PETSC_FALSE;
1669: from->local.n_nonmatching = 0;
1670: from->local.slots_nonmatching = 0;
1671: to->local.nonmatching_computed = PETSC_FALSE;
1672: to->local.n_nonmatching = 0;
1673: to->local.slots_nonmatching = 0;
1675: from->type = VEC_SCATTER_MPI_GENERAL;
1676: to->type = VEC_SCATTER_MPI_GENERAL;
1677: from->bs = bs;
1678: to->bs = bs;
1680: VecScatterCreateCommon_PtoS(from,to,ctx);
1681: return(0);
1682: }
1684: /*
1685: bs indicates how many elements there are in each block. Normally this would be 1.
1686: */
1689: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1690: {
1691: MPI_Comm comm = ((PetscObject)ctx)->comm;
1692: PetscMPIInt tag = ((PetscObject)ctx)->tag, tagr;
1693: PetscInt bs = to->bs;
1694: PetscMPIInt size;
1695: PetscInt i, n;
1697:
1699: PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1700: ctx->destroy = VecScatterDestroy_PtoP;
1702: PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1703: from->sendfirst = to->sendfirst;
1705: MPI_Comm_size(comm,&size);
1706: /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1707: to->contiq = PETSC_FALSE;
1708: n = from->starts[from->n];
1709: from->contiq = PETSC_TRUE;
1710: for (i=1; i<n; i++) {
1711: if (from->indices[i] != from->indices[i-1] + bs) {
1712: from->contiq = PETSC_FALSE;
1713: break;
1714: }
1715: }
1717: PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoall",&to->use_alltoallv);
1718: from->use_alltoallv = to->use_alltoallv;
1719: if (from->use_alltoallv) PetscInfo(ctx,"Using MPI_Alltoallv() for scatter\n");
1720: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1721: if (to->use_alltoallv) {
1722: PetscOptionsHasName(PETSC_NULL,"-vecscatter_nopack",&to->use_alltoallw);
1723: }
1724: from->use_alltoallw = to->use_alltoallw;
1725: if (from->use_alltoallw) PetscInfo(ctx,"Using MPI_Alltoallw() for scatter\n");
1726: #endif
1728: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1729: PetscOptionsHasName(PETSC_NULL,"-vecscatter_window",&to->use_window);
1730: from->use_window = to->use_window;
1731: #endif
1733: if (to->use_alltoallv) {
1735: PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1736: PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1737: for (i=0; i<to->n; i++) {
1738: to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1739: }
1740: to->displs[0] = 0;
1741: for (i=1; i<size; i++) {
1742: to->displs[i] = to->displs[i-1] + to->counts[i-1];
1743: }
1745: PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1746: PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1747: for (i=0; i<from->n; i++) {
1748: from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1749: }
1750: from->displs[0] = 0;
1751: for (i=1; i<size; i++) {
1752: from->displs[i] = from->displs[i-1] + from->counts[i-1];
1753: }
1754: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1755: if (to->use_alltoallw) {
1756: PetscMPIInt mpibs = (PetscMPIInt)bs, mpilen;
1757: ctx->packtogether = PETSC_FALSE;
1758: PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1759: PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1760: PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1761: for (i=0; i<size; i++) {
1762: to->types[i] = MPIU_SCALAR;
1763: }
1765: for (i=0; i<to->n; i++) {
1766: to->wcounts[to->procs[i]] = 1;
1767: mpilen = (PetscMPIInt)(to->starts[i+1]-to->starts[i]);
1768: MPI_Type_create_indexed_block(mpilen,mpibs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1769: MPI_Type_commit(to->types+to->procs[i]);
1770: }
1771: PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1772: PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1773: PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1774: for (i=0; i<size; i++) {
1775: from->types[i] = MPIU_SCALAR;
1776: }
1777: if (from->contiq) {
1778: PetscInfo(ctx,"Scattered vector entries are stored contiquously, taking advantage of this with -vecscatter_alltoall\n");
1779: for (i=0; i<from->n; i++) {
1780: from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1781: }
1782: if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1783: for (i=1; i<from->n; i++) {
1784: from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1785: }
1786: } else {
1787: for (i=0; i<from->n; i++) {
1788: from->wcounts[from->procs[i]] = 1;
1789: mpilen = (PetscMPIInt)(from->starts[i+1]-from->starts[i]);
1790: MPI_Type_create_indexed_block(mpilen,mpibs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1791: MPI_Type_commit(from->types+from->procs[i]);
1792: }
1793: }
1794: }
1795: #else
1796: to->use_alltoallw = PETSC_FALSE;
1797: from->use_alltoallw = PETSC_FALSE;
1798: #endif
1799: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1800: } else if (to->use_window) {
1801: PetscMPIInt temptag,winsize;
1802: MPI_Request *request;
1803: MPI_Status *status;
1804:
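      /* Annotation: exchange starting offsets so each process learns where, inside its
         neighbor's receive window, its contribution begins (stored in winstarts[]);
         the same is done for the reverse direction below */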
1805: PetscObjectGetNewTag((PetscObject)ctx,&temptag);
1806: winsize = (to->n ? to->starts[to->n] : 0)*sizeof(PetscScalar);
1807: MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
1808: PetscMalloc(to->n*sizeof(*to->winstarts),&to->winstarts);
1809: PetscMalloc2(to->n,MPI_Request,&request,to->n,MPI_Status,&status);
1810: for (i=0; i<to->n; i++) {
1811: MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
1812: }
1813: for (i=0; i<from->n; i++) {
1814: MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
1815: }
1816: MPI_Waitall(to->n,request,status);
1817: PetscFree2(request,status);
1819: winsize = (from->n ? from->starts[from->n] : 0)*sizeof(PetscScalar);
1820: MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
1821: PetscMalloc(from->n*sizeof(PetscInt),&from->winstarts);
1822: PetscMalloc2(from->n,MPI_Request,&request,from->n,MPI_Status,&status);
1823: for (i=0; i<from->n; i++) {
1824: MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
1825: }
1826: for (i=0; i<to->n; i++) {
1827: MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
1828: }
1829: MPI_Waitall(from->n,request,status);
1830: PetscFree2(request,status);
1831: #endif
1832: } else {
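    /* Default variant: persistent point-to-point requests are built once with
       MPI_Send_init/MPI_Recv_init (and their reverse-scatter counterparts) so each scatter
       only needs to start them; ready-mode and synchronous-mode sends can be requested
       with the -vecscatter_rsend and -vecscatter_ssend options handled below. */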
1833: PetscTruth use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
1834: PetscInt *sstarts = to->starts, *rstarts = from->starts;
1835: PetscMPIInt *sprocs = to->procs, *rprocs = from->procs;
1836: MPI_Request *swaits = to->requests,*rwaits = from->requests;
1837: MPI_Request *rev_swaits,*rev_rwaits;
1838: PetscScalar *Ssvalues = to->values, *Srvalues = from->values;
1840: /* allocate additional wait variables for the "reverse" scatter */
1841: PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1842: PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1843: to->rev_requests = rev_rwaits;
1844: from->rev_requests = rev_swaits;
1846: /* Register the receives that you will use later (sends for scatter reverse) */
1847: PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&use_rsend);
1848: PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&use_ssend);
1849: if (use_rsend) {
1850: PetscInfo(ctx,"Using VecScatter ready receiver mode\n");
1851: to->use_readyreceiver = PETSC_TRUE;
1852: from->use_readyreceiver = PETSC_TRUE;
1853: } else {
1854: to->use_readyreceiver = PETSC_FALSE;
1855: from->use_readyreceiver = PETSC_FALSE;
1856: }
1857: if (use_ssend) {
1858: PetscInfo(ctx,"Using VecScatter Ssend mode\n");
1859: }
1861: for (i=0; i<from->n; i++) {
1862: if (use_rsend) {
1863: MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1864: } else if (use_ssend) {
1865: MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1866: } else {
1867: MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1868: }
1869: }
1871: for (i=0; i<to->n; i++) {
1872: if (use_rsend) {
1873: MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1874: } else if (use_ssend) {
1875: MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1876: } else {
1877: MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1878: }
1879: }
1880: /* Register receives for scatter and reverse */
1881: for (i=0; i<from->n; i++) {
1882: MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1883: }
1884: for (i=0; i<to->n; i++) {
1885: MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
1886: }
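    /* Ready-mode sends require the matching receives to be posted before any send starts,
       so for -vecscatter_rsend the persistent receives are started here once and a barrier
       guarantees every process has posted them before the first scatter runs. */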
1887: if (use_rsend) {
1888: if (to->n) {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
1889: if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1890: MPI_Barrier(comm);
1891: }
1893: ctx->copy = VecScatterCopy_PtoP_X;
1894: }
1895: PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);
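  /* Install begin/end routines specialized for the block size; these presumably hard-code
     the bs components per index in their pack/unpack loops.  Block sizes without a
     specialized routine are rejected below. */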
1896: switch (bs) {
1897: case 12:
1898: ctx->begin = VecScatterBegin_12;
1899: ctx->end = VecScatterEnd_12;
1900: break;
1901: case 8:
1902: ctx->begin = VecScatterBegin_8;
1903: ctx->end = VecScatterEnd_8;
1904: break;
1905: case 7:
1906: ctx->begin = VecScatterBegin_7;
1907: ctx->end = VecScatterEnd_7;
1908: break;
1909: case 6:
1910: ctx->begin = VecScatterBegin_6;
1911: ctx->end = VecScatterEnd_6;
1912: break;
1913: case 5:
1914: ctx->begin = VecScatterBegin_5;
1915: ctx->end = VecScatterEnd_5;
1916: break;
1917: case 4:
1918: ctx->begin = VecScatterBegin_4;
1919: ctx->end = VecScatterEnd_4;
1920: break;
1921: case 3:
1922: ctx->begin = VecScatterBegin_3;
1923: ctx->end = VecScatterEnd_3;
1924: break;
1925: case 2:
1926: ctx->begin = VecScatterBegin_2;
1927: ctx->end = VecScatterEnd_2;
1928: break;
1929: case 1:
1930: ctx->begin = VecScatterBegin_1;
1931: ctx->end = VecScatterEnd_1;
1932: break;
1933: default:
1934: SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
1935: }
1936: ctx->view = VecScatterView_MPI;
1938: /* Check if the local scatter is actually a copy; important special case */
1939: if (to->local.n) {
1940: VecScatterLocalOptimizeCopy_Private(ctx,&to->local,&from->local,bs);
1941: }
1942: return(0);
1943: }
1947: /* ------------------------------------------------------------------------------------*/
1948: /*
1949: Scatter from local Seq vectors to a parallel vector.
1950: Reverses the order of the arguments, calls VecScatterCreate_PtoS() then
1951: reverses the result.
1952: */
1955: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1956: {
1957: PetscErrorCode ierr;
1958: MPI_Request *waits;
1959: VecScatter_MPI_General *to,*from;
1962: VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
1963: to = (VecScatter_MPI_General*)ctx->fromdata;
1964: from = (VecScatter_MPI_General*)ctx->todata;
1965: ctx->todata = (void*)to;
1966: ctx->fromdata = (void*)from;
1967: /* these two are special, they are ALWAYS stored in the to struct */
1968: to->sstatus = from->sstatus;
1969: to->rstatus = from->rstatus;
1971: from->sstatus = 0;
1972: from->rstatus = 0;
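  /* Since the scatter was built as PtoS with the arguments reversed, the forward and
     reverse persistent requests also trade roles, mirroring the todata/fromdata swap above. */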
1974: waits = from->rev_requests;
1975: from->rev_requests = from->requests;
1976: from->requests = waits;
1977: waits = to->rev_requests;
1978: to->rev_requests = to->requests;
1979: to->requests = waits;
1980: return(0);
1981: }
1983: /* ---------------------------------------------------------------------------------*/
1986: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
1987: {
1989: PetscMPIInt size,rank,tag,imdex,n;
1990: PetscInt *lens = PETSC_NULL,*owners = xin->map.range;
1991: PetscInt *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
1992: PetscInt *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1993: PetscInt *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,nmax,*values = PETSC_NULL,lastidx;
1994: MPI_Comm comm;
1995: MPI_Request *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1996: MPI_Status recv_status,*send_status = PETSC_NULL;
1997: PetscTruth duplicate = PETSC_FALSE;
1998: #if defined(PETSC_USE_DEBUG)
1999: PetscTruth found = PETSC_FALSE;
2000: #endif
2003: PetscObjectGetNewTag((PetscObject)ctx,&tag);
2004: PetscObjectGetComm((PetscObject)xin,&comm);
2005: MPI_Comm_size(comm,&size);
2006: MPI_Comm_rank(comm,&rank);
2007: if (size == 1) {
2008: VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
2009: return(0);
2010: }
2012: /*
2013: Each processor ships its inidx[j] and inidy[j] pairs to the processor that owns inidx[j];
2014: each then calls VecScatterCreate_StoP() on the pairs it received
2015: */
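  /*
     Ownership of an x index is read off xin's range array (owners[]); each (x,y) pair is
     packed as two consecutive integers in svalues so that one message per destination
     process carries all of the pairs it owns.
  */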
2016: /* first count number of contributors to each processor */
2017: PetscMalloc3(2*size,PetscInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
2018: PetscMemzero(nprocs,2*size*sizeof(PetscInt));
2019: lastidx = -1;
2020: j = 0;
2021: for (i=0; i<nx; i++) {
2022: /* if indices are NOT locally sorted, need to start search at the beginning */
2023: if (lastidx > (idx = inidx[i])) j = 0;
2024: lastidx = idx;
2025: for (; j<size; j++) {
2026: if (idx >= owners[j] && idx < owners[j+1]) {
2027: nprocs[2*j]++;
2028: nprocs[2*j+1] = 1;
2029: owner[i] = j;
2030: #if defined(PETSC_USE_DEBUG)
2031: found = PETSC_TRUE;
2032: #endif
2033: break;
2034: }
2035: }
2036: #if defined(PETSC_USE_DEBUG)
2037: if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2038: found = PETSC_FALSE;
2039: #endif
2040: }
2041: nsends = 0; for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}
2043: /* inform other processors of number of messages and max length*/
2044: PetscMaxSum(comm,nprocs,&nmax,&nrecvs);
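  /* PetscMaxSum() combines the nprocs[] arrays across the communicator: nmax is the largest
     number of pairs any single process sends to this one (used to size each receive buffer
     of length 2*nmax) and nrecvs is how many processes will send to this one. */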
2046: /* post receives: */
2047: PetscMalloc6(2*nrecvs*nmax,PetscInt,&rvalues,2*nx,PetscInt,&svalues,2*nrecvs,PetscInt,&lens,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);
2049: for (i=0; i<nrecvs; i++) {
2050: MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2051: }
2053: /* do sends:
2054: starts[i] gives the starting index in svalues for the index pairs going to
2055: the ith processor
2056: */
2057: starts[0]= 0;
2058: for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2059: for (i=0; i<nx; i++) {
2060: svalues[2*starts[owner[i]]] = inidx[i];
2061: svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2062: }
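  /* the packing loop above advanced starts[] in place, so rebuild it before issuing the sends */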
2064: starts[0] = 0;
2065: for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2066: count = 0;
2067: for (i=0; i<size; i++) {
2068: if (nprocs[2*i+1]) {
2069: MPI_Isend(svalues+2*starts[i],2*nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count);
2070: count++;
2071: }
2072: }
2073: PetscFree3(nprocs,owner,starts);
2075: /* wait on receives */
2076: count = nrecvs;
2077: slen = 0;
2078: while (count) {
2079: MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2080: /* unpack receives into our local space */
2081: MPI_Get_count(&recv_status,MPIU_INT,&n);
2082: lens[imdex] = n/2;
2083: slen += n/2;
2084: count--;
2085: }
2086:
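  /* Each received pair is (global index into xin, index into yin); subtracting base, the
     start of this process's ownership range, converts the x index to local numbering before
     the StoP scatter is created. */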
2087: PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2088: base = owners[rank];
2089: count = 0;
2090: for (i=0; i<nrecvs; i++) {
2091: values = rvalues + 2*i*nmax;
2092: for (j=0; j<lens[i]; j++) {
2093: local_inidx[count] = values[2*j] - base;
2094: local_inidy[count++] = values[2*j+1];
2095: }
2096: }
2098: /* wait on sends */
2099: if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2100: PetscFree6(rvalues,svalues,lens,recv_waits,send_waits,send_status);
2102: /*
2103: should sort and remove duplicates from local_inidx,local_inidy; the block below does so only when do_it_slow is defined
2104: */
2106: #if defined(do_it_slow)
2107: /* sort on the from index */
2108: PetscSortIntWithArray(slen,local_inidx,local_inidy);
2109: start = 0;
2110: while (start < slen) {
2111: count = start+1;
2112: last = local_inidx[start];
2113: while (count < slen && last == local_inidx[count]) count++;
2114: if (count > start + 1) { /* found two or more equal local_inidx[] values in a row */
2115: /* sort on to index */
2116: PetscSortInt(count-start,local_inidy+start);
2117: }
2118: /* remove duplicates; not the most efficient way, but probably good enough */
2119: i = start;
2120: while (i < count-1) {
2121: if (local_inidy[i] != local_inidy[i+1]) {
2122: i++;
2123: } else { /* found a duplicate */
2124: duplicate = PETSC_TRUE;
2125: for (j=i; j<slen-1; j++) {
2126: local_inidx[j] = local_inidx[j+1];
2127: local_inidy[j] = local_inidy[j+1];
2128: }
2129: slen--;
2130: count--;
2131: }
2132: }
2133: start = count;
2134: }
2135: #endif
2136: if (duplicate) {
2137: PetscInfo(ctx,"Duplicate from/to indices passed in VecScatterCreate(); they are ignored\n");
2138: }
2139: VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2140: PetscFree2(local_inidx,local_inidy);
2141: return(0);
2142: }