Actual source code: vpscat.c

petsc-dev 2014-02-02
  2: /*
  3:     Defines parallel vector scatters.
  4: */

  6: #include <../src/vec/vec/impls/dvecimpl.h>         /*I "petscvec.h" I*/
  7: #include <../src/vec/vec/impls/mpi/pvecimpl.h>

 11: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 12: {
 13:   VecScatter_MPI_General *to  =(VecScatter_MPI_General*)ctx->todata;
 14:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 15:   PetscErrorCode         ierr;
 16:   PetscInt               i;
 17:   PetscMPIInt            rank;
 18:   PetscViewerFormat      format;
 19:   PetscBool              iascii;

 22:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 23:   if (iascii) {
 24:     MPI_Comm_rank(PetscObjectComm((PetscObject)ctx),&rank);
 25:     PetscViewerGetFormat(viewer,&format);
 26:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 27:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 29:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 30:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 31:       itmp = to->starts[to->n];
 32:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 33:       itmp = from->starts[from->n];
 34:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));
 35:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,PetscObjectComm((PetscObject)ctx));

 37:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 38:       PetscViewerASCIIPrintf(viewer,"  Maximum number of sends %D\n",nsend_max);
 39:       PetscViewerASCIIPrintf(viewer,"  Maximum number of receives %D\n",nrecv_max);
 40:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(PetscInt)(lensend_max*to->bs*sizeof(PetscScalar)));
 41:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(PetscInt)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 42:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(PetscInt)(alldata*to->bs*sizeof(PetscScalar)));

 44:     } else {
 45:       PetscViewerASCIISynchronizedAllow(viewer,PETSC_TRUE);
 46:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 47:       if (to->n) {
 48:         for (i=0; i<to->n; i++) {
 49:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 50:         }
 51:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
 52:         for (i=0; i<to->starts[to->n]; i++) {
 53:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,to->indices[i]);
 54:         }
 55:       }

 57:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 58:       if (from->n) {
 59:         for (i=0; i<from->n; i++) {
 60:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 61:         }

 63:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
 64:         for (i=0; i<from->starts[from->n]; i++) {
 65:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,from->indices[i]);
 66:         }
 67:       }
 68:       if (to->local.n) {
 69:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Indices for local part of scatter\n",rank);
 70:         for (i=0; i<to->local.n; i++) {
 71:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
 72:         }
 73:       }

 75:       PetscViewerFlush(viewer);
 76:       PetscViewerASCIISynchronizedAllow(viewer,PETSC_FALSE);
 77:     }
 78:   }
 79:   return(0);
 80: }

 82: /* -----------------------------------------------------------------------------------*/
 83: /*
 84:       The next routine determines what part of the local scatter is an
 85:   exact copy of values into their current location. We check this here and
 86:   then know that we need not perform that portion of the scatter when the vector is
 87:   scattering to itself with INSERT_VALUES.

 89:      This is currently not used, but it would speed up, for example, DMLocalToLocalBegin/End().

 91: */
 94: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
 95: {
 96:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
 98:   PetscInt       *nto_slots,*nfrom_slots,j = 0;

101:   for (i=0; i<n; i++) {
102:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
103:   }

105:   if (!n_nonmatching) {
106:     to->nonmatching_computed = PETSC_TRUE;
107:     to->n_nonmatching        = from->n_nonmatching = 0;

109:     PetscInfo1(scatter,"Reduced %D to 0\n", n);
110:   } else if (n_nonmatching == n) {
111:     to->nonmatching_computed = PETSC_FALSE;

113:     PetscInfo(scatter,"All values non-matching\n");
114:   } else {
115:     to->nonmatching_computed= PETSC_TRUE;
116:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;

118:     PetscMalloc1(n_nonmatching,&nto_slots);
119:     PetscMalloc1(n_nonmatching,&nfrom_slots);

121:     to->slots_nonmatching   = nto_slots;
122:     from->slots_nonmatching = nfrom_slots;
123:     for (i=0; i<n; i++) {
124:       if (to_slots[i] != from_slots[i]) {
125:         nto_slots[j]   = to_slots[i];
126:         nfrom_slots[j] = from_slots[i];
127:         j++;
128:       }
129:     }
130:     PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
131:   }
132:   return(0);
133: }
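/*
   A small worked example of what the routine above computes (illustrative slot values only):
   with to->vslots = {3,7,9,12} and from->vslots = {3,5,9,12}, three of the four positions
   already match, so n_nonmatching = 1 and the slots_nonmatching arrays hold just to-slot 7
   and from-slot 5; only that one entry would need to be moved in a self-scatter with
   INSERT_VALUES.
*/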

135: /* --------------------------------------------------------------------------------------*/

137: /* -------------------------------------------------------------------------------------*/
140: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
141: {
142:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
143:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
144:   PetscErrorCode         ierr;
145:   PetscInt               i;

148:   if (to->use_readyreceiver) {
149:     /*
150:        Since we have already posted sends we must cancel them before freeing
151:        the requests
152:     */
153:     for (i=0; i<from->n; i++) {
154:       MPI_Cancel(from->requests+i);
155:     }
156:     for (i=0; i<to->n; i++) {
157:       MPI_Cancel(to->rev_requests+i);
158:     }
159:     MPI_Waitall(from->n,from->requests,to->rstatus);
160:     MPI_Waitall(to->n,to->rev_requests,to->rstatus);
161:   }

163: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
164:   if (to->use_alltoallw) {
165:     PetscFree3(to->wcounts,to->wdispls,to->types);
166:     PetscFree3(from->wcounts,from->wdispls,from->types);
167:   }
168: #endif

170: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
171:   if (to->use_window) {
172:     MPI_Win_free(&from->window);
173:     MPI_Win_free(&to->window);
174:     PetscFree(from->winstarts);
175:     PetscFree(to->winstarts);
176:   }
177: #endif

179:   if (to->use_alltoallv) {
180:     PetscFree2(to->counts,to->displs);
181:     PetscFree2(from->counts,from->displs);
182:   }

184:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
185:   /*
186:      IBM's PE version of MPI has a bug where freeing these requests corrupts later
187:      message passing.
188:   */
189: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
190:   if (!to->use_alltoallv && !to->use_window) {   /* currently the to->requests etc are ALWAYS allocated even if not used */
191:     if (to->requests) {
192:       for (i=0; i<to->n; i++) {
193:         MPI_Request_free(to->requests + i);
194:       }
195:     }
196:     if (to->rev_requests) {
197:       for (i=0; i<to->n; i++) {
198:         MPI_Request_free(to->rev_requests + i);
199:       }
200:     }
201:   }
202:   /*
203:       MPICH could not properly cancel requests, so with ready-receiver mode we
204:     cannot free the requests. This may be fixed now; if not, put the following
205:     code inside an if (!to->use_readyreceiver) { } block.
206:   */
207:   if (!to->use_alltoallv && !to->use_window) {    /* currently the from->requests etc are ALWAYS allocated even if not used */
208:     if (from->requests) {
209:       for (i=0; i<from->n; i++) {
210:         MPI_Request_free(from->requests + i);
211:       }
212:     }

214:     if (from->rev_requests) {
215:       for (i=0; i<from->n; i++) {
216:         MPI_Request_free(from->rev_requests + i);
217:       }
218:     }
219:   }
220: #endif

222:   PetscFree(to->local.vslots);
223:   PetscFree(from->local.vslots);
224:   PetscFree2(to->counts,to->displs);
225:   PetscFree2(from->counts,from->displs);
226:   PetscFree(to->local.slots_nonmatching);
227:   PetscFree(from->local.slots_nonmatching);
228:   PetscFree(to->rev_requests);
229:   PetscFree(from->rev_requests);
230:   PetscFree(to->requests);
231:   PetscFree(from->requests);
232:   PetscFree4(to->values,to->indices,to->starts,to->procs);
233:   PetscFree2(to->sstatus,to->rstatus);
234:   PetscFree4(from->values,from->indices,from->starts,from->procs);
235:   PetscFree(from);
236:   PetscFree(to);
237:   return(0);
238: }



242: /* --------------------------------------------------------------------------------------*/
243: /*
244:     Special optimization to check whether the local part of the scatter is actually
245:     a contiguous copy; if so, the scatter routines call PetscMemcpy() instead of scattering entry by entry.

247: */
250: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
251: {
252:   PetscInt       n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
253:   PetscInt       to_start,from_start;

257:   to_start   = to_slots[0];
258:   from_start = from_slots[0];

260:   for (i=1; i<n; i++) {
261:     to_start   += bs;
262:     from_start += bs;
263:     if (to_slots[i]   != to_start)   return(0);
264:     if (from_slots[i] != from_start) return(0);
265:   }
266:   to->is_copy       = PETSC_TRUE;
267:   to->copy_start    = to_slots[0];
268:   to->copy_length   = bs*sizeof(PetscScalar)*n;
269:   from->is_copy     = PETSC_TRUE;
270:   from->copy_start  = from_slots[0];
271:   from->copy_length = bs*sizeof(PetscScalar)*n;
272:   PetscInfo(scatter,"Local scatter is a copy, optimizing for it\n");
273:   return(0);
274: }
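/*
   A small worked example for the copy detection above (illustrative slot values only): with
   bs = 2, to->vslots = {10,12,14} and from->vslots = {4,6,8} both advance by exactly bs at
   every step, so the whole local scatter collapses to a single PetscMemcpy() with
   copy_length = 2*3*sizeof(PetscScalar) bytes and copy_start 10 and 4 respectively.
*/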

276: /* --------------------------------------------------------------------------------------*/

280: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
281: {
282:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
283:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
284:   PetscErrorCode         ierr;
285:   PetscInt               ny,bs = in_from->bs;

288:   out->begin   = in->begin;
289:   out->end     = in->end;
290:   out->copy    = in->copy;
291:   out->destroy = in->destroy;
292:   out->view    = in->view;

294:   /* allocate entire send scatter context */
295:   PetscNewLog(out,&out_to);
296:   PetscNewLog(out,&out_from);

298:   ny                = in_to->starts[in_to->n];
299:   out_to->n         = in_to->n;
300:   out_to->type      = in_to->type;
301:   out_to->sendfirst = in_to->sendfirst;

303:   PetscMalloc1(out_to->n,&out_to->requests);
304:   PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
305:   PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
306:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
307:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
308:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));

310:   out->todata                        = (void*)out_to;
311:   out_to->local.n                    = in_to->local.n;
312:   out_to->local.nonmatching_computed = PETSC_FALSE;
313:   out_to->local.n_nonmatching        = 0;
314:   out_to->local.slots_nonmatching    = 0;
315:   if (in_to->local.n) {
316:     PetscMalloc1(in_to->local.n,&out_to->local.vslots);
317:     PetscMalloc1(in_from->local.n,&out_from->local.vslots);
318:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
319:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
320:   } else {
321:     out_to->local.vslots   = 0;
322:     out_from->local.vslots = 0;
323:   }

325:   /* allocate entire receive context */
326:   out_from->type      = in_from->type;
327:   ny                  = in_from->starts[in_from->n];
328:   out_from->n         = in_from->n;
329:   out_from->sendfirst = in_from->sendfirst;

331:   PetscMalloc1(out_from->n,&out_from->requests);
332:   PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
333:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
334:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
335:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));

337:   out->fromdata                        = (void*)out_from;
338:   out_from->local.n                    = in_from->local.n;
339:   out_from->local.nonmatching_computed = PETSC_FALSE;
340:   out_from->local.n_nonmatching        = 0;
341:   out_from->local.slots_nonmatching    = 0;

343:   /*
344:       set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
345:   */
346:   {
347:     PetscMPIInt tag;
348:     MPI_Comm    comm;
349:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
350:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
351:     PetscInt    i;
352:     PetscBool   flg;
353:     MPI_Request *swaits   = out_to->requests,*rwaits  = out_from->requests;
354:     MPI_Request *rev_swaits,*rev_rwaits;
355:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

357:     PetscMalloc1(in_to->n,&out_to->rev_requests);
358:     PetscMalloc1(in_from->n,&out_from->rev_requests);

360:     rev_rwaits = out_to->rev_requests;
361:     rev_swaits = out_from->rev_requests;

363:     out_from->bs = out_to->bs = bs;
364:     tag          = ((PetscObject)out)->tag;
365:     PetscObjectGetComm((PetscObject)out,&comm);

367:     /* Register the receives that you will use later (sends for scatter reverse) */
368:     for (i=0; i<out_from->n; i++) {
369:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
370:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
371:     }

373:     flg  = PETSC_FALSE;
374:     PetscOptionsGetBool(NULL,"-vecscatter_rsend",&flg,NULL);
375:     if (flg) {
376:       out_to->use_readyreceiver   = PETSC_TRUE;
377:       out_from->use_readyreceiver = PETSC_TRUE;
378:       for (i=0; i<out_to->n; i++) {
379:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
380:       }
381:       if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
382:       MPI_Barrier(comm);
383:       PetscInfo(in,"Using VecScatter ready receiver mode\n");
384:     } else {
385:       out_to->use_readyreceiver   = PETSC_FALSE;
386:       out_from->use_readyreceiver = PETSC_FALSE;
387:       flg                         = PETSC_FALSE;
388:       PetscOptionsGetBool(NULL,"-vecscatter_ssend",&flg,NULL);
389:       if (flg) {
390:         PetscInfo(in,"Using VecScatter Ssend mode\n");
391:       }
392:       for (i=0; i<out_to->n; i++) {
393:         if (!flg) {
394:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
395:         } else {
396:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
397:         }
398:       }
399:     }
400:     /* Register receives for scatter reverse */
401:     for (i=0; i<out_to->n; i++) {
402:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
403:     }
404:   }
405:   return(0);
406: }

410: PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
411: {
412:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
413:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
414:   PetscErrorCode         ierr;
415:   PetscInt               ny,bs = in_from->bs;
416:   PetscMPIInt            size;

419:   MPI_Comm_size(PetscObjectComm((PetscObject)in),&size);

421:   out->begin     = in->begin;
422:   out->end       = in->end;
423:   out->copy      = in->copy;
424:   out->destroy   = in->destroy;
425:   out->view      = in->view;

427:   /* allocate entire send scatter context */
428:   PetscNewLog(out,&out_to);
429:   PetscNewLog(out,&out_from);

431:   ny                = in_to->starts[in_to->n];
432:   out_to->n         = in_to->n;
433:   out_to->type      = in_to->type;
434:   out_to->sendfirst = in_to->sendfirst;

436:   PetscMalloc1(out_to->n,&out_to->requests);
437:   PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);
438:   PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);
439:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
440:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
441:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));

443:   out->todata                        = (void*)out_to;
444:   out_to->local.n                    = in_to->local.n;
445:   out_to->local.nonmatching_computed = PETSC_FALSE;
446:   out_to->local.n_nonmatching        = 0;
447:   out_to->local.slots_nonmatching    = 0;
448:   if (in_to->local.n) {
449:     PetscMalloc1(in_to->local.n,&out_to->local.vslots);
450:     PetscMalloc1(in_from->local.n,&out_from->local.vslots);
451:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
452:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
453:   } else {
454:     out_to->local.vslots   = 0;
455:     out_from->local.vslots = 0;
456:   }

458:   /* allocate entire receive context */
459:   out_from->type      = in_from->type;
460:   ny                  = in_from->starts[in_from->n];
461:   out_from->n         = in_from->n;
462:   out_from->sendfirst = in_from->sendfirst;

464:   PetscMalloc1(out_from->n,&out_from->requests);
465:   PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);
466:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
467:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
468:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));

470:   out->fromdata                        = (void*)out_from;
471:   out_from->local.n                    = in_from->local.n;
472:   out_from->local.nonmatching_computed = PETSC_FALSE;
473:   out_from->local.n_nonmatching        = 0;
474:   out_from->local.slots_nonmatching    = 0;

476:   out_to->use_alltoallv = out_from->use_alltoallv = PETSC_TRUE;

478:   PetscMalloc2(size,&out_to->counts,size,&out_to->displs);
479:   PetscMemcpy(out_to->counts,in_to->counts,size*sizeof(PetscMPIInt));
480:   PetscMemcpy(out_to->displs,in_to->displs,size*sizeof(PetscMPIInt));

482:   PetscMalloc2(size,&out_from->counts,size,&out_from->displs);
483:   PetscMemcpy(out_from->counts,in_from->counts,size*sizeof(PetscMPIInt));
484:   PetscMemcpy(out_from->displs,in_from->displs,size*sizeof(PetscMPIInt));
485:   return(0);
486: }
487: /* --------------------------------------------------------------------------------------------------
488:     Packs the message data into send buffers and unpacks it from receive buffers.

490:     These could be generated automatically.

492:     Fortran kernels etc. could be used.
493: */
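/*
   For reference, a generic runtime-block-size pack kernel (an illustrative sketch with the
   hypothetical name Pack_bs; the library relies on the hand-unrolled Pack_<bs> kernels below):
*/
PETSC_STATIC_INLINE void Pack_bs(PetscInt n,PetscInt bs,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i,j,idx;
  for (i=0; i<n; i++) {
    idx = *indicesx++;                        /* start of the next block in x */
    for (j=0; j<bs; j++) y[j] = x[idx+j];     /* copy one block of bs scalars */
    y += bs;                                  /* advance to the next slot in the send buffer */
  }
}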
494: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
495: {
496:   PetscInt i;
497:   for (i=0; i<n; i++) y[i] = x[indicesx[i]];
498: }

502: PETSC_STATIC_INLINE PetscErrorCode UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
503: {
504:   PetscInt i;

507:   switch (addv) {
508:   case INSERT_VALUES:
509:   case INSERT_ALL_VALUES:
510:     for (i=0; i<n; i++) y[indicesy[i]] = x[i];
511:     break;
512:   case ADD_VALUES:
513:   case ADD_ALL_VALUES:
514:     for (i=0; i<n; i++) y[indicesy[i]] += x[i];
515:     break;
516: #if !defined(PETSC_USE_COMPLEX)
517:   case MAX_VALUES:
518:     for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
519: #else
520:   case MAX_VALUES:
521: #endif
522:   case NOT_SET_VALUES:
523:     break;
524:   default:
525:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
526:   }
527:   return(0);
528: }

532: PETSC_STATIC_INLINE PetscErrorCode Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
533: {
534:   PetscInt i;

537:   switch (addv) {
538:   case INSERT_VALUES:
539:   case INSERT_ALL_VALUES:
540:     for (i=0; i<n; i++) y[indicesy[i]] = x[indicesx[i]];
541:     break;
542:   case ADD_VALUES:
543:   case ADD_ALL_VALUES:
544:     for (i=0; i<n; i++) y[indicesy[i]] += x[indicesx[i]];
545:     break;
546: #if !defined(PETSC_USE_COMPLEX)
547:   case MAX_VALUES:
548:     for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
549: #else
550:   case MAX_VALUES:
551: #endif
552:   case NOT_SET_VALUES:
553:     break;
554:   default:
555:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
556:   }
557:   return(0);
558: }

560: /* ----------------------------------------------------------------------------------------------- */
561: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
562: {
563:   PetscInt i,idx;

565:   for (i=0; i<n; i++) {
566:     idx  = *indicesx++;
567:     y[0] = x[idx];
568:     y[1] = x[idx+1];
569:     y   += 2;
570:   }
571: }

575: PETSC_STATIC_INLINE PetscErrorCode UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
576: {
577:   PetscInt i,idy;

580:   switch (addv) {
581:   case INSERT_VALUES:
582:   case INSERT_ALL_VALUES:
583:     for (i=0; i<n; i++) {
584:       idy      = *indicesy++;
585:       y[idy]   = x[0];
586:       y[idy+1] = x[1];
587:       x       += 2;
588:     }
589:     break;
590:   case ADD_VALUES:
591:   case ADD_ALL_VALUES:
592:     for (i=0; i<n; i++) {
593:       idy       = *indicesy++;
594:       y[idy]   += x[0];
595:       y[idy+1] += x[1];
596:       x        += 2;
597:     }
598:     break;
599: #if !defined(PETSC_USE_COMPLEX)
600:   case MAX_VALUES:
601:     for (i=0; i<n; i++) {
602:       idy      = *indicesy++;
603:       y[idy]   = PetscMax(y[idy],x[0]);
604:       y[idy+1] = PetscMax(y[idy+1],x[1]);
605:       x       += 2;
606:     }
607: #else
608:   case MAX_VALUES:
609: #endif
610:   case NOT_SET_VALUES:
611:     break;
612:   default:
613:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
614:   }
615:   return(0);
616: }

620: PETSC_STATIC_INLINE PetscErrorCode Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
621: {
622:   PetscInt i,idx,idy;

625:   switch (addv) {
626:   case INSERT_VALUES:
627:   case INSERT_ALL_VALUES:
628:     for (i=0; i<n; i++) {
629:       idx      = *indicesx++;
630:       idy      = *indicesy++;
631:       y[idy]   = x[idx];
632:       y[idy+1] = x[idx+1];
633:     }
634:     break;
635:   case ADD_VALUES:
636:   case ADD_ALL_VALUES:
637:     for (i=0; i<n; i++) {
638:       idx       = *indicesx++;
639:       idy       = *indicesy++;
640:       y[idy]   += x[idx];
641:       y[idy+1] += x[idx+1];
642:     }
643:     break;
644: #if !defined(PETSC_USE_COMPLEX)
645:   case MAX_VALUES:
646:     for (i=0; i<n; i++) {
647:       idx      = *indicesx++;
648:       idy      = *indicesy++;
649:       y[idy]   = PetscMax(y[idy],x[idx]);
650:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
651:     }
652: #else
653:   case MAX_VALUES:
654: #endif
655:   case NOT_SET_VALUES:
656:     break;
657:   default:
658:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
659:   }
660:   return(0);
661: }
662: /* ----------------------------------------------------------------------------------------------- */
663: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
664: {
665:   PetscInt i,idx;

667:   for (i=0; i<n; i++) {
668:     idx  = *indicesx++;
669:     y[0] = x[idx];
670:     y[1] = x[idx+1];
671:     y[2] = x[idx+2];
672:     y   += 3;
673:   }
674: }
677: PETSC_STATIC_INLINE PetscErrorCode UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
678: {
679:   PetscInt i,idy;

682:   switch (addv) {
683:   case INSERT_VALUES:
684:   case INSERT_ALL_VALUES:
685:     for (i=0; i<n; i++) {
686:       idy      = *indicesy++;
687:       y[idy]   = x[0];
688:       y[idy+1] = x[1];
689:       y[idy+2] = x[2];
690:       x       += 3;
691:     }
692:     break;
693:   case ADD_VALUES:
694:   case ADD_ALL_VALUES:
695:     for (i=0; i<n; i++) {
696:       idy       = *indicesy++;
697:       y[idy]   += x[0];
698:       y[idy+1] += x[1];
699:       y[idy+2] += x[2];
700:       x        += 3;
701:     }
702:     break;
703: #if !defined(PETSC_USE_COMPLEX)
704:   case MAX_VALUES:
705:     for (i=0; i<n; i++) {
706:       idy      = *indicesy++;
707:       y[idy]   = PetscMax(y[idy],x[0]);
708:       y[idy+1] = PetscMax(y[idy+1],x[1]);
709:       y[idy+2] = PetscMax(y[idy+2],x[2]);
710:       x       += 3;
711:     }
712: #else
713:   case MAX_VALUES:
714: #endif
715:   case NOT_SET_VALUES:
716:     break;
717:   default:
718:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
719:   }
720:   return(0);
721: }

725: PETSC_STATIC_INLINE PetscErrorCode Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
726: {
727:   PetscInt i,idx,idy;

730:   switch (addv) {
731:   case INSERT_VALUES:
732:   case INSERT_ALL_VALUES:
733:     for (i=0; i<n; i++) {
734:       idx      = *indicesx++;
735:       idy      = *indicesy++;
736:       y[idy]   = x[idx];
737:       y[idy+1] = x[idx+1];
738:       y[idy+2] = x[idx+2];
739:     }
740:     break;
741:   case ADD_VALUES:
742:   case ADD_ALL_VALUES:
743:     for (i=0; i<n; i++) {
744:       idx       = *indicesx++;
745:       idy       = *indicesy++;
746:       y[idy]   += x[idx];
747:       y[idy+1] += x[idx+1];
748:       y[idy+2] += x[idx+2];
749:     }
750:     break;
751: #if !defined(PETSC_USE_COMPLEX)
752:   case MAX_VALUES:
753:     for (i=0; i<n; i++) {
754:       idx      = *indicesx++;
755:       idy      = *indicesy++;
756:       y[idy]   = PetscMax(y[idy],x[idx]);
757:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
758:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
759:     }
760: #else
761:   case MAX_VALUES:
762: #endif
763:   case NOT_SET_VALUES:
764:     break;
765:   default:
766:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
767:   }
768:   return(0);
769: }
770: /* ----------------------------------------------------------------------------------------------- */
771: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
772: {
773:   PetscInt i,idx;

775:   for (i=0; i<n; i++) {
776:     idx  = *indicesx++;
777:     y[0] = x[idx];
778:     y[1] = x[idx+1];
779:     y[2] = x[idx+2];
780:     y[3] = x[idx+3];
781:     y   += 4;
782:   }
783: }
786: PETSC_STATIC_INLINE PetscErrorCode UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
787: {
788:   PetscInt i,idy;

791:   switch (addv) {
792:   case INSERT_VALUES:
793:   case INSERT_ALL_VALUES:
794:     for (i=0; i<n; i++) {
795:       idy      = *indicesy++;
796:       y[idy]   = x[0];
797:       y[idy+1] = x[1];
798:       y[idy+2] = x[2];
799:       y[idy+3] = x[3];
800:       x       += 4;
801:     }
802:     break;
803:   case ADD_VALUES:
804:   case ADD_ALL_VALUES:
805:     for (i=0; i<n; i++) {
806:       idy       = *indicesy++;
807:       y[idy]   += x[0];
808:       y[idy+1] += x[1];
809:       y[idy+2] += x[2];
810:       y[idy+3] += x[3];
811:       x        += 4;
812:     }
813:     break;
814: #if !defined(PETSC_USE_COMPLEX)
815:   case MAX_VALUES:
816:     for (i=0; i<n; i++) {
817:       idy      = *indicesy++;
818:       y[idy]   = PetscMax(y[idy],x[0]);
819:       y[idy+1] = PetscMax(y[idy+1],x[1]);
820:       y[idy+2] = PetscMax(y[idy+2],x[2]);
821:       y[idy+3] = PetscMax(y[idy+3],x[3]);
822:       x       += 4;
823:     }
824: #else
825:   case MAX_VALUES:
826: #endif
827:   case NOT_SET_VALUES:
828:     break;
829:   default:
830:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
831:   }
832:   return(0);
833: }

837: PETSC_STATIC_INLINE PetscErrorCode Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
838: {
839:   PetscInt i,idx,idy;

842:   switch (addv) {
843:   case INSERT_VALUES:
844:   case INSERT_ALL_VALUES:
845:     for (i=0; i<n; i++) {
846:       idx      = *indicesx++;
847:       idy      = *indicesy++;
848:       y[idy]   = x[idx];
849:       y[idy+1] = x[idx+1];
850:       y[idy+2] = x[idx+2];
851:       y[idy+3] = x[idx+3];
852:     }
853:     break;
854:   case ADD_VALUES:
855:   case ADD_ALL_VALUES:
856:     for (i=0; i<n; i++) {
857:       idx       = *indicesx++;
858:       idy       = *indicesy++;
859:       y[idy]   += x[idx];
860:       y[idy+1] += x[idx+1];
861:       y[idy+2] += x[idx+2];
862:       y[idy+3] += x[idx+3];
863:     }
864:     break;
865: #if !defined(PETSC_USE_COMPLEX)
866:   case MAX_VALUES:
867:     for (i=0; i<n; i++) {
868:       idx      = *indicesx++;
869:       idy      = *indicesy++;
870:       y[idy]   = PetscMax(y[idy],x[idx]);
871:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
872:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
873:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
874:     }
875: #else
876:   case MAX_VALUES:
877: #endif
878:   case NOT_SET_VALUES:
879:     break;
880:   default:
881:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
882:   }
883:   return(0);
884: }
885: /* ----------------------------------------------------------------------------------------------- */
886: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
887: {
888:   PetscInt i,idx;

890:   for (i=0; i<n; i++) {
891:     idx  = *indicesx++;
892:     y[0] = x[idx];
893:     y[1] = x[idx+1];
894:     y[2] = x[idx+2];
895:     y[3] = x[idx+3];
896:     y[4] = x[idx+4];
897:     y   += 5;
898:   }
899: }

903: PETSC_STATIC_INLINE PetscErrorCode UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
904: {
905:   PetscInt i,idy;

908:   switch (addv) {
909:   case INSERT_VALUES:
910:   case INSERT_ALL_VALUES:
911:     for (i=0; i<n; i++) {
912:       idy      = *indicesy++;
913:       y[idy]   = x[0];
914:       y[idy+1] = x[1];
915:       y[idy+2] = x[2];
916:       y[idy+3] = x[3];
917:       y[idy+4] = x[4];
918:       x       += 5;
919:     }
920:     break;
921:   case ADD_VALUES:
922:   case ADD_ALL_VALUES:
923:     for (i=0; i<n; i++) {
924:       idy       = *indicesy++;
925:       y[idy]   += x[0];
926:       y[idy+1] += x[1];
927:       y[idy+2] += x[2];
928:       y[idy+3] += x[3];
929:       y[idy+4] += x[4];
930:       x        += 5;
931:     }
932:     break;
933: #if !defined(PETSC_USE_COMPLEX)
934:   case MAX_VALUES:
935:     for (i=0; i<n; i++) {
936:       idy      = *indicesy++;
937:       y[idy]   = PetscMax(y[idy],x[0]);
938:       y[idy+1] = PetscMax(y[idy+1],x[1]);
939:       y[idy+2] = PetscMax(y[idy+2],x[2]);
940:       y[idy+3] = PetscMax(y[idy+3],x[3]);
941:       y[idy+4] = PetscMax(y[idy+4],x[4]);
942:       x       += 5;
943:     }
944: #else
945:   case MAX_VALUES:
946: #endif
947:   case NOT_SET_VALUES:
948:     break;
949:   default:
950:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
951:   }
952:   return(0);
953: }

957: PETSC_STATIC_INLINE PetscErrorCode Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
958: {
959:   PetscInt i,idx,idy;

962:   switch (addv) {
963:   case INSERT_VALUES:
964:   case INSERT_ALL_VALUES:
965:     for (i=0; i<n; i++) {
966:       idx      = *indicesx++;
967:       idy      = *indicesy++;
968:       y[idy]   = x[idx];
969:       y[idy+1] = x[idx+1];
970:       y[idy+2] = x[idx+2];
971:       y[idy+3] = x[idx+3];
972:       y[idy+4] = x[idx+4];
973:     }
974:     break;
975:   case ADD_VALUES:
976:   case ADD_ALL_VALUES:
977:     for (i=0; i<n; i++) {
978:       idx       = *indicesx++;
979:       idy       = *indicesy++;
980:       y[idy]   += x[idx];
981:       y[idy+1] += x[idx+1];
982:       y[idy+2] += x[idx+2];
983:       y[idy+3] += x[idx+3];
984:       y[idy+4] += x[idx+4];
985:     }
986:     break;
987: #if !defined(PETSC_USE_COMPLEX)
988:   case MAX_VALUES:
989:     for (i=0; i<n; i++) {
990:       idx      = *indicesx++;
991:       idy      = *indicesy++;
992:       y[idy]   = PetscMax(y[idy],x[idx]);
993:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
994:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
995:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
996:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
997:     }
998: #else
999:   case MAX_VALUES:
1000: #endif
1001:   case NOT_SET_VALUES:
1002:     break;
1003:   default:
1004:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1005:   }
1006:   return(0);
1007: }
1008: /* ----------------------------------------------------------------------------------------------- */
1009: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1010: {
1011:   PetscInt i,idx;

1013:   for (i=0; i<n; i++) {
1014:     idx  = *indicesx++;
1015:     y[0] = x[idx];
1016:     y[1] = x[idx+1];
1017:     y[2] = x[idx+2];
1018:     y[3] = x[idx+3];
1019:     y[4] = x[idx+4];
1020:     y[5] = x[idx+5];
1021:     y   += 6;
1022:   }
1023: }

1027: PETSC_STATIC_INLINE PetscErrorCode UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1028: {
1029:   PetscInt i,idy;

1032:   switch (addv) {
1033:   case INSERT_VALUES:
1034:   case INSERT_ALL_VALUES:
1035:     for (i=0; i<n; i++) {
1036:       idy      = *indicesy++;
1037:       y[idy]   = x[0];
1038:       y[idy+1] = x[1];
1039:       y[idy+2] = x[2];
1040:       y[idy+3] = x[3];
1041:       y[idy+4] = x[4];
1042:       y[idy+5] = x[5];
1043:       x       += 6;
1044:     }
1045:     break;
1046:   case ADD_VALUES:
1047:   case ADD_ALL_VALUES:
1048:     for (i=0; i<n; i++) {
1049:       idy       = *indicesy++;
1050:       y[idy]   += x[0];
1051:       y[idy+1] += x[1];
1052:       y[idy+2] += x[2];
1053:       y[idy+3] += x[3];
1054:       y[idy+4] += x[4];
1055:       y[idy+5] += x[5];
1056:       x        += 6;
1057:     }
1058:     break;
1059: #if !defined(PETSC_USE_COMPLEX)
1060:   case MAX_VALUES:
1061:     for (i=0; i<n; i++) {
1062:       idy      = *indicesy++;
1063:       y[idy]   = PetscMax(y[idy],x[0]);
1064:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1065:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1066:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1067:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1068:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1069:       x       += 6;
1070:     }
1071: #else
1072:   case MAX_VALUES:
1073: #endif
1074:   case NOT_SET_VALUES:
1075:     break;
1076:   default:
1077:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1078:   }
1079:   return(0);
1080: }

1084: PETSC_STATIC_INLINE PetscErrorCode Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1085: {
1086:   PetscInt i,idx,idy;

1089:   switch (addv) {
1090:   case INSERT_VALUES:
1091:   case INSERT_ALL_VALUES:
1092:     for (i=0; i<n; i++) {
1093:       idx      = *indicesx++;
1094:       idy      = *indicesy++;
1095:       y[idy]   = x[idx];
1096:       y[idy+1] = x[idx+1];
1097:       y[idy+2] = x[idx+2];
1098:       y[idy+3] = x[idx+3];
1099:       y[idy+4] = x[idx+4];
1100:       y[idy+5] = x[idx+5];
1101:     }
1102:     break;
1103:   case ADD_VALUES:
1104:   case ADD_ALL_VALUES:
1105:     for (i=0; i<n; i++) {
1106:       idx       = *indicesx++;
1107:       idy       = *indicesy++;
1108:       y[idy]   += x[idx];
1109:       y[idy+1] += x[idx+1];
1110:       y[idy+2] += x[idx+2];
1111:       y[idy+3] += x[idx+3];
1112:       y[idy+4] += x[idx+4];
1113:       y[idy+5] += x[idx+5];
1114:     }
1115:     break;
1116: #if !defined(PETSC_USE_COMPLEX)
1117:   case MAX_VALUES:
1118:     for (i=0; i<n; i++) {
1119:       idx      = *indicesx++;
1120:       idy      = *indicesy++;
1121:       y[idy]   = PetscMax(y[idy],x[idx]);
1122:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1123:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1124:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1125:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1126:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1127:     }
1128: #else
1129:   case MAX_VALUES:
1130: #endif
1131:   case NOT_SET_VALUES:
1132:     break;
1133:   default:
1134:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1135:   }
1136:   return(0);
1137: }
1138: /* ----------------------------------------------------------------------------------------------- */
1139: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1140: {
1141:   PetscInt i,idx;

1143:   for (i=0; i<n; i++) {
1144:     idx  = *indicesx++;
1145:     y[0] = x[idx];
1146:     y[1] = x[idx+1];
1147:     y[2] = x[idx+2];
1148:     y[3] = x[idx+3];
1149:     y[4] = x[idx+4];
1150:     y[5] = x[idx+5];
1151:     y[6] = x[idx+6];
1152:     y   += 7;
1153:   }
1154: }

1158: PETSC_STATIC_INLINE PetscErrorCode UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1159: {
1160:   PetscInt i,idy;

1163:   switch (addv) {
1164:   case INSERT_VALUES:
1165:   case INSERT_ALL_VALUES:
1166:     for (i=0; i<n; i++) {
1167:       idy      = *indicesy++;
1168:       y[idy]   = x[0];
1169:       y[idy+1] = x[1];
1170:       y[idy+2] = x[2];
1171:       y[idy+3] = x[3];
1172:       y[idy+4] = x[4];
1173:       y[idy+5] = x[5];
1174:       y[idy+6] = x[6];
1175:       x       += 7;
1176:     }
1177:     break;
1178:   case ADD_VALUES:
1179:   case ADD_ALL_VALUES:
1180:     for (i=0; i<n; i++) {
1181:       idy       = *indicesy++;
1182:       y[idy]   += x[0];
1183:       y[idy+1] += x[1];
1184:       y[idy+2] += x[2];
1185:       y[idy+3] += x[3];
1186:       y[idy+4] += x[4];
1187:       y[idy+5] += x[5];
1188:       y[idy+6] += x[6];
1189:       x        += 7;
1190:     }
1191:     break;
1192: #if !defined(PETSC_USE_COMPLEX)
1193:   case MAX_VALUES:
1194:     for (i=0; i<n; i++) {
1195:       idy      = *indicesy++;
1196:       y[idy]   = PetscMax(y[idy],x[0]);
1197:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1198:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1199:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1200:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1201:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1202:       y[idy+6] = PetscMax(y[idy+6],x[6]);
1203:       x       += 7;
1204:     }
1205: #else
1206:   case MAX_VALUES:
1207: #endif
1208:   case NOT_SET_VALUES:
1209:     break;
1210:   default:
1211:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1212:   }
1213:   return(0);
1214: }

1218: PETSC_STATIC_INLINE PetscErrorCode Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1219: {
1220:   PetscInt i,idx,idy;

1223:   switch (addv) {
1224:   case INSERT_VALUES:
1225:   case INSERT_ALL_VALUES:
1226:     for (i=0; i<n; i++) {
1227:       idx      = *indicesx++;
1228:       idy      = *indicesy++;
1229:       y[idy]   = x[idx];
1230:       y[idy+1] = x[idx+1];
1231:       y[idy+2] = x[idx+2];
1232:       y[idy+3] = x[idx+3];
1233:       y[idy+4] = x[idx+4];
1234:       y[idy+5] = x[idx+5];
1235:       y[idy+6] = x[idx+6];
1236:     }
1237:     break;
1238:   case ADD_VALUES:
1239:   case ADD_ALL_VALUES:
1240:     for (i=0; i<n; i++) {
1241:       idx       = *indicesx++;
1242:       idy       = *indicesy++;
1243:       y[idy]   += x[idx];
1244:       y[idy+1] += x[idx+1];
1245:       y[idy+2] += x[idx+2];
1246:       y[idy+3] += x[idx+3];
1247:       y[idy+4] += x[idx+4];
1248:       y[idy+5] += x[idx+5];
1249:       y[idy+6] += x[idx+6];
1250:     }
1251:     break;
1252: #if !defined(PETSC_USE_COMPLEX)
1253:   case MAX_VALUES:
1254:     for (i=0; i<n; i++) {
1255:       idx      = *indicesx++;
1256:       idy      = *indicesy++;
1257:       y[idy]   = PetscMax(y[idy],x[idx]);
1258:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1259:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1260:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1261:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1262:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1263:       y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1264:     }
1265: #else
1266:   case MAX_VALUES:
1267: #endif
1268:   case NOT_SET_VALUES:
1269:     break;
1270:   default:
1271:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1272:   }
1273:   return(0);
1274: }
1275: /* ----------------------------------------------------------------------------------------------- */
1276: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1277: {
1278:   PetscInt i,idx;

1280:   for (i=0; i<n; i++) {
1281:     idx  = *indicesx++;
1282:     y[0] = x[idx];
1283:     y[1] = x[idx+1];
1284:     y[2] = x[idx+2];
1285:     y[3] = x[idx+3];
1286:     y[4] = x[idx+4];
1287:     y[5] = x[idx+5];
1288:     y[6] = x[idx+6];
1289:     y[7] = x[idx+7];
1290:     y   += 8;
1291:   }
1292: }

1296: PETSC_STATIC_INLINE PetscErrorCode UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1297: {
1298:   PetscInt i,idy;

1301:   switch (addv) {
1302:   case INSERT_VALUES:
1303:   case INSERT_ALL_VALUES:
1304:     for (i=0; i<n; i++) {
1305:       idy      = *indicesy++;
1306:       y[idy]   = x[0];
1307:       y[idy+1] = x[1];
1308:       y[idy+2] = x[2];
1309:       y[idy+3] = x[3];
1310:       y[idy+4] = x[4];
1311:       y[idy+5] = x[5];
1312:       y[idy+6] = x[6];
1313:       y[idy+7] = x[7];
1314:       x       += 8;
1315:     }
1316:     break;
1317:   case ADD_VALUES:
1318:   case ADD_ALL_VALUES:
1319:     for (i=0; i<n; i++) {
1320:       idy       = *indicesy++;
1321:       y[idy]   += x[0];
1322:       y[idy+1] += x[1];
1323:       y[idy+2] += x[2];
1324:       y[idy+3] += x[3];
1325:       y[idy+4] += x[4];
1326:       y[idy+5] += x[5];
1327:       y[idy+6] += x[6];
1328:       y[idy+7] += x[7];
1329:       x        += 8;
1330:     }
1331:     break;
1332: #if !defined(PETSC_USE_COMPLEX)
1333:   case MAX_VALUES:
1334:     for (i=0; i<n; i++) {
1335:       idy      = *indicesy++;
1336:       y[idy]   = PetscMax(y[idy],x[0]);
1337:       y[idy+1] = PetscMax(y[idy+1],x[1]);
1338:       y[idy+2] = PetscMax(y[idy+2],x[2]);
1339:       y[idy+3] = PetscMax(y[idy+3],x[3]);
1340:       y[idy+4] = PetscMax(y[idy+4],x[4]);
1341:       y[idy+5] = PetscMax(y[idy+5],x[5]);
1342:       y[idy+6] = PetscMax(y[idy+6],x[6]);
1343:       y[idy+7] = PetscMax(y[idy+7],x[7]);
1344:       x       += 8;
1345:     }
1346: #else
1347:   case MAX_VALUES:
1348: #endif
1349:   case NOT_SET_VALUES:
1350:     break;
1351:   default:
1352:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1353:   }
1354:   return(0);
1355: }

1359: PETSC_STATIC_INLINE PetscErrorCode Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1360: {
1361:   PetscInt i,idx,idy;

1364:   switch (addv) {
1365:   case INSERT_VALUES:
1366:   case INSERT_ALL_VALUES:
1367:     for (i=0; i<n; i++) {
1368:       idx      = *indicesx++;
1369:       idy      = *indicesy++;
1370:       y[idy]   = x[idx];
1371:       y[idy+1] = x[idx+1];
1372:       y[idy+2] = x[idx+2];
1373:       y[idy+3] = x[idx+3];
1374:       y[idy+4] = x[idx+4];
1375:       y[idy+5] = x[idx+5];
1376:       y[idy+6] = x[idx+6];
1377:       y[idy+7] = x[idx+7];
1378:     }
1379:     break;
1380:   case ADD_VALUES:
1381:   case ADD_ALL_VALUES:
1382:     for (i=0; i<n; i++) {
1383:       idx       = *indicesx++;
1384:       idy       = *indicesy++;
1385:       y[idy]   += x[idx];
1386:       y[idy+1] += x[idx+1];
1387:       y[idy+2] += x[idx+2];
1388:       y[idy+3] += x[idx+3];
1389:       y[idy+4] += x[idx+4];
1390:       y[idy+5] += x[idx+5];
1391:       y[idy+6] += x[idx+6];
1392:       y[idy+7] += x[idx+7];
1393:     }
1394:     break;
1395: #if !defined(PETSC_USE_COMPLEX)
1396:   case MAX_VALUES:
1397:     for (i=0; i<n; i++) {
1398:       idx      = *indicesx++;
1399:       idy      = *indicesy++;
1400:       y[idy]   = PetscMax(y[idy],x[idx]);
1401:       y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
1402:       y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
1403:       y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
1404:       y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
1405:       y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
1406:       y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
1407:       y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
1408:     }
1409: #else
1410:   case MAX_VALUES:
1411: #endif
1412:   case NOT_SET_VALUES:
1413:     break;
1414:   default:
1415:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1416:   }
1417:   return(0);
1418: }

1420: /* ----------------------------------------------------------------------------------------------- */
1421: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1422: {
1423:   PetscInt i,idx;

1425:   for (i=0; i<n; i++) {
1426:     idx   = *indicesx++;
1427:     y[0]  = x[idx];
1428:     y[1]  = x[idx+1];
1429:     y[2]  = x[idx+2];
1430:     y[3]  = x[idx+3];
1431:     y[4]  = x[idx+4];
1432:     y[5]  = x[idx+5];
1433:     y[6]  = x[idx+6];
1434:     y[7]  = x[idx+7];
1435:     y[8]  = x[idx+8];
1436:     y[9]  = x[idx+9];
1437:     y[10] = x[idx+10];
1438:     y[11] = x[idx+11];
1439:     y    += 12;
1440:   }
1441: }

1445: PETSC_STATIC_INLINE PetscErrorCode UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1446: {
1447:   PetscInt i,idy;

1450:   switch (addv) {
1451:   case INSERT_VALUES:
1452:   case INSERT_ALL_VALUES:
1453:     for (i=0; i<n; i++) {
1454:       idy       = *indicesy++;
1455:       y[idy]    = x[0];
1456:       y[idy+1]  = x[1];
1457:       y[idy+2]  = x[2];
1458:       y[idy+3]  = x[3];
1459:       y[idy+4]  = x[4];
1460:       y[idy+5]  = x[5];
1461:       y[idy+6]  = x[6];
1462:       y[idy+7]  = x[7];
1463:       y[idy+8]  = x[8];
1464:       y[idy+9]  = x[9];
1465:       y[idy+10] = x[10];
1466:       y[idy+11] = x[11];
1467:       x        += 12;
1468:     }
1469:     break;
1470:   case ADD_VALUES:
1471:   case ADD_ALL_VALUES:
1472:     for (i=0; i<n; i++) {
1473:       idy        = *indicesy++;
1474:       y[idy]    += x[0];
1475:       y[idy+1]  += x[1];
1476:       y[idy+2]  += x[2];
1477:       y[idy+3]  += x[3];
1478:       y[idy+4]  += x[4];
1479:       y[idy+5]  += x[5];
1480:       y[idy+6]  += x[6];
1481:       y[idy+7]  += x[7];
1482:       y[idy+8]  += x[8];
1483:       y[idy+9]  += x[9];
1484:       y[idy+10] += x[10];
1485:       y[idy+11] += x[11];
1486:       x         += 12;
1487:     }
1488:     break;
1489: #if !defined(PETSC_USE_COMPLEX)
1490:   case MAX_VALUES:
1491:     for (i=0; i<n; i++) {
1492:       idy       = *indicesy++;
1493:       y[idy]    = PetscMax(y[idy],x[0]);
1494:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1495:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1496:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1497:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1498:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1499:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1500:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1501:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1502:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1503:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1504:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1505:       x        += 12;
1506:     }
1507: #else
1508:   case MAX_VALUES:
1509: #endif
1510:   case NOT_SET_VALUES:
1511:     break;
1512:   default:
1513:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1514:   }
1515:   return(0);
1516: }

1520: PETSC_STATIC_INLINE PetscErrorCode Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1521: {
1522:   PetscInt i,idx,idy;

1525:   switch (addv) {
1526:   case INSERT_VALUES:
1527:   case INSERT_ALL_VALUES:
1528:     for (i=0; i<n; i++) {
1529:       idx       = *indicesx++;
1530:       idy       = *indicesy++;
1531:       y[idy]    = x[idx];
1532:       y[idy+1]  = x[idx+1];
1533:       y[idy+2]  = x[idx+2];
1534:       y[idy+3]  = x[idx+3];
1535:       y[idy+4]  = x[idx+4];
1536:       y[idy+5]  = x[idx+5];
1537:       y[idy+6]  = x[idx+6];
1538:       y[idy+7]  = x[idx+7];
1539:       y[idy+8]  = x[idx+8];
1540:       y[idy+9]  = x[idx+9];
1541:       y[idy+10] = x[idx+10];
1542:       y[idy+11] = x[idx+11];
1543:     }
1544:     break;
1545:   case ADD_VALUES:
1546:   case ADD_ALL_VALUES:
1547:     for (i=0; i<n; i++) {
1548:       idx        = *indicesx++;
1549:       idy        = *indicesy++;
1550:       y[idy]    += x[idx];
1551:       y[idy+1]  += x[idx+1];
1552:       y[idy+2]  += x[idx+2];
1553:       y[idy+3]  += x[idx+3];
1554:       y[idy+4]  += x[idx+4];
1555:       y[idy+5]  += x[idx+5];
1556:       y[idy+6]  += x[idx+6];
1557:       y[idy+7]  += x[idx+7];
1558:       y[idy+8]  += x[idx+8];
1559:       y[idy+9]  += x[idx+9];
1560:       y[idy+10] += x[idx+10];
1561:       y[idy+11] += x[idx+11];
1562:     }
1563:     break;
1564: #if !defined(PETSC_USE_COMPLEX)
1565:   case MAX_VALUES:
1566:     for (i=0; i<n; i++) {
1567:       idx       = *indicesx++;
1568:       idy       = *indicesy++;
1569:       y[idy]    = PetscMax(y[idy],x[idx]);
1570:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1571:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1572:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1573:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1574:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1575:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1576:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1577:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1578:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1579:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1580:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1581:     }
1582: #else
1583:   case MAX_VALUES:
1584: #endif
1585:   case NOT_SET_VALUES:
1586:     break;
1587:   default:
1588:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
1589:   }
1590:   return(0);
1591: }

1593: /* Create the VecScatterBegin/End_P for our chosen block sizes */
1594: #define BS 1
1595: #include <../src/vec/vec/utils/vpscat.h>
1596: #define BS 2
1597: #include <../src/vec/vec/utils/vpscat.h>
1598: #define BS 3
1599: #include <../src/vec/vec/utils/vpscat.h>
1600: #define BS 4
1601: #include <../src/vec/vec/utils/vpscat.h>
1602: #define BS 5
1603: #include <../src/vec/vec/utils/vpscat.h>
1604: #define BS 6
1605: #include <../src/vec/vec/utils/vpscat.h>
1606: #define BS 7
1607: #include <../src/vec/vec/utils/vpscat.h>
1608: #define BS 8
1609: #include <../src/vec/vec/utils/vpscat.h>
1610: #define BS 12
1611: #include <../src/vec/vec/utils/vpscat.h>
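/*
   The #define BS / #include pairs above act as a poor man's C template: each inclusion of
   vpscat.h instantiates the scatter begin/end bodies for that block size.  A minimal
   self-contained sketch of the same technique (hypothetical names; the real template body
   lives in vpscat.h):

      // scaleblocks.h -- hypothetical template header parameterized by BS
      #define PASTE2_(a,b) a ## b
      #define PASTE2(a,b)  PASTE2_(a,b)
      static void PASTE2(ScaleBlocks_,BS)(PetscInt n,PetscScalar a,PetscScalar *x)
      {
        PetscInt i;
        for (i=0; i<n*BS; i++) x[i] *= a;     // one call handles n blocks of BS entries
      }
      #undef BS

      // instantiation, exactly as done above for vpscat.h
      #define BS 2
      #include "scaleblocks.h"                // defines ScaleBlocks_2()
      #define BS 4
      #include "scaleblocks.h"                // defines ScaleBlocks_4()
*/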

1613: /* ==========================================================================================*/

1615: /*              create parallel to sequential scatter context                           */

1617: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General*,VecScatter_MPI_General*,VecScatter);

1621: /*@
1622:      VecScatterCreateLocal - Creates a VecScatter from a list of messages it must send and receive.

1624:      Collective on VecScatter

1626:    Input Parameters:
1627: +     ctx - VecScatter context obtained with VecScatterCreateEmpty()
1628: .     nsends - number of messages (sends) this process will post
1629: .     sendSizes - number of entries in each message sent
1630: .     sendProcs - ranks of the processes the messages are sent to
1631: .     sendIdx - indices where the sent entries are obtained from (in local, on-process numbering); this is one long array of length \sum_{i=0,i<nsends} sendSizes[i]
1632: .     nrecvs - number of receives to expect
1633: .     recvSizes - number of entries in each message received
1634: .     recvProcs - ranks of the processes that are sending to me
1635: .     recvIdx - indices where the received entries are to be put (in local, on-process numbering); this is one long array of length \sum_{i=0,i<nrecvs} recvSizes[i]
1636: -     bs - block size (number of elements in each block)

1638:      Notes:  sendSizes[] and recvSizes[] cannot have any 0 entries. If you want to support having 0 entries you need
1639:       to change the code below to "compress out" the sendProcs[] and recvProcs[] entries that have 0 entries.

1641:        Probably does not handle sends to self properly. It should remove those from the counts that are used
1642:       in allocating space inside the from struct.

1644:   Level: intermediate

1646: @*/
1647: PetscErrorCode VecScatterCreateLocal(VecScatter ctx,PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs)
1648: {
1649:   VecScatter_MPI_General *from, *to;
1650:   PetscInt               sendSize, recvSize;
1651:   PetscInt               n, i;
1652:   PetscErrorCode         ierr;

1654:   /* allocate entire send scatter context */
1655:   PetscNewLog(ctx,&to);
1656:   to->n = nsends;
1657:   for (n = 0, sendSize = 0; n < to->n; n++) sendSize += sendSizes[n];

1659:   PetscMalloc1(to->n,&to->requests);
1660:   PetscMalloc4(bs*sendSize,&to->values,sendSize,&to->indices,to->n+1,&to->starts,to->n,&to->procs);
1661:   PetscMalloc2(PetscMax(to->n,nrecvs),&to->sstatus,PetscMax(to->n,nrecvs),&to->rstatus);

1663:   to->starts[0] = 0;
1664:   for (n = 0; n < to->n; n++) {
1665:     if (sendSizes[n] <=0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1666:     to->starts[n+1] = to->starts[n] + sendSizes[n];
1667:     to->procs[n]    = sendProcs[n];
1668:     for (i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) to->indices[i] = sendIdx[i];
1669:   }
1670:   ctx->todata = (void*) to;

1672:   /* allocate entire receive scatter context */
1673:   PetscNewLog(ctx,&from);
1674:   from->n = nrecvs;
1675:   for (n = 0, recvSize = 0; n < from->n; n++) recvSize += recvSizes[n];

1677:   PetscMalloc1(from->n,&from->requests);
1678:   PetscMalloc4(bs*recvSize,&from->values,recvSize,&from->indices,from->n+1,&from->starts,from->n,&from->procs);

1680:   from->starts[0] = 0;
1681:   for (n = 0; n < from->n; n++) {
1682:     if (recvSizes[n] <=0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1683:     from->starts[n+1] = from->starts[n] + recvSizes[n];
1684:     from->procs[n]    = recvProcs[n];
1685:     for (i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) from->indices[i] = recvIdx[i];
1686:   }
1687:   ctx->fromdata = (void*)from;

1689:   /* No local scatter optimization */
1690:   from->local.n                    = 0;
1691:   from->local.vslots               = 0;
1692:   to->local.n                      = 0;
1693:   to->local.vslots                 = 0;
1694:   from->local.nonmatching_computed = PETSC_FALSE;
1695:   from->local.n_nonmatching        = 0;
1696:   from->local.slots_nonmatching    = 0;
1697:   to->local.nonmatching_computed   = PETSC_FALSE;
1698:   to->local.n_nonmatching          = 0;
1699:   to->local.slots_nonmatching      = 0;

1701:   from->type = VEC_SCATTER_MPI_GENERAL;
1702:   to->type   = VEC_SCATTER_MPI_GENERAL;
1703:   from->bs   = bs;
1704:   to->bs     = bs;
1705:   VecScatterCreateCommon_PtoS(from, to, ctx);

1707:   /* mark lengths as negative so it won't check local vector lengths */
1708:   ctx->from_n = ctx->to_n = -1;
1709:   return(0);
1710: }
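/*
   A minimal usage sketch for VecScatterCreateLocal(), run on exactly two processes;
   the sizes, ranks, and indices are illustrative only, and it assumes the empty
   context is obtained with VecScatterCreateEmpty() as the manual page above states.
   Shown for rank 0; rank 1 would use sendProcs[] = recvProcs[] = {0}:

       VecScatter ctx;
       PetscInt   sendSizes[1] = {2}, sendProcs[1] = {1}, sendIdx[2] = {0,1};
       PetscInt   recvSizes[1] = {2}, recvProcs[1] = {1}, recvIdx[2] = {3,4};

       ierr = VecScatterCreateEmpty(PETSC_COMM_WORLD,&ctx);CHKERRQ(ierr);
       ierr = VecScatterCreateLocal(ctx,1,sendSizes,sendProcs,sendIdx,
                                        1,recvSizes,recvProcs,recvIdx,1);CHKERRQ(ierr);

   This sends local entries 0 and 1 to rank 1 and places the two entries received
   from rank 1 into local positions 3 and 4 (block size 1). Per the Notes above,
   no sendSizes[] or recvSizes[] entry may be zero.
*/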

1712: /*
1713:    bs indicates how many elements there are in each block. Normally this would be 1.

1715:    Contains a check that PetscMPIInt can handle the sizes needed.
1716: */
1719: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1720: {
1721:   VecScatter_MPI_General *from,*to;
1722:   PetscMPIInt            size,rank,imdex,tag,n;
1723:   PetscInt               *source = NULL,*owners = NULL;
1724:   PetscInt               *lowner = NULL,*start = NULL,lengthy,lengthx;
1725:   PetscMPIInt            *nprocs = NULL,nrecvs;
1726:   PetscInt               i,j,idx,nsends;
1727:   PetscInt               *owner = NULL,*starts = NULL,count,slen;
1728:   PetscInt               *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
1729:   PetscMPIInt            *onodes1,*olengths1;
1730:   MPI_Comm               comm;
1731:   MPI_Request            *send_waits = NULL,*recv_waits = NULL;
1732:   MPI_Status             recv_status,*send_status;
1733:   PetscErrorCode         ierr;

1736:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1737:   PetscObjectGetComm((PetscObject)xin,&comm);
1738:   MPI_Comm_rank(comm,&rank);
1739:   MPI_Comm_size(comm,&size);
1740:   owners = xin->map->range;
1741:   VecGetSize(yin,&lengthy);
1742:   VecGetSize(xin,&lengthx);

1744:   /*  first count number of contributors to each processor */
1745:   PetscMalloc2(size,&nprocs,nx,&owner);
1746:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));

1748:   j      = 0;
1749:   nsends = 0;
1750:   for (i=0; i<nx; i++) {
1751:     idx = bs*inidx[i];
1752:     if (idx < owners[j]) j = 0;
1753:     for (; j<size; j++) {
1754:       if (idx < owners[j+1]) {
1755:         if (!nprocs[j]++) nsends++;
1756:         owner[i] = j;
1757:         break;
1758:       }
1759:     }
1760:     if (j == size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ith %D block entry %D not owned by any process, upper bound %D",i,idx,owners[size]);
1761:   }

1763:   nprocslocal  = nprocs[rank];
1764:   nprocs[rank] = 0;
1765:   if (nprocslocal) nsends--;
1766:   /* inform other processors of the number of messages and the message lengths */
1767:   PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
1768:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
1769:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
1770:   recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];

1772:   /* post receives:   */
1773:   PetscMalloc3(recvtotal,&rvalues,nrecvs,&source,nrecvs,&recv_waits);
1774:   count = 0;
1775:   for (i=0; i<nrecvs; i++) {
1776:     MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
1777:     count += olengths1[i];
1778:   }

1780:   /* do sends:
1781:      1) starts[i] gives the starting index in svalues for stuff going to
1782:      the ith processor
1783:   */
1784:   PetscMalloc3(nx,&svalues,nsends,&send_waits,size+1,&starts);

1786:   starts[0]  = 0;
1787:   for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
1788:   for (i=0; i<nx; i++) {
1789:     if (owner[i] != rank) svalues[starts[owner[i]]++] = bs*inidx[i];
1790:   }
1791:   starts[0] = 0;
1792:   for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
1793:   count = 0;
1794:   for (i=0; i<size; i++) {
1795:     if (nprocs[i]) {
1796:       MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);
1797:     }
1798:   }

1800:   /*  wait on receives */
1801:   count = nrecvs;
1802:   slen  = 0;
1803:   while (count) {
1804:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1805:     /* unpack receives into our local space */
1806:     MPI_Get_count(&recv_status,MPIU_INT,&n);
1807:     slen += n;
1808:     count--;
1809:   }

1811:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);

1813:   /* allocate entire send scatter context */
1814:   PetscNewLog(ctx,&to);
1815:   to->n = nrecvs;

1817:   PetscMalloc1(nrecvs,&to->requests);
1818:   PetscMalloc4(bs*slen,&to->values,slen,&to->indices,nrecvs+1,&to->starts,nrecvs,&to->procs);
1819:   PetscMalloc2(PetscMax(to->n,nsends),&to->sstatus,PetscMax(to->n,nsends),&to->rstatus);

1821:   ctx->todata   = (void*)to;
1822:   to->starts[0] = 0;

1824:   if (nrecvs) {
1825:     /* move the data into the send scatter */
1826:     base     = owners[rank];
1827:     rsvalues = rvalues;
1828:     for (i=0; i<nrecvs; i++) {
1829:       to->starts[i+1] = to->starts[i] + olengths1[i];
1830:       to->procs[i]    = onodes1[i];
1831:       values = rsvalues;
1832:       rsvalues += olengths1[i];
1833:       for (j=0; j<olengths1[i]; j++) to->indices[to->starts[i] + j] = values[j] - base;
1834:     }
1835:   }
1836:   PetscFree(olengths1);
1837:   PetscFree(onodes1);
1838:   PetscFree3(rvalues,source,recv_waits);

1840:   /* allocate entire receive scatter context */
1841:   PetscNewLog(ctx,&from);
1842:   from->n = nsends;

1844:   PetscMalloc1(nsends,&from->requests);
1845:   PetscMalloc4((ny-nprocslocal)*bs,&from->values,ny-nprocslocal,&from->indices,nsends+1,&from->starts,from->n,&from->procs);
1846:   ctx->fromdata = (void*)from;

1848:   /* move data into receive scatter */
1849:   PetscMalloc2(size,&lowner,nsends+1,&start);
1850:   count = 0; from->starts[0] = start[0] = 0;
1851:   for (i=0; i<size; i++) {
1852:     if (nprocs[i]) {
1853:       lowner[i]            = count;
1854:       from->procs[count++] = i;
1855:       from->starts[count]  = start[count] = start[count-1] + nprocs[i];
1856:     }
1857:   }

1859:   for (i=0; i<nx; i++) {
1860:     if (owner[i] != rank) {
1861:       from->indices[start[lowner[owner[i]]]++] = bs*inidy[i];
1862:       if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1863:     }
1864:   }
1865:   PetscFree2(lowner,start);
1866:   PetscFree2(nprocs,owner);

1868:   /* wait on sends */
1869:   if (nsends) {
1870:     PetscMalloc1(nsends,&send_status);
1871:     MPI_Waitall(nsends,send_waits,send_status);
1872:     PetscFree(send_status);
1873:   }
1874:   PetscFree3(svalues,send_waits,starts);

1876:   if (nprocslocal) {
1877:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
1878:     /* we have a scatter to ourselves */
1879:     PetscMalloc1(nt,&to->local.vslots);
1880:     PetscMalloc1(nt,&from->local.vslots);
1881:     nt   = 0;
1882:     for (i=0; i<nx; i++) {
1883:       idx = bs*inidx[i];
1884:       if (idx >= owners[rank] && idx < owners[rank+1]) {
1885:         to->local.vslots[nt]     = idx - owners[rank];
1886:         from->local.vslots[nt++] = bs*inidy[i];
1887:         if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1888:       }
1889:     }
1890:   } else {
1891:     from->local.n      = 0;
1892:     from->local.vslots = 0;
1893:     to->local.n        = 0;
1894:     to->local.vslots   = 0;
1895:   }

1897:   from->local.nonmatching_computed = PETSC_FALSE;
1898:   from->local.n_nonmatching        = 0;
1899:   from->local.slots_nonmatching    = 0;
1900:   to->local.nonmatching_computed   = PETSC_FALSE;
1901:   to->local.n_nonmatching          = 0;
1902:   to->local.slots_nonmatching      = 0;

1904:   from->type = VEC_SCATTER_MPI_GENERAL;
1905:   to->type   = VEC_SCATTER_MPI_GENERAL;
1906:   from->bs   = bs;
1907:   to->bs     = bs;

1909:   VecScatterCreateCommon_PtoS(from,to,ctx);
1910:   return(0);
1911: }
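/*
   The routine above repeatedly converts per-process counts into packing offsets with a
   prefix sum over starts[] (starts[i] = starts[i-1] + nprocs[i-1]). A small worked
   example, assuming size = 3 and nprocs = {2, 0, 3}:

       starts = {0, 2, 2, 5}

   so the two entries destined for process 0 are packed at svalues[0..1], process 1
   receives nothing, and the three entries for process 2 go to svalues[2..4]. The pass
   over inidx[] increments starts[owner[i]] as each entry is placed, which is why
   starts[] is rebuilt from nprocs[] before the sends are posted.
*/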

1913: /*
1914:    bs indicates how many elements there are in each block. Normally this would be 1.
1915: */
1918: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1919: {
1920:   MPI_Comm       comm;
1921:   PetscMPIInt    tag  = ((PetscObject)ctx)->tag, tagr;
1922:   PetscInt       bs   = to->bs;
1923:   PetscMPIInt    size;
1924:   PetscInt       i, n;

1928:   PetscObjectGetComm((PetscObject)ctx,&comm);
1929:   PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1930:   ctx->destroy = VecScatterDestroy_PtoP;

1932:   ctx->reproduce = PETSC_FALSE;
1933:   to->sendfirst  = PETSC_FALSE;
1934:   PetscOptionsGetBool(NULL,"-vecscatter_reproduce",&ctx->reproduce,NULL);
1935:   PetscOptionsGetBool(NULL,"-vecscatter_sendfirst",&to->sendfirst,NULL);
1936:   from->sendfirst = to->sendfirst;

1938:   MPI_Comm_size(comm,&size);
1939:   /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
1940:   to->contiq = PETSC_FALSE;
1941:   n = from->starts[from->n];
1942:   from->contiq = PETSC_TRUE;
1943:   for (i=1; i<n; i++) {
1944:     if (from->indices[i] != from->indices[i-1] + bs) {
1945:       from->contiq = PETSC_FALSE;
1946:       break;
1947:     }
1948:   }

1950:   to->use_alltoallv = PETSC_FALSE;
1951:   PetscOptionsGetBool(NULL,"-vecscatter_alltoall",&to->use_alltoallv,NULL);
1952:   from->use_alltoallv = to->use_alltoallv;
1953:   if (from->use_alltoallv) PetscInfo(ctx,"Using MPI_Alltoallv() for scatter\n");
1954: #if defined(PETSC_HAVE_MPI_ALLTOALLW)  && !defined(PETSC_USE_64BIT_INDICES)
1955:   if (to->use_alltoallv) {
1956:     to->use_alltoallw = PETSC_FALSE;
1957:     PetscOptionsGetBool(NULL,"-vecscatter_nopack",&to->use_alltoallw,NULL);
1958:   }
1959:   from->use_alltoallw = to->use_alltoallw;
1960:   if (from->use_alltoallw) PetscInfo(ctx,"Using MPI_Alltoallw() for scatter\n");
1961: #endif

1963: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1964:   to->use_window = PETSC_FALSE;
1965:   PetscOptionsGetBool(NULL,"-vecscatter_window",&to->use_window,NULL);
1966:   from->use_window = to->use_window;
1967: #endif

1969:   if (to->use_alltoallv) {

1971:     PetscMalloc2(size,&to->counts,size,&to->displs);
1972:     PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1973:     for (i=0; i<to->n; i++) to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);

1975:     to->displs[0] = 0;
1976:     for (i=1; i<size; i++) to->displs[i] = to->displs[i-1] + to->counts[i-1];

1978:     PetscMalloc2(size,&from->counts,size,&from->displs);
1979:     PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1980:     for (i=0; i<from->n; i++) from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1981:     from->displs[0] = 0;
1982:     for (i=1; i<size; i++) from->displs[i] = from->displs[i-1] + from->counts[i-1];

1984: #if defined(PETSC_HAVE_MPI_ALLTOALLW) && !defined(PETSC_USE_64BIT_INDICES)
1985:     if (to->use_alltoallw) {
1986:       PetscMPIInt mpibs, mpilen;

1988:       ctx->packtogether = PETSC_FALSE;
1989:       PetscMPIIntCast(bs,&mpibs);
1990:       PetscMalloc3(size,&to->wcounts,size,&to->wdispls,size,&to->types);
1991:       PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1992:       PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1993:       for (i=0; i<size; i++) to->types[i] = MPIU_SCALAR;

1995:       for (i=0; i<to->n; i++) {
1996:         to->wcounts[to->procs[i]] = 1;
1997:         PetscMPIIntCast(to->starts[i+1]-to->starts[i],&mpilen);
1998:         MPI_Type_create_indexed_block(mpilen,mpibs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1999:         MPI_Type_commit(to->types+to->procs[i]);
2000:       }
2001:       PetscMalloc3(size,&from->wcounts,size,&from->wdispls,size,&from->types);
2002:       PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
2003:       PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
2004:       for (i=0; i<size; i++) from->types[i] = MPIU_SCALAR;

2006:       if (from->contiq) {
2007:         PetscInfo(ctx,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoall\n");
2008:         for (i=0; i<from->n; i++) from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);

2010:         if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
2011:         for (i=1; i<from->n; i++) from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];

2013:       } else {
2014:         for (i=0; i<from->n; i++) {
2015:           from->wcounts[from->procs[i]] = 1;
2016:           PetscMPIIntCast(from->starts[i+1]-from->starts[i],&mpilen);
2017:           MPI_Type_create_indexed_block(mpilen,mpibs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
2018:           MPI_Type_commit(from->types+from->procs[i]);
2019:         }
2020:       }
2021:     } else ctx->copy = VecScatterCopy_PtoP_AllToAll;

2023: #else
2024:     to->use_alltoallw   = PETSC_FALSE;
2025:     from->use_alltoallw = PETSC_FALSE;
2026:     ctx->copy           = VecScatterCopy_PtoP_AllToAll;
2027: #endif
2028: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
2029:   } else if (to->use_window) {
2030:     PetscMPIInt temptag,winsize;
2031:     MPI_Request *request;
2032:     MPI_Status  *status;

2034:     PetscObjectGetNewTag((PetscObject)ctx,&temptag);
2035:     winsize = (to->n ? to->starts[to->n] : 0)*bs*sizeof(PetscScalar);
2036:     MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
2037:     PetscMalloc1(to->n,&to->winstarts);
2038:     PetscMalloc2(to->n,&request,to->n,&status);
2039:     for (i=0; i<to->n; i++) {
2040:       MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
2041:     }
2042:     for (i=0; i<from->n; i++) {
2043:       MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
2044:     }
2045:     MPI_Waitall(to->n,request,status);
2046:     PetscFree2(request,status);

2048:     winsize = (from->n ? from->starts[from->n] : 0)*bs*sizeof(PetscScalar);
2049:     MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
2050:     PetscMalloc1(from->n,&from->winstarts);
2051:     PetscMalloc2(from->n,&request,from->n,&status);
2052:     for (i=0; i<from->n; i++) {
2053:       MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
2054:     }
2055:     for (i=0; i<to->n; i++) {
2056:       MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
2057:     }
2058:     MPI_Waitall(from->n,request,status);
2059:     PetscFree2(request,status);
2060: #endif
2061:   } else {
2062:     PetscBool   use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
2063:     PetscInt    *sstarts  = to->starts,  *rstarts = from->starts;
2064:     PetscMPIInt *sprocs   = to->procs,   *rprocs  = from->procs;
2065:     MPI_Request *swaits   = to->requests,*rwaits  = from->requests;
2066:     MPI_Request *rev_swaits,*rev_rwaits;
2067:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

2069:     /* allocate additional wait variables for the "reverse" scatter */
2070:     PetscMalloc1(to->n,&rev_rwaits);
2071:     PetscMalloc1(from->n,&rev_swaits);
2072:     to->rev_requests   = rev_rwaits;
2073:     from->rev_requests = rev_swaits;

2075:     /* Register the receives that you will use later (sends for scatter reverse) */
2076:     PetscOptionsGetBool(NULL,"-vecscatter_rsend",&use_rsend,NULL);
2077:     PetscOptionsGetBool(NULL,"-vecscatter_ssend",&use_ssend,NULL);
2078:     if (use_rsend) {
2079:       PetscInfo(ctx,"Using VecScatter ready receiver mode\n");
2080:       to->use_readyreceiver   = PETSC_TRUE;
2081:       from->use_readyreceiver = PETSC_TRUE;
2082:     } else {
2083:       to->use_readyreceiver   = PETSC_FALSE;
2084:       from->use_readyreceiver = PETSC_FALSE;
2085:     }
2086:     if (use_ssend) {
2087:       PetscInfo(ctx,"Using VecScatter Ssend mode\n");
2088:     }

2090:     for (i=0; i<from->n; i++) {
2091:       if (use_rsend) {
2092:         MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2093:       } else if (use_ssend) {
2094:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2095:       } else {
2096:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
2097:       }
2098:     }

2100:     for (i=0; i<to->n; i++) {
2101:       if (use_rsend) {
2102:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2103:       } else if (use_ssend) {
2104:         MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2105:       } else {
2106:         MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
2107:       }
2108:     }
2109:     /* Register receives for scatter and reverse */
2110:     for (i=0; i<from->n; i++) {
2111:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
2112:     }
2113:     for (i=0; i<to->n; i++) {
2114:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
2115:     }
2116:     if (use_rsend) {
2117:       if (to->n)   {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
2118:       if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
2119:       MPI_Barrier(comm);
2120:     }

2122:     ctx->copy = VecScatterCopy_PtoP_X;
2123:   }
2124:   PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);

2126: #if defined(PETSC_USE_DEBUG)
2127:   MPI_Allreduce(&bs,&i,1,MPIU_INT,MPI_MIN,PetscObjectComm((PetscObject)ctx));
2128:   MPI_Allreduce(&bs,&n,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)ctx));
2129:   if (bs!=i || bs!=n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Block size %D != %D or %D",bs,i,n);
2130: #endif

2132:   switch (bs) {
2133:   case 12:
2134:     ctx->begin = VecScatterBegin_12;
2135:     ctx->end   = VecScatterEnd_12;
2136:     break;
2137:   case 8:
2138:     ctx->begin = VecScatterBegin_8;
2139:     ctx->end   = VecScatterEnd_8;
2140:     break;
2141:   case 7:
2142:     ctx->begin = VecScatterBegin_7;
2143:     ctx->end   = VecScatterEnd_7;
2144:     break;
2145:   case 6:
2146:     ctx->begin = VecScatterBegin_6;
2147:     ctx->end   = VecScatterEnd_6;
2148:     break;
2149:   case 5:
2150:     ctx->begin = VecScatterBegin_5;
2151:     ctx->end   = VecScatterEnd_5;
2152:     break;
2153:   case 4:
2154:     ctx->begin = VecScatterBegin_4;
2155:     ctx->end   = VecScatterEnd_4;
2156:     break;
2157:   case 3:
2158:     ctx->begin = VecScatterBegin_3;
2159:     ctx->end   = VecScatterEnd_3;
2160:     break;
2161:   case 2:
2162:     ctx->begin = VecScatterBegin_2;
2163:     ctx->end   = VecScatterEnd_2;
2164:     break;
2165:   case 1:
2166:     ctx->begin = VecScatterBegin_1;
2167:     ctx->end   = VecScatterEnd_1;
2168:     break;
2169:   default:
2170:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Blocksize not supported");
2171:   }
2172:   ctx->view = VecScatterView_MPI;
2173:   /* Check if the local scatter is actually a copy; important special case */
2174:   if (to->local.n) {
2175:     VecScatterLocalOptimizeCopy_Private(ctx,&to->local,&from->local,bs);
2176:   }
2177:   return(0);
2178: }
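/*
   The common setup above is steered by several runtime options, all read with
   PetscOptionsGetBool() and defaulting to PETSC_FALSE:

       -vecscatter_reproduce   set ctx->reproduce for reproducible scatter results
       -vecscatter_sendfirst   post the sends before the receives
       -vecscatter_alltoall    communicate with MPI_Alltoallv() instead of point-to-point messages
       -vecscatter_nopack      with -vecscatter_alltoall, use MPI_Alltoallw() and derived
                               datatypes instead of packing buffers (where MPI_Alltoallw is available)
       -vecscatter_window      use MPI_Win_create() one-sided communication (where available)
       -vecscatter_rsend       use ready-receiver mode with persistent MPI_Rsend_init() sends
       -vecscatter_ssend       use synchronous persistent MPI_Ssend_init() sends

   A hypothetical command line exercising one of these might be
       mpiexec -n 4 ./app -vecscatter_alltoall
   where the executable name is illustrative only.
*/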



2182: /* ------------------------------------------------------------------------------------*/
2183: /*
2184:          Scatter from local Seq vectors to a parallel vector.
2185:          Reverses the order of the arguments, calls VecScatterCreate_PtoS() then
2186:          reverses the result.
2187: */
2190: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2191: {
2192:   PetscErrorCode         ierr;
2193:   MPI_Request            *waits;
2194:   VecScatter_MPI_General *to,*from;

2197:   VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
2198:   to            = (VecScatter_MPI_General*)ctx->fromdata;
2199:   from          = (VecScatter_MPI_General*)ctx->todata;
2200:   ctx->todata   = (void*)to;
2201:   ctx->fromdata = (void*)from;
2202:   /* these two are special; they are ALWAYS stored in the to struct */
2203:   to->sstatus   = from->sstatus;
2204:   to->rstatus   = from->rstatus;

2206:   from->sstatus = 0;
2207:   from->rstatus = 0;

2209:   waits              = from->rev_requests;
2210:   from->rev_requests = from->requests;
2211:   from->requests     = waits;
2212:   waits              = to->rev_requests;
2213:   to->rev_requests   = to->requests;
2214:   to->requests       = waits;
2215:   return(0);
2216: }

2218: /* ---------------------------------------------------------------------------------*/
2221: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2222: {
2224:   PetscMPIInt    size,rank,tag,imdex,n;
2225:   PetscInt       *owners = xin->map->range;
2226:   PetscMPIInt    *nprocs = NULL;
2227:   PetscInt       i,j,idx,nsends,*local_inidx = NULL,*local_inidy = NULL;
2228:   PetscInt       *owner   = NULL,*starts  = NULL,count,slen;
2229:   PetscInt       *rvalues = NULL,*svalues = NULL,base,*values = NULL,*rsvalues,recvtotal,lastidx;
2230:   PetscMPIInt    *onodes1,*olengths1,nrecvs;
2231:   MPI_Comm       comm;
2232:   MPI_Request    *send_waits = NULL,*recv_waits = NULL;
2233:   MPI_Status     recv_status,*send_status = NULL;
2234:   PetscBool      duplicate = PETSC_FALSE;
2235: #if defined(PETSC_USE_DEBUG)
2236:   PetscBool      found = PETSC_FALSE;
2237: #endif

2240:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2241:   PetscObjectGetComm((PetscObject)xin,&comm);
2242:   MPI_Comm_size(comm,&size);
2243:   MPI_Comm_rank(comm,&rank);
2244:   if (size == 1) {
2245:     VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,bs,ctx);
2246:     return(0);
2247:   }

2249:   /*
2250:      Each processor ships off its inidx[j] and inidy[j] to the appropriate processor;
2251:      each then calls VecScatterCreate_StoP() with the resulting local lists
2252:   */
2253:   /*  first count number of contributors to each processor */
2254:   PetscMalloc3(size,&nprocs,nx,&owner,(size+1),&starts);
2255:   PetscMemzero(nprocs,size*sizeof(PetscMPIInt));

2257:   lastidx = -1;
2258:   j       = 0;
2259:   for (i=0; i<nx; i++) {
2260:     /* if indices are NOT locally sorted, need to start search at the beginning */
2261:     if (lastidx > (idx = bs*inidx[i])) j = 0;
2262:     lastidx = idx;
2263:     for (; j<size; j++) {
2264:       if (idx >= owners[j] && idx < owners[j+1]) {
2265:         nprocs[j]++;
2266:         owner[i] = j;
2267: #if defined(PETSC_USE_DEBUG)
2268:         found = PETSC_TRUE;
2269: #endif
2270:         break;
2271:       }
2272:     }
2273: #if defined(PETSC_USE_DEBUG)
2274:     if (!found) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2275:     found = PETSC_FALSE;
2276: #endif
2277:   }
2278:   nsends = 0;
2279:   for (i=0; i<size; i++) nsends += (nprocs[i] > 0);

2281:   /* inform other processors of the number of messages and the message lengths */
2282:   PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);
2283:   PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);
2284:   PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);
2285:   recvtotal = 0; for (i=0; i<nrecvs; i++) recvtotal += olengths1[i];

2287:   /* post receives:   */
2288:   PetscMalloc5(2*recvtotal,&rvalues,2*nx,&svalues,nrecvs,&recv_waits,nsends,&send_waits,nsends,&send_status);

2290:   count = 0;
2291:   for (i=0; i<nrecvs; i++) {
2292:     MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);
2293:     count += olengths1[i];
2294:   }
2295:   PetscFree(onodes1);

2297:   /* do sends:
2298:       1) starts[i] gives the starting index in svalues for stuff going to
2299:          the ith processor
2300:   */
2301:   starts[0]= 0;
2302:   for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2303:   for (i=0; i<nx; i++) {
2304:     svalues[2*starts[owner[i]]]       = bs*inidx[i];
2305:     svalues[1 + 2*starts[owner[i]]++] = bs*inidy[i];
2306:   }

2308:   starts[0] = 0;
2309:   for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2310:   count = 0;
2311:   for (i=0; i<size; i++) {
2312:     if (nprocs[i]) {
2313:       MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);
2314:       count++;
2315:     }
2316:   }
2317:   PetscFree3(nprocs,owner,starts);

2319:   /*  wait on receives */
2320:   count = nrecvs;
2321:   slen  = 0;
2322:   while (count) {
2323:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2324:     /* unpack receives into our local space */
2325:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2326:     slen += n/2;
2327:     count--;
2328:   }
2329:   if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);

2331:   PetscMalloc2(slen,&local_inidx,slen,&local_inidy);
2332:   base     = owners[rank];
2333:   count    = 0;
2334:   rsvalues = rvalues;
2335:   for (i=0; i<nrecvs; i++) {
2336:     values    = rsvalues;
2337:     rsvalues += 2*olengths1[i];
2338:     for (j=0; j<olengths1[i]; j++) {
2339:       local_inidx[count]   = values[2*j] - base;
2340:       local_inidy[count++] = values[2*j+1];
2341:     }
2342:   }
2343:   PetscFree(olengths1);

2345:   /* wait on sends */
2346:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2347:   PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);

2349:   /*
2350:      should sort and remove duplicates from local_inidx,local_inidy
2351:   */

2353: #if defined(do_it_slow)
2354:   /* sort on the from index */
2355:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2356:   start = 0;
2357:   while (start < slen) {
2358:     count = start+1;
2359:     last  = local_inidx[start];
2360:     while (count < slen && last == local_inidx[count]) count++;
2361:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2362:       /* sort on to index */
2363:       PetscSortInt(count-start,local_inidy+start);
2364:     }
2365:     /* remove duplicates; not most efficient way, but probably good enough */
2366:     i = start;
2367:     while (i < count-1) {
2368:       if (local_inidy[i] != local_inidy[i+1]) i++;
2369:       else { /* found a duplicate */
2370:         duplicate = PETSC_TRUE;
2371:         for (j=i; j<slen-1; j++) {
2372:           local_inidx[j] = local_inidx[j+1];
2373:           local_inidy[j] = local_inidy[j+1];
2374:         }
2375:         slen--;
2376:         count--;
2377:       }
2378:     }
2379:     start = count;
2380:   }
2381: #endif
2382:   if (duplicate) {
2383:     PetscInfo(ctx,"Duplicate from/to indices passed in VecScatterCreate(); they are ignored\n");
2384:   }
2385:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,bs,ctx);
2386:   PetscFree2(local_inidx,local_inidy);
2387:   return(0);
2388: }
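/*
   In VecScatterCreate_PtoP() above each entry is shipped to its owning process as a pair
   of PetscInts: svalues[2*k] holds the block-scaled from index bs*inidx[i] and
   svalues[2*k+1] the matching to index bs*inidy[i]. For example, assuming three entries
   all owned by the same remote process, that message is laid out as

       { bs*inidx[0], bs*inidy[0], bs*inidx[1], bs*inidy[1], bs*inidx[2], bs*inidy[2] }

   which is why the receiving side divides the MPI_Get_count() result by two and walks the
   buffer in strides of two (values[2*j] - base and values[2*j+1]) when rebuilding
   local_inidx[] and local_inidy[] for the call to VecScatterCreate_StoP().
*/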