Actual source code: vpscat.c

  1: /*$Id: vpscat.c,v 1.159 2001/04/10 19:34:51 bsmith Exp $*/
  2: /*
  3:     Defines parallel vector scatters.
  4: */

  6: #include "petscsys.h"
  7: #include "src/vec/is/isimpl.h"
  8: #include "src/vec/vecimpl.h"                     /*I "petscvec.h" I*/
  9: #include "src/vec/impls/dvecimpl.h"
 10: #include "src/vec/impls/mpi/pvecimpl.h"

 12: int VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 13: {
 14:   VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
 15:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 16:   int                    i,rank,ierr;
 17:   PetscViewerFormat      format;
 18:   PetscTruth             isascii;

 21:   PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&isascii);
 22:   if (isascii) {
 23:     MPI_Comm_rank(ctx->comm,&rank);
 24:     PetscViewerGetFormat(viewer,&format);
 25:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 26:       int nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 28:       MPI_Reduce(&to->n,&nsend_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 29:       MPI_Reduce(&from->n,&nrecv_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 30:       itmp = to->starts[to->n];
 31:       MPI_Reduce(&itmp,&lensend_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 32:       itmp = from->starts[from->n];
 33:       MPI_Reduce(&itmp,&lenrecv_max,1,MPI_INT,MPI_MAX,0,ctx->comm);
 34:       MPI_Reduce(&itmp,&alldata,1,MPI_INT,MPI_SUM,0,ctx->comm);

 36:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 37:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %d\n",nsend_max);
 38:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %d\n",nrecv_max);
 39:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %d\n",lensend_max*to->bs*sizeof(Scalar));
 40:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %d\n",lenrecv_max*to->bs*sizeof(Scalar));
 41:       PetscViewerASCIIPrintf(viewer,"  Total data sent %d\n",alldata*to->bs*sizeof(Scalar));

 43:     } else {
 44:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %d; Number to self = %d\n",rank,to->n,to->local.n);
 45:       for (i=0; i<to->n; i++){
 46:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %d length = %d to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 47:       }

 49:       PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices\n");
 50:       for (i=0; i<to->starts[to->n]; i++){
 51:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%d \n",rank,to->indices[i]);
 52:       }

 54:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %d; Number from self = %d\n",rank,from->n,from->local.n);
 55:       for (i=0; i<from->n; i++){
 56:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d length %d from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 57:       }

 59:       PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices\n");
 60:       for (i=0; i<from->starts[from->n]; i++){
 61:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%d \n",rank,from->indices[i]);
 62:       }

 64:       PetscViewerFlush(viewer);
 65:     }
 66:   } else {
 67:     SETERRQ1(1,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
 68:   }
 69:   return(0);
 70: }

 72: /* -----------------------------------------------------------------------------------*/
 73: /*
 74:       The next routine determines which entries of the local part of the scatter are an
 75:   exact copy of values into their current location. We detect these here so that
 76:   this portion of the scatter can be skipped.
 77: */
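/*
      Worked example (illustrative, not part of the original file): suppose
   gen_to->slots = {0,5,2} and gen_from->slots = {0,1,2}.  Entries 0 and 2
   already sit in their final locations, so only the mismatching pair is
   recorded: slots_nonmatching = {5} on the "to" side and {1} on the "from"
   side, and later local scatters copy just that one value.
*/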
 78: int VecScatterLocalOptimize_Private(VecScatter_Seq_General *gen_to,VecScatter_Seq_General *gen_from)
 79: {
 80:   int n = gen_to->n,n_nonmatching = 0,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
 81:   int *nto_slots,*nfrom_slots,j = 0,ierr;
 82: 
 84:   for (i=0; i<n; i++) {
 85:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
 86:   }

 88:   if (!n_nonmatching) {
 89:     gen_to->nonmatching_computed = PETSC_TRUE;
 90:     gen_to->n_nonmatching        = gen_from->n_nonmatching = 0;
 91:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to 0\n",n);
 92:   } else if (n_nonmatching == n) {
 93:     gen_to->nonmatching_computed = PETSC_FALSE;
 94:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:All values non-matching\n");
 95:   } else {
 96:     gen_to->nonmatching_computed= PETSC_TRUE;
 97:     gen_to->n_nonmatching       = gen_from->n_nonmatching = n_nonmatching;
 98:     PetscMalloc(n_nonmatching*sizeof(int),&nto_slots);
 99:     gen_to->slots_nonmatching   = nto_slots;
100:     PetscMalloc(n_nonmatching*sizeof(int),&nfrom_slots);
101:     gen_from->slots_nonmatching = nfrom_slots;
102:     for (i=0; i<n; i++) {
103:       if (to_slots[i] != from_slots[i]) {
104:         nto_slots[j]   = to_slots[i];
105:         nfrom_slots[j] = from_slots[i];
106:         j++;
107:       }
108:     }
109:     PetscLogInfo(0,"VecScatterLocalOptimize_Private:Reduced %d to %d\n",n,n_nonmatching);
110:   }
111:   return(0);
112: }

114: /* --------------------------------------------------------------------------------------*/
115: int VecScatterCopy_PtoP(VecScatter in,VecScatter out)
116: {
117:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
118:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
119:   int                    len,ny,ierr;

122:   out->postrecvs = in->postrecvs;
123:   out->begin     = in->begin;
124:   out->end       = in->end;
125:   out->copy      = in->copy;
126:   out->destroy   = in->destroy;
127:   out->view      = in->view;

129:   /* allocate entire send scatter context */
130:   PetscNew(VecScatter_MPI_General,&out_to);
131:   PetscMemzero(out_to,sizeof(VecScatter_MPI_General));
132:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
133:   ny                = in_to->starts[in_to->n];
134:   len               = ny*(sizeof(int) + sizeof(Scalar)) + (in_to->n+1)*sizeof(int) +
135:                      (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
136:   out_to->n         = in_to->n;
137:   out_to->type      = in_to->type;
138:   out_to->sendfirst = in_to->sendfirst;
139:   PetscMalloc(len,&out_to->values);
140:   PetscLogObjectMemory(out,len);
141:   out_to->requests  = (MPI_Request*)(out_to->values + ny);
142:   out_to->indices   = (int*)(out_to->requests + out_to->n);
143:   out_to->starts    = (int*)(out_to->indices + ny);
144:   out_to->procs     = (int*)(out_to->starts + out_to->n + 1);
145:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
146:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
147:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
148:   PetscMalloc(2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status),&out_to->sstatus);
149:   out_to->rstatus = out_to->sstatus + PetscMax(in_to->n,in_from->n) + 1;
150:   PetscLogObjectMemory(out,2*(PetscMax(in_to->n,in_from->n) + 1)*sizeof(MPI_Status));
151:   out->todata      = (void*)out_to;
152:   out_to->local.n  = in_to->local.n;
153:   out_to->local.nonmatching_computed = PETSC_FALSE;
154:   out_to->local.n_nonmatching        = 0;
155:   out_to->local.slots_nonmatching    = 0;
156:   if (in_to->local.n) {
157:     PetscMalloc(in_to->local.n*sizeof(int),&out_to->local.slots);
158:     PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
159:     PetscLogObjectMemory(out,in_to->local.n*sizeof(int));
160:   } else {
161:     out_to->local.slots = 0;
162:   }

164:   /* allocate entire receive context */
165:   PetscNew(VecScatter_MPI_General,&out_from);
166:   PetscMemzero(out_from,sizeof(VecScatter_MPI_General));
167:   out_from->type      = in_from->type;
168:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
169:   ny                  = in_from->starts[in_from->n];
170:   len                 = ny*(sizeof(int) + sizeof(Scalar)) + (in_from->n+1)*sizeof(int) +
171:                        (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
172:   out_from->n         = in_from->n;
173:   out_from->sendfirst = in_from->sendfirst;
174:   PetscMalloc(len,&out_from->values);
175:   PetscLogObjectMemory(out,len);
176:   out_from->requests  = (MPI_Request*)(out_from->values + ny);
177:   out_from->indices   = (int*)(out_from->requests + out_from->n);
178:   out_from->starts    = (int*)(out_from->indices + ny);
179:   out_from->procs     = (int*)(out_from->starts + out_from->n + 1);
180:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
181:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
182:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
183:   out->fromdata       = (void*)out_from;
184:   out_from->local.n   = in_from->local.n;
185:   out_from->local.nonmatching_computed = PETSC_FALSE;
186:   out_from->local.n_nonmatching        = 0;
187:   out_from->local.slots_nonmatching    = 0;
188:   if (in_from->local.n) {
189:     PetscMalloc(in_from->local.n*sizeof(int),&out_from->local.slots);
190:     PetscLogObjectMemory(out,in_from->local.n*sizeof(int));
191:     PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
192:   } else {
193:     out_from->local.slots = 0;
194:   }
195:   return(0);
196: }

198: /* -------------------------------------------------------------------------------------*/
199: int VecScatterDestroy_PtoP(VecScatter ctx)
200: {
201:   VecScatter_MPI_General *gen_to   = (VecScatter_MPI_General*)ctx->todata;
202:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;

206:   if (gen_to->local.slots) {PetscFree(gen_to->local.slots);}
207:   if (gen_from->local.slots) {PetscFree(gen_from->local.slots);}
208:   if (gen_to->local.slots_nonmatching) {PetscFree(gen_to->local.slots_nonmatching);}
209:   if (gen_from->local.slots_nonmatching) {PetscFree(gen_from->local.slots_nonmatching);}
210:   PetscFree(gen_to->sstatus);
211:   PetscFree(gen_to->values);
212:   PetscFree(gen_to);
213:   PetscFree(gen_from->values);
214:   PetscFree(gen_from);
215:   PetscLogObjectDestroy(ctx);
216:   PetscHeaderDestroy(ctx);
217:   return(0);
218: }

220: /* --------------------------------------------------------------------------------------*/
221: /*
 222:      Even though the next routines are written for parallel
 223:   vectors, either xin or yin (but not both) may be a Seq
 224:   vector, one per processor.
 225:   
 226:      gen_from indices indicate where arriving values are stashed;
 227:      gen_to   indices indicate where departing values came from.
 228:      The naming can be VERY confusing.

230: */
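/*
      Illustration (not part of the original file): in a forward scatter where
   process 0 sends its local entries 3 and 4 to process 1, which stores them in
   positions 0 and 1 of its result vector, process 0 has gen_to->indices = {3,4}
   (where the departing values are read from xv) while process 1 has
   gen_from->indices = {0,1} (where the arriving values are written into yv).
*/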
231: int VecScatterBegin_PtoP(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
232: {
233:   VecScatter_MPI_General *gen_to,*gen_from;
234:   MPI_Comm               comm = ctx->comm;
235:   Scalar                 *xv,*yv,*val,*rvalues,*svalues;
236:   MPI_Request            *rwaits,*swaits;
237:   int                    tag = ctx->tag,i,j,*indices,*rstarts,*sstarts,*rprocs,*sprocs;
238:   int                    nrecvs,nsends,iend,ierr;

241:   VecGetArray(xin,&xv);
242:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
243:   if (mode & SCATTER_REVERSE){
244:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
245:     gen_from = (VecScatter_MPI_General*)ctx->todata;
246:   } else {
247:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
248:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
249:   }
250:   rvalues  = gen_from->values;
251:   svalues  = gen_to->values;
252:   nrecvs   = gen_from->n;
253:   nsends   = gen_to->n;
254:   rwaits   = gen_from->requests;
255:   swaits   = gen_to->requests;
256:   indices  = gen_to->indices;
257:   rstarts  = gen_from->starts;
258:   sstarts  = gen_to->starts;
259:   rprocs   = gen_from->procs;
260:   sprocs   = gen_to->procs;

262:   if (!(mode & SCATTER_LOCAL)) {

264:     if (gen_to->sendfirst) {
265:       /* do sends:  */
266:       for (i=0; i<nsends; i++) {
267:         val  = svalues + sstarts[i];
268:         iend = sstarts[i+1]-sstarts[i];
269:         /* pack the message */
270:         for (j=0; j<iend; j++) {
271:           val[j] = xv[*indices++];
272:         }
273:         MPI_Isend(val,iend,MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
274:       }
275:     }
276: 
277:     /* post receives:   */
278:     for (i=0; i<nrecvs; i++) {
279:       MPI_Irecv(rvalues+rstarts[i],rstarts[i+1]-rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
280:     }

282:     if (!gen_to->sendfirst) {
283:       /* do sends:  */
284:       for (i=0; i<nsends; i++) {
285:         val  = svalues + sstarts[i];
286:         iend = sstarts[i+1]-sstarts[i];
287:         /* pack the message */
288:         for (j=0; j<iend; j++) {
289:           val[j] = xv[*indices++];
290:         }
291:         MPI_Isend(val,iend,MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
292:       }
293:     }
294:   }

296:   /* take care of local scatters */
297:   if (gen_to->local.n && addv == INSERT_VALUES) {
298:     if (yv == xv && !gen_to->local.nonmatching_computed) {
299:       VecScatterLocalOptimize_Private(&gen_to->local,&gen_from->local);
300:     }
301:     if (gen_to->local.is_copy) {
302:       PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
303:     } else if (yv != xv || !gen_to->local.nonmatching_computed) {
304:       int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
305:       int n       = gen_to->local.n;
306:       for (i=0; i<n; i++) {yv[fslots[i]] = xv[tslots[i]];}
307:     } else {
308:       /* 
309:         In this case, it is copying the values into their old locations, thus we can skip those  
310:       */
311:       int *tslots = gen_to->local.slots_nonmatching,*fslots = gen_from->local.slots_nonmatching;
312:       int n       = gen_to->local.n_nonmatching;
313:       for (i=0; i<n; i++) {yv[fslots[i]] = xv[tslots[i]];}
314:     }
315:   } else if (gen_to->local.n) {
316:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
317:     int n = gen_to->local.n;
318:     if (addv == ADD_VALUES) {
319:       for (i=0; i<n; i++) {yv[fslots[i]] += xv[tslots[i]];}
320: #if !defined(PETSC_USE_COMPLEX)
321:     } else if (addv == MAX_VALUES) {
322:       for (i=0; i<n; i++) {yv[fslots[i]] = PetscMax(yv[fslots[i]],xv[tslots[i]]);}
323: #endif
324:     } else {SETERRQ(1,"Wrong insert option");}
325:   }

327:   VecRestoreArray(xin,&xv);
328:   if (xin != yin) {VecRestoreArray(yin,&yv);}
329:   return(0);
330: }

332: /* --------------------------------------------------------------------------------------*/
333: int VecScatterEnd_PtoP(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
334: {
335:   VecScatter_MPI_General *gen_to,*gen_from;
336:   Scalar                 *rvalues,*yv,*val;
337:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices;
338:   MPI_Request            *rwaits,*swaits;
339:   MPI_Status             rstatus,*sstatus;

342:   if (mode & SCATTER_LOCAL) return(0);
343:   VecGetArray(yin,&yv);

345:   if (mode & SCATTER_REVERSE){
346:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
347:     gen_from = (VecScatter_MPI_General*)ctx->todata;
348:     sstatus  = gen_from->sstatus;
349:   } else {
350:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
351:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
352:     sstatus  = gen_to->sstatus;
353:   }
354:   rvalues  = gen_from->values;
355:   nrecvs   = gen_from->n;
356:   nsends   = gen_to->n;
357:   rwaits   = gen_from->requests;
358:   swaits   = gen_to->requests;
359:   indices  = gen_from->indices;
360:   rstarts  = gen_from->starts;

362:   /*  wait on receives */
363:   count = nrecvs;
364:   while (count) {
365:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
366:     /* unpack receives into our local space */
367:     val      = rvalues + rstarts[imdex];
368:     n        = rstarts[imdex+1]-rstarts[imdex];
369:     lindices = indices + rstarts[imdex];
370:     if (addv == INSERT_VALUES) {
371:       for (i=0; i<n; i++) {
372:         yv[lindices[i]] = *val++;
373:       }
374:     } else if (addv == ADD_VALUES) {
375:       for (i=0; i<n; i++) {
376:         yv[lindices[i]] += *val++;
377:       }
378: #if !defined(PETSC_USE_COMPLEX)
379:     } else if (addv == MAX_VALUES) {
380:       for (i=0; i<n; i++) {
381:         yv[lindices[i]] = PetscMax(yv[lindices[i]],*val); val++;
382:       }
383: #endif
384:     }  else {SETERRQ(1,"Wrong insert option");}
385:     count--;
386:   }

388:   /* wait on sends */
389:   if (nsends) {
390:     MPI_Waitall(nsends,swaits,sstatus);
391:   }
392:   VecRestoreArray(yin,&yv);
393:   return(0);
394: }
395: /* ==========================================================================================*/
396: /*
 397:     Special scatters for fixed block sizes. These provide better performance
 398:     because the local copying and the packing and unpacking are done with the
 399:     loops unrolled to the block size.

 401:     They also use MPI persistent sends and receives, which (at least in theory)
 402:     allow MPI to optimize repeated sends and receives of the same type.
403: */
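/*
      A minimal sketch of the persistent-request pattern used below (illustration
    only; buf, count, dest, tag, comm and nscatters are made-up names).  The
    message is described once and then restarted for every scatter:

       MPI_Request req;
       MPI_Status  status;
       MPI_Send_init(buf,count,MPIU_SCALAR,dest,tag,comm,&req);    describe the send once
       for (k=0; k<nscatters; k++) {
         ... fill buf ...
         MPI_Start(&req);                                          start this round's send
         MPI_Wait(&req,&status);                                   complete it
       }
       MPI_Request_free(&req);                                     release the persistent request
*/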

405: /*
406:     This is for use with the "ready-receiver" mode. In theory on some
 407:     machines it could lead to better performance; in practice we have never
 408:     seen it do so. Enabled with the -vecscatter_rr option.
409: */
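/*
      Note (added explanation, not original code): ready sends (used via
    MPI_Rsend_init() in the block-size copy routine below) are only legal if the
    matching receive is already posted when the send starts; with -vecscatter_rr,
    VecScatterPostRecvs_PtoP_X() is installed so the persistent receives are
    started before the sends begin.
*/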
410: int VecScatterPostRecvs_PtoP_X(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
411: {
412:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;

415:   MPI_Startall_irecv(gen_from->starts[gen_from->n]*gen_from->bs,gen_from->n,gen_from->requests);
416:   return(0);
417: }

419: /* --------------------------------------------------------------------------------------*/
420: /*
 421:     Special optimization to detect when the local part of the scatter is actually
 422:     a contiguous copy; the scatter routines then call PetscMemcpy() instead of an indexed loop.
423:  
424: */
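/*
      Worked example (illustrative, not part of the original file): with bs = 3,
    to_slots = {12,15,18} and from_slots = {0,3,6} each advance by exactly bs,
    so the whole local scatter collapses to the single call

       PetscMemcpy(yv + 0,xv + 12,3*3*sizeof(Scalar));

    instead of an indexed copy loop.
*/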
425: int VecScatterLocalOptimizeCopy_Private(VecScatter_Seq_General *gen_to,VecScatter_Seq_General *gen_from,int bs)
426: {
427:   int n = gen_to->n,i,*to_slots = gen_to->slots,*from_slots = gen_from->slots;
428:   int to_start,from_start;
429: 
431:   to_start   = to_slots[0];
432:   from_start = from_slots[0];

434:   for (i=1; i<n; i++) {
435:     to_start   += bs;
436:     from_start += bs;
437:     if (to_slots[i]   != to_start)   return(0);
438:     if (from_slots[i] != from_start) return(0);
439:   }
440:   gen_to->is_copy       = PETSC_TRUE;
441:   gen_to->copy_start    = to_slots[0];
442:   gen_to->copy_length   = bs*sizeof(Scalar)*n;
443:   gen_from->is_copy     = PETSC_TRUE;
444:   gen_from->copy_start  = from_slots[0];
445:   gen_from->copy_length = bs*sizeof(Scalar)*n;

447:   PetscLogInfo(0,"VecScatterLocalOptimizeCopy_Private:Local scatter is a copy, optimizing for it\n");

449:   return(0);
450: }

452: /* --------------------------------------------------------------------------------------*/

454: int VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
455: {
456:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
457:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
458:   int                    len,ny,bs = in_from->bs,ierr;

461:   out->postrecvs = in->postrecvs;
462:   out->begin     = in->begin;
463:   out->end       = in->end;
464:   out->copy      = in->copy;
465:   out->destroy   = in->destroy;
466:   out->view      = in->view;

468:   /* allocate entire send scatter context */
469:   PetscNew(VecScatter_MPI_General,&out_to);
470:   PetscMemzero(out_to,sizeof(VecScatter_MPI_General));
471:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
472:   ny                = in_to->starts[in_to->n];
473:   len               = ny*(sizeof(int) + bs*sizeof(Scalar)) + (in_to->n+1)*sizeof(int) +
474:                      (in_to->n)*(sizeof(int) + sizeof(MPI_Request));
475:   out_to->n         = in_to->n;
476:   out_to->type      = in_to->type;
477:   out_to->sendfirst = in_to->sendfirst;

479:   PetscMalloc(len,&out_to->values);
480:   PetscLogObjectMemory(out,len);
481:   out_to->requests  = (MPI_Request*)(out_to->values + bs*ny);
482:   out_to->indices   = (int*)(out_to->requests + out_to->n);
483:   out_to->starts    = (int*)(out_to->indices + ny);
484:   out_to->procs     = (int*)(out_to->starts + out_to->n + 1);
485:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(int));
486:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(int));
487:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(int));
488:   PetscMalloc(2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status),&out_to->sstatus);
489:   out_to->rstatus   =  out_to->sstatus + PetscMax(in_to->n,in_from->n) + 1;
490: 
491:   PetscLogObjectMemory(out,2*(PetscMax(in_to->n,in_from->n)+1)*sizeof(MPI_Status));
492:   out->todata       = (void*)out_to;
493:   out_to->local.n   = in_to->local.n;
494:   out_to->local.nonmatching_computed = PETSC_FALSE;
495:   out_to->local.n_nonmatching        = 0;
496:   out_to->local.slots_nonmatching    = 0;
497:   if (in_to->local.n) {
498:     PetscMalloc(in_to->local.n*sizeof(int),&out_to->local.slots);
499:     PetscMemcpy(out_to->local.slots,in_to->local.slots,in_to->local.n*sizeof(int));
500:     PetscLogObjectMemory(out,in_to->local.n*sizeof(int));
501:   } else {
502:     out_to->local.slots = 0;
503:   }

505:   /* allocate entire receive context */
506:   PetscNew(VecScatter_MPI_General,&out_from);
507:   PetscMemzero(out_from,sizeof(VecScatter_MPI_General));
508:   out_from->type      = in_from->type;
509:   PetscLogObjectMemory(out,sizeof(VecScatter_MPI_General));
510:   ny                  = in_from->starts[in_from->n];
511:   len                 = ny*(sizeof(int) + bs*sizeof(Scalar)) + (in_from->n+1)*sizeof(int) +
512:                        (in_from->n)*(sizeof(int) + sizeof(MPI_Request));
513:   out_from->n         = in_from->n;
514:   out_from->sendfirst = in_from->sendfirst;
515:   PetscMalloc(len,&out_from->values);
516:   PetscLogObjectMemory(out,len);
517:   out_from->requests  = (MPI_Request*)(out_from->values + bs*ny);
518:   out_from->indices   = (int*)(out_from->requests + out_from->n);
519:   out_from->starts    = (int*)(out_from->indices + ny);
520:   out_from->procs     = (int*)(out_from->starts + out_from->n + 1);
521:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(int));
522:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(int));
523:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(int));
524:   out->fromdata       = (void*)out_from;
525:   out_from->local.n   = in_from->local.n;
526:   out_from->local.nonmatching_computed = PETSC_FALSE;
527:   out_from->local.n_nonmatching        = 0;
528:   out_from->local.slots_nonmatching    = 0;
529:   if (in_from->local.n) {
530:     PetscMalloc(in_from->local.n*sizeof(int),&out_from->local.slots);
531:     PetscLogObjectMemory(out,in_from->local.n*sizeof(int));
532:     PetscMemcpy(out_from->local.slots,in_from->local.slots,in_from->local.n*sizeof(int));
533:   } else {
534:     out_from->local.slots = 0;
535:   }

537:   /* 
538:       set up the request arrays for use with MPI_Send_init() and MPI_Recv_init()
539:   */
540:   {
541:     MPI_Comm    comm;
542:     int         *sstarts = out_to->starts,  *rstarts = out_from->starts;
543:     int         *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
544:     int         tag,i;
545:     PetscTruth  flg;
546:     MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
547:     MPI_Request *rev_swaits,*rev_rwaits;
548:     Scalar      *Ssvalues = out_to->values, *Srvalues = out_from->values;

550:     PetscMalloc((in_to->n+1)*sizeof(MPI_Request),&out_to->rev_requests);
551:     PetscMalloc((in_from->n+1)*sizeof(MPI_Request),&out_from->rev_requests);
552:     PetscLogObjectMemory(out,(in_to->n+in_from->n+2)*sizeof(MPI_Request));

554:     rev_rwaits = out_to->rev_requests;
555:     rev_swaits = out_from->rev_requests;

557:     out_from->bs = out_to->bs = bs;
558:     tag     = out->tag;
559:     comm    = out->comm;

561:     /* Register the receives that you will use later (sends for scatter reverse) */
562:     for (i=0; i<out_from->n; i++) {
563:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
564:                            comm,rwaits+i);
565:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
566:                            comm,rev_swaits+i);
567:     }

569:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
570:     if (flg) {
571:       out->postrecvs               = VecScatterPostRecvs_PtoP_X;
572:       out_to->use_readyreceiver    = PETSC_TRUE;
573:       out_from->use_readyreceiver  = PETSC_TRUE;
574:       for (i=0; i<out_to->n; i++) {
575:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
576:                               comm,swaits+i);
577:       }
578:       PetscLogInfo(0,"VecScatterCopy_PtoP_X:Using VecScatter ready receiver mode\n");
579:     } else {
580:       out->postrecvs               = 0;
581:       out_to->use_readyreceiver    = PETSC_FALSE;
582:       out_from->use_readyreceiver  = PETSC_FALSE;
583:       flg                          = PETSC_FALSE;
584:       ierr                         = PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
585:       if (flg) {
586:         PetscLogInfo(0,"VecScatterCopy_PtoP_X:Using VecScatter Ssend mode\n");
587:       }
588:       for (i=0; i<out_to->n; i++) {
589:         if (!flg) {
590:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
591:                                comm,swaits+i);
592:         } else {
593:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
594:                                comm,swaits+i);
595:         }
596:       }
597:     }
598:     /* Register receives for scatter reverse */
599:     for (i=0; i<out_to->n; i++) {
600:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
601:                            comm,rev_rwaits+i);
602:     }
603:   }

605:   return(0);
606: }

608: /* --------------------------------------------------------------------------------------*/

610: int VecScatterBegin_PtoP_12(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
611: {
612:   VecScatter_MPI_General *gen_to,*gen_from;
613:   Scalar                 *xv,*yv,*val,*svalues;
614:   MPI_Request            *rwaits,*swaits;
615:   int                    *indices,*sstarts,iend,i,j,nrecvs,nsends,idx,ierr,len;

618:   VecGetArray(xin,&xv);
619:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

621:   if (mode & SCATTER_REVERSE) {
622:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
623:     gen_from = (VecScatter_MPI_General*)ctx->todata;
624:     rwaits   = gen_from->rev_requests;
625:     swaits   = gen_to->rev_requests;
626:   } else {
627:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
628:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
629:     rwaits   = gen_from->requests;
630:     swaits   = gen_to->requests;
631:   }
632:   svalues  = gen_to->values;
633:   nrecvs   = gen_from->n;
634:   nsends   = gen_to->n;
635:   indices  = gen_to->indices;
636:   sstarts  = gen_to->starts;

638:   if (!(mode & SCATTER_LOCAL)) {

640:     if (!gen_from->use_readyreceiver && !gen_to->sendfirst) {
641:       /* post receives since they were not posted in VecScatterPostRecvs()   */
642:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
643:     }
644:     if (ctx->packtogether) {
645:       /* this version packs all the messages together and sends, when -vecscatter_packtogether used */
646:       len  = 12*sstarts[nsends];
647:       val  = svalues;
648:       for (i=0; i<len; i += 12) {
649:         idx     = *indices++;
650:         val[0]  = xv[idx];
651:         val[1]  = xv[idx+1];
652:         val[2]  = xv[idx+2];
653:         val[3]  = xv[idx+3];
654:         val[4]  = xv[idx+4];
655:         val[5]  = xv[idx+5];
656:         val[6]  = xv[idx+6];
657:         val[7]  = xv[idx+7];
658:         val[8]  = xv[idx+8];
659:         val[9]  = xv[idx+9];
660:         val[10] = xv[idx+10];
661:         val[11] = xv[idx+11];
662:         val    += 12;
663:       }
664:       MPI_Startall_isend(len,nsends,swaits);
665:     } else {
666:       /* this version packs and sends one at a time */
667:       val  = svalues;
668:       for (i=0; i<nsends; i++) {
669:         iend = sstarts[i+1]-sstarts[i];

671:         for (j=0; j<iend; j++) {
672:           idx     = *indices++;
673:           val[0]  = xv[idx];
674:           val[1]  = xv[idx+1];
675:           val[2]  = xv[idx+2];
676:           val[3]  = xv[idx+3];
677:           val[4]  = xv[idx+4];
678:           val[5]  = xv[idx+5];
679:           val[6]  = xv[idx+6];
680:           val[7]  = xv[idx+7];
681:           val[8]  = xv[idx+8];
682:           val[9]  = xv[idx+9];
683:           val[10] = xv[idx+10];
684:           val[11] = xv[idx+11];
685:           val    += 12;
686:         }
687:         MPI_Start_isend(12*iend,swaits+i);
688:       }
689:     }

691:     if (!gen_from->use_readyreceiver && gen_to->sendfirst) {
692:       /* post receives since they were not posted in VecScatterPostRecvs()   */
693:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
694:     }
695:   }

697:   /* take care of local scatters */
698:   if (gen_to->local.n) {
699:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
700:     int n       = gen_to->local.n,il,ir;
701:     if (addv == INSERT_VALUES) {
702:       if (gen_to->local.is_copy) {
703:         PetscMemcpy(yv+gen_from->local.copy_start,xv+gen_to->local.copy_start,gen_to->local.copy_length);
704:       } else {
705:         for (i=0; i<n; i++) {
706:           il = fslots[i]; ir = tslots[i];
707:           yv[il]    = xv[ir];
708:           yv[il+1]  = xv[ir+1];
709:           yv[il+2]  = xv[ir+2];
710:           yv[il+3]  = xv[ir+3];
711:           yv[il+4]  = xv[ir+4];
712:           yv[il+5]  = xv[ir+5];
713:           yv[il+6]  = xv[ir+6];
714:           yv[il+7]  = xv[ir+7];
715:           yv[il+8]  = xv[ir+8];
716:           yv[il+9]  = xv[ir+9];
717:           yv[il+10] = xv[ir+10];
718:           yv[il+11] = xv[ir+11];
719:         }
720:       }
721:     }  else if (addv == ADD_VALUES) {
722:       for (i=0; i<n; i++) {
723:         il = fslots[i]; ir = tslots[i];
724:         yv[il]    += xv[ir];
725:         yv[il+1]  += xv[ir+1];
726:         yv[il+2]  += xv[ir+2];
727:         yv[il+3]  += xv[ir+3];
728:         yv[il+4]  += xv[ir+4];
729:         yv[il+5]  += xv[ir+5];
730:         yv[il+6]  += xv[ir+6];
731:         yv[il+7]  += xv[ir+7];
732:         yv[il+8]  += xv[ir+8];
733:         yv[il+9]  += xv[ir+9];
734:         yv[il+10] += xv[ir+10];
735:         yv[il+11] += xv[ir+11];
736:       }
737: #if !defined(PETSC_USE_COMPLEX)
738:     }  else if (addv == MAX_VALUES) {
739:       for (i=0; i<n; i++) {
740:         il = fslots[i]; ir = tslots[i];
741:         yv[il]    = PetscMax(yv[il],xv[ir]);
742:         yv[il+1]  = PetscMax(yv[il+1],xv[ir+1]);
743:         yv[il+2]  = PetscMax(yv[il+2],xv[ir+2]);
744:         yv[il+3]  = PetscMax(yv[il+3],xv[ir+3]);
745:         yv[il+4]  = PetscMax(yv[il+4],xv[ir+4]);
746:         yv[il+5]  = PetscMax(yv[il+5],xv[ir+5]);
747:         yv[il+6]  = PetscMax(yv[il+6],xv[ir+6]);
748:         yv[il+7]  = PetscMax(yv[il+7],xv[ir+7]);
749:         yv[il+8]  = PetscMax(yv[il+8],xv[ir+8]);
750:         yv[il+9]  = PetscMax(yv[il+9],xv[ir+9]);
751:         yv[il+10] = PetscMax(yv[il+10],xv[ir+10]);
752:         yv[il+11] = PetscMax(yv[il+11],xv[ir+11]);
753:       }
754: #endif
755:     } else {SETERRQ(1,"Wrong insert option");}
756:   }
757:   VecRestoreArray(xin,&xv);
758:   if (xin != yin) {VecRestoreArray(yin,&yv);}
759:   return(0);
760: }

762: /* --------------------------------------------------------------------------------------*/

764: int VecScatterEnd_PtoP_12(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
765: {
766:   VecScatter_MPI_General *gen_to,*gen_from;
767:   Scalar                 *rvalues,*yv,*val;
768:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
769:   MPI_Request            *rwaits,*swaits;
770:   MPI_Status             *rstatus,*sstatus;

773:   if (mode & SCATTER_LOCAL) return(0);
774:   VecGetArray(yin,&yv);

776:   if (mode & SCATTER_REVERSE) {
777:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
778:     gen_from = (VecScatter_MPI_General*)ctx->todata;
779:     rwaits   = gen_from->rev_requests;
780:     swaits   = gen_to->rev_requests;
781:     sstatus  = gen_from->sstatus;
782:     rstatus  = gen_from->rstatus;
783:   } else {
784:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
785:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
786:     rwaits   = gen_from->requests;
787:     swaits   = gen_to->requests;
788:     sstatus  = gen_to->sstatus;
789:     rstatus  = gen_to->rstatus;
790:   }
791:   rvalues  = gen_from->values;
792:   nrecvs   = gen_from->n;
793:   nsends   = gen_to->n;
794:   indices  = gen_from->indices;
795:   rstarts  = gen_from->starts;

797:   /*  wait on receives */
798:   count = nrecvs;
799:   if (ctx->packtogether) { /* receive all messages, then unpack all, when -vecscatter_packtogether used */
800:     ierr     = MPI_Waitall(nrecvs,rwaits,rstatus);
801:     n        = rstarts[count];
802:     val      = rvalues;
803:     lindices = indices;
804:     if (addv == INSERT_VALUES) {
805:       for (i=0; i<n; i++) {
806:         idx        = lindices[i];
807:         yv[idx]    = val[0];
808:         yv[idx+1]  = val[1];
809:         yv[idx+2]  = val[2];
810:         yv[idx+3]  = val[3];
811:         yv[idx+4]  = val[4];
812:         yv[idx+5]  = val[5];
813:         yv[idx+6]  = val[6];
814:         yv[idx+7]  = val[7];
815:         yv[idx+8]  = val[8];
816:         yv[idx+9]  = val[9];
817:         yv[idx+10] = val[10];
818:         yv[idx+11] = val[11];
819:         val       += 12;
820:       }
821:     } else if (addv == ADD_VALUES) {
822:       for (i=0; i<n; i++) {
823:         idx         = lindices[i];
824:         yv[idx]    += val[0];
825:         yv[idx+1]  += val[1];
826:         yv[idx+2]  += val[2];
827:         yv[idx+3]  += val[3];
828:         yv[idx+4]  += val[4];
829:         yv[idx+5]  += val[5];
830:         yv[idx+6]  += val[6];
831:         yv[idx+7]  += val[7];
832:         yv[idx+8]  += val[8];
833:         yv[idx+9]  += val[9];
834:         yv[idx+10] += val[10];
835:         yv[idx+11] += val[11];
836:         val        += 12;
837:       }
838: #if !defined(PETSC_USE_COMPLEX)
839:     } else if (addv == MAX_VALUES) {
840:       for (i=0; i<n; i++) {
841:         idx        = lindices[i];
842:         yv[idx]    = PetscMax(yv[idx],val[0]);
843:         yv[idx+1]  = PetscMax(yv[idx+1],val[1]);
844:         yv[idx+2]  = PetscMax(yv[idx+2],val[2]);
845:         yv[idx+3]  = PetscMax(yv[idx+3],val[3]);
846:         yv[idx+4]  = PetscMax(yv[idx+4],val[4]);
847:         yv[idx+5]  = PetscMax(yv[idx+5],val[5]);
848:         yv[idx+6]  = PetscMax(yv[idx+6],val[6]);
849:         yv[idx+7]  = PetscMax(yv[idx+7],val[7]);
850:         yv[idx+8]  = PetscMax(yv[idx+8],val[8]);
851:         yv[idx+9]  = PetscMax(yv[idx+9],val[9]);
852:         yv[idx+10] = PetscMax(yv[idx+10],val[10]);
853:         yv[idx+11] = PetscMax(yv[idx+11],val[11]);
854:         val       += 12;
855:       }
856: #endif
857:     }  else {SETERRQ(1,"Wrong insert option");}
858:   } else { /* unpack each message as it arrives, default version */
859:     while (count) {
860:       MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus[0]);
861:       /* unpack receives into our local space */
862:       val      = rvalues + 12*rstarts[imdex];
863:       lindices = indices + rstarts[imdex];
864:       n        = rstarts[imdex+1] - rstarts[imdex];
865:       if (addv == INSERT_VALUES) {
866:         for (i=0; i<n; i++) {
867:           idx        = lindices[i];
868:           yv[idx]    = val[0];
869:           yv[idx+1]  = val[1];
870:           yv[idx+2]  = val[2];
871:           yv[idx+3]  = val[3];
872:           yv[idx+4]  = val[4];
873:           yv[idx+5]  = val[5];
874:           yv[idx+6]  = val[6];
875:           yv[idx+7]  = val[7];
876:           yv[idx+8]  = val[8];
877:           yv[idx+9]  = val[9];
878:           yv[idx+10] = val[10];
879:           yv[idx+11] = val[11];
880:           val       += 12;
881:       }
882:     } else if (addv == ADD_VALUES) {
883:       for (i=0; i<n; i++) {
884:         idx        = lindices[i];
885:         yv[idx]    += val[0];
886:         yv[idx+1]  += val[1];
887:         yv[idx+2]  += val[2];
888:         yv[idx+3]  += val[3];
889:         yv[idx+4]  += val[4];
890:         yv[idx+5]  += val[5];
891:         yv[idx+6]  += val[6];
892:         yv[idx+7]  += val[7];
893:         yv[idx+8]  += val[8];
894:         yv[idx+9]  += val[9];
895:         yv[idx+10] += val[10];
896:         yv[idx+11] += val[11];
897:         val        += 12;
898:       }
899: #if !defined(PETSC_USE_COMPLEX)
900:     } else if (addv == MAX_VALUES) {
901:       for (i=0; i<n; i++) {
902:         idx        = lindices[i];
903:         yv[idx]    = PetscMax(yv[idx],val[0]);
904:         yv[idx+1]  = PetscMax(yv[idx+1],val[1]);
905:         yv[idx+2]  = PetscMax(yv[idx+2],val[2]);
906:         yv[idx+3]  = PetscMax(yv[idx+3],val[3]);
907:         yv[idx+4]  = PetscMax(yv[idx+4],val[4]);
908:         yv[idx+5]  = PetscMax(yv[idx+5],val[5]);
909:         yv[idx+6]  = PetscMax(yv[idx+6],val[6]);
910:         yv[idx+7]  = PetscMax(yv[idx+7],val[7]);
911:         yv[idx+8]  = PetscMax(yv[idx+8],val[8]);
912:         yv[idx+9]  = PetscMax(yv[idx+9],val[9]);
913:         yv[idx+10] = PetscMax(yv[idx+10],val[10]);
914:         yv[idx+11] = PetscMax(yv[idx+11],val[11]);
915:         val        += 12;
916:       }
917: #endif
918:     }  else {SETERRQ(1,"Wrong insert option");}
919:     count--;
920:     }
921:   }
922:   /* wait on sends */
923:   if (nsends) {
924:     MPI_Waitall(nsends,swaits,sstatus);
925:   }
926:   VecRestoreArray(yin,&yv);
927:   return(0);
928: }

930: /* --------------------------------------------------------------------------------------*/

932: int VecScatterBegin_PtoP_5(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
933: {
934:   VecScatter_MPI_General *gen_to,*gen_from;
935:   Scalar                 *xv,*yv,*val,*svalues;
936:   MPI_Request            *rwaits,*swaits;
937:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

940:   VecGetArray(xin,&xv);
941:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
942:   if (mode & SCATTER_REVERSE) {
943:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
944:     gen_from = (VecScatter_MPI_General*)ctx->todata;
945:     rwaits   = gen_from->rev_requests;
946:     swaits   = gen_to->rev_requests;
947:   } else {
948:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
949:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
950:     rwaits   = gen_from->requests;
951:     swaits   = gen_to->requests;
952:   }
953:   svalues  = gen_to->values;
954:   nrecvs   = gen_from->n;
955:   nsends   = gen_to->n;
956:   indices  = gen_to->indices;
957:   sstarts  = gen_to->starts;

959:   if (!(mode & SCATTER_LOCAL)) {

961:     if (gen_to->sendfirst) {
962:       /* this version packs and sends one at a time */
963:       val  = svalues;
964:       for (i=0; i<nsends; i++) {
965:         iend = sstarts[i+1]-sstarts[i];

967:         for (j=0; j<iend; j++) {
968:           idx     = *indices++;
969:           val[0] = xv[idx];
970:           val[1] = xv[idx+1];
971:           val[2] = xv[idx+2];
972:           val[3] = xv[idx+3];
973:           val[4] = xv[idx+4];
974:           val    += 5;
975:         }
976:         MPI_Start_isend(5*iend,swaits+i);
977:       }
978:     }

980:     if (!gen_from->use_readyreceiver) {
981:       /* post receives since they were not posted in VecScatterPostRecvs()   */
982:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
983:     }

985:     if (!gen_to->sendfirst) {
986:       /* this version packs all the messages together and sends */
987:       /*
988:       len  = 5*sstarts[nsends];
989:       val  = svalues;
990:       for (i=0; i<len; i += 5) {
991:         idx     = *indices++;
992:         val[0] = xv[idx];
993:         val[1] = xv[idx+1];
994:         val[2] = xv[idx+2];
995:         val[3] = xv[idx+3];
996:         val[4] = xv[idx+4];
997:         val      += 5;
998:       }
999:       MPI_Startall_isend(len,nsends,swaits);
1000:       */

1002:       /* this version packs and sends one at a time */
1003:       val  = svalues;
1004:       for (i=0; i<nsends; i++) {
1005:         iend = sstarts[i+1]-sstarts[i];

1007:         for (j=0; j<iend; j++) {
1008:           idx     = *indices++;
1009:           val[0] = xv[idx];
1010:           val[1] = xv[idx+1];
1011:           val[2] = xv[idx+2];
1012:           val[3] = xv[idx+3];
1013:           val[4] = xv[idx+4];
1014:           val    += 5;
1015:         }
1016:         MPI_Start_isend(5*iend,swaits+i);
1017:       }
1018:     }
1019:   }

1021:   /* take care of local scatters */
1022:   if (gen_to->local.n) {
1023:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1024:     int n       = gen_to->local.n,il,ir;
1025:     if (addv == INSERT_VALUES) {
1026:       if (gen_to->local.is_copy) {
1027:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1028:       } else {
1029:         for (i=0; i<n; i++) {
1030:           il = fslots[i]; ir = tslots[i];
1031:           yv[il]   = xv[ir];
1032:           yv[il+1] = xv[ir+1];
1033:           yv[il+2] = xv[ir+2];
1034:           yv[il+3] = xv[ir+3];
1035:           yv[il+4] = xv[ir+4];
1036:         }
1037:       }
1038:     }  else if (addv == ADD_VALUES) {
1039:       for (i=0; i<n; i++) {
1040:         il = fslots[i]; ir = tslots[i];
1041:         yv[il]   += xv[ir];
1042:         yv[il+1] += xv[ir+1];
1043:         yv[il+2] += xv[ir+2];
1044:         yv[il+3] += xv[ir+3];
1045:         yv[il+4] += xv[ir+4];
1046:       }
1047: #if !defined(PETSC_USE_COMPLEX)
1048:     }  else if (addv == MAX_VALUES) {
1049:       for (i=0; i<n; i++) {
1050:         il = fslots[i]; ir = tslots[i];
1051:         yv[il]   = PetscMax(yv[il],xv[ir]);
1052:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1053:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1054:         yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1055:         yv[il+4] = PetscMax(yv[il+4],xv[ir+4]);
1056:       }
1057: #endif
1058:     }  else {SETERRQ(1,"Wrong insert option");}
1059:   }
1060:   VecRestoreArray(xin,&xv);
1061:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1062:   return(0);
1063: }

1065: /* --------------------------------------------------------------------------------------*/

1067: int VecScatterEnd_PtoP_5(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1068: {
1069:   VecScatter_MPI_General *gen_to,*gen_from;
1070:   Scalar                 *rvalues,*yv,*val;
1071:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1072:   MPI_Request            *rwaits,*swaits;
1073:   MPI_Status             rstatus,*sstatus;

1076:   if (mode & SCATTER_LOCAL) return(0);
1077:   VecGetArray(yin,&yv);

1079:   if (mode & SCATTER_REVERSE) {
1080:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1081:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1082:     rwaits   = gen_from->rev_requests;
1083:     swaits   = gen_to->rev_requests;
1084:     sstatus  = gen_from->sstatus;
1085:   } else {
1086:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1087:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1088:     rwaits   = gen_from->requests;
1089:     swaits   = gen_to->requests;
1090:     sstatus  = gen_to->sstatus;
1091:   }
1092:   rvalues  = gen_from->values;
1093:   nrecvs   = gen_from->n;
1094:   nsends   = gen_to->n;
1095:   indices  = gen_from->indices;
1096:   rstarts  = gen_from->starts;

1098:   /*  wait on receives */
1099:   count = nrecvs;
1100:   while (count) {
1101:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1102:     /* unpack receives into our local space */
1103:     val      = rvalues + 5*rstarts[imdex];
1104:     lindices = indices + rstarts[imdex];
1105:     n        = rstarts[imdex+1] - rstarts[imdex];
1106:     if (addv == INSERT_VALUES) {
1107:       for (i=0; i<n; i++) {
1108:         idx       = lindices[i];
1109:         yv[idx]   = val[0];
1110:         yv[idx+1] = val[1];
1111:         yv[idx+2] = val[2];
1112:         yv[idx+3] = val[3];
1113:         yv[idx+4] = val[4];
1114:         val      += 5;
1115:       }
1116:     } else if (addv == ADD_VALUES) {
1117:       for (i=0; i<n; i++) {
1118:         idx       = lindices[i];
1119:         yv[idx]   += val[0];
1120:         yv[idx+1] += val[1];
1121:         yv[idx+2] += val[2];
1122:         yv[idx+3] += val[3];
1123:         yv[idx+4] += val[4];
1124:         val       += 5;
1125:       }
1126: #if !defined(PETSC_USE_COMPLEX)
1127:     } else if (addv == MAX_VALUES) {
1128:       for (i=0; i<n; i++) {
1129:         idx       = lindices[i];
1130:         yv[idx]   = PetscMax(yv[idx],val[0]);
1131:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1132:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1133:         yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1134:         yv[idx+4] = PetscMax(yv[idx+4],val[4]);
1135:         val       += 5;
1136:       }
1137: #endif
1138:     }  else {SETERRQ(1,"Wrong insert option");}
1139:     count--;
1140:   }
1141:   /* wait on sends */
1142:   if (nsends) {
1143:     MPI_Waitall(nsends,swaits,sstatus);
1144:   }
1145:   VecRestoreArray(yin,&yv);
1146:   return(0);
1147: }

1149: /* --------------------------------------------------------------------------------------*/

1151: int VecScatterBegin_PtoP_4(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1152: {
1153:   VecScatter_MPI_General *gen_to,*gen_from;
1154:   Scalar                 *xv,*yv,*val,*svalues;
1155:   MPI_Request            *rwaits,*swaits;
1156:   int                    *indices,*sstarts,iend,i,j,nrecvs,nsends,idx,ierr,len;

1159:   VecGetArray(xin,&xv);
1160:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

1162:   if (mode & SCATTER_REVERSE) {
1163:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1164:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1165:     rwaits   = gen_from->rev_requests;
1166:     swaits   = gen_to->rev_requests;
1167:   } else {
1168:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1169:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1170:     rwaits   = gen_from->requests;
1171:     swaits   = gen_to->requests;
1172:   }
1173:   svalues  = gen_to->values;
1174:   nrecvs   = gen_from->n;
1175:   nsends   = gen_to->n;
1176:   indices  = gen_to->indices;
1177:   sstarts  = gen_to->starts;

1179:   if (!(mode & SCATTER_LOCAL)) {

1181:     if (!gen_from->use_readyreceiver && !gen_to->sendfirst) {
1182:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1183:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1184:     }

1186:     if (ctx->packtogether) {
1187:       /* this version packs all the messages together and sends, when -vecscatter_packtogether used */
1188:       len  = 4*sstarts[nsends];
1189:       val  = svalues;
1190:       for (i=0; i<len; i += 4) {
1191:         idx    = *indices++;
1192:         val[0] = xv[idx];
1193:         val[1] = xv[idx+1];
1194:         val[2] = xv[idx+2];
1195:         val[3] = xv[idx+3];
1196:         val    += 4;
1197:       }
1198:       MPI_Startall_isend(len,nsends,swaits);
1199:     } else {
1200:       /* this version packs and sends one at a time, default */
1201:       val  = svalues;
1202:       for (i=0; i<nsends; i++) {
1203:         iend = sstarts[i+1]-sstarts[i];

1205:         for (j=0; j<iend; j++) {
1206:           idx     = *indices++;
1207:           val[0] = xv[idx];
1208:           val[1] = xv[idx+1];
1209:           val[2] = xv[idx+2];
1210:           val[3] = xv[idx+3];
1211:           val    += 4;
1212:         }
1213:         MPI_Start_isend(4*iend,swaits+i);
1214:       }
1215:     }

1217:     if (!gen_from->use_readyreceiver && gen_to->sendfirst) {
1218:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1219:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1220:     }
1221:   }

1223:   /* take care of local scatters */
1224:   if (gen_to->local.n) {
1225:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1226:     int n       = gen_to->local.n,il,ir;
1227:     if (addv == INSERT_VALUES) {
1228:       if (gen_to->local.is_copy) {
1229:         PetscMemcpy(yv+gen_from->local.copy_start,xv+gen_to->local.copy_start,gen_to->local.copy_length);
1230:       } else {
1231:         for (i=0; i<n; i++) {
1232:           il = fslots[i]; ir = tslots[i];
1233:           yv[il]   = xv[ir];
1234:           yv[il+1] = xv[ir+1];
1235:           yv[il+2] = xv[ir+2];
1236:           yv[il+3] = xv[ir+3];
1237:         }
1238:       }
1239:     }  else if (addv == ADD_VALUES) {
1240:       for (i=0; i<n; i++) {
1241:         il = fslots[i]; ir = tslots[i];
1242:         yv[il]   += xv[ir];
1243:         yv[il+1] += xv[ir+1];
1244:         yv[il+2] += xv[ir+2];
1245:         yv[il+3] += xv[ir+3];
1246:       }
1247: #if !defined(PETSC_USE_COMPLEX)
1248:     }  else if (addv == MAX_VALUES) {
1249:       for (i=0; i<n; i++) {
1250:         il = fslots[i]; ir = tslots[i];
1251:         yv[il]   = PetscMax(yv[il],xv[ir]);
1252:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1253:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1254:         yv[il+3] = PetscMax(yv[il+3],xv[ir+3]);
1255:       }
1256: #endif
1257:     }  else {SETERRQ(1,"Wrong insert option");}
1258:   }
1259:   VecRestoreArray(xin,&xv);
1260:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1261:   return(0);
1262: }

1264: /* --------------------------------------------------------------------------------------*/

1266: int VecScatterEnd_PtoP_4(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1267: {
1268:   VecScatter_MPI_General *gen_to,*gen_from;
1269:   Scalar                 *rvalues,*yv,*val;
1270:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1271:   MPI_Request            *rwaits,*swaits;
1272:   MPI_Status             *rstatus,*sstatus;

1275:   if (mode & SCATTER_LOCAL) return(0);
1276:   VecGetArray(yin,&yv);

1278:   if (mode & SCATTER_REVERSE) {
1279:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1280:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1281:     rwaits   = gen_from->rev_requests;
1282:     swaits   = gen_to->rev_requests;
1283:     sstatus  = gen_from->sstatus;
1284:     rstatus  = gen_from->rstatus;
1285:   } else {
1286:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1287:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1288:     rwaits   = gen_from->requests;
1289:     swaits   = gen_to->requests;
1290:     sstatus  = gen_to->sstatus;
1291:     rstatus  = gen_to->rstatus;
1292:   }
1293:   rvalues  = gen_from->values;
1294:   nrecvs   = gen_from->n;
1295:   nsends   = gen_to->n;
1296:   indices  = gen_from->indices;
1297:   rstarts  = gen_from->starts;

1299:   /*  wait on receives */
1300:   count = nrecvs;
1301:   if (ctx->packtogether) { /* receive all messages, then unpack all, when -vecscatter_packtogether used */
1302:     ierr     = MPI_Waitall(nrecvs,rwaits,rstatus);
1303:     n        = rstarts[count];
1304:     val      = rvalues;
1305:     lindices = indices;
1306:     if (addv == INSERT_VALUES) {
1307:       for (i=0; i<n; i++) {
1308:         idx       = lindices[i];
1309:         yv[idx]   = val[0];
1310:         yv[idx+1] = val[1];
1311:         yv[idx+2] = val[2];
1312:         yv[idx+3] = val[3];
1313:         val      += 4;
1314:       }
1315:     } else if (addv == ADD_VALUES) {
1316:       for (i=0; i<n; i++) {
1317:         idx       = lindices[i];
1318:         yv[idx]   += val[0];
1319:         yv[idx+1] += val[1];
1320:         yv[idx+2] += val[2];
1321:         yv[idx+3] += val[3];
1322:         val       += 4;
1323:       }
1324: #if !defined(PETSC_USE_COMPLEX)
1325:     } else if (addv == MAX_VALUES) {
1326:       for (i=0; i<n; i++) {
1327:         idx       = lindices[i];
1328:         yv[idx]   = PetscMax(yv[idx],val[0]);
1329:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1330:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1331:         yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1332:         val       += 4;
1333:       }
1334: #endif
1335:     }  else {SETERRQ(1,"Wrong insert option");}
1336:   } else { /* unpack each message as it arrives, default version */
1337:     while (count) {
1338:       MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus[0]);
1339:       /* unpack receives into our local space */
1340:       val      = rvalues + 4*rstarts[imdex];
1341:       lindices = indices + rstarts[imdex];
1342:       n        = rstarts[imdex+1] - rstarts[imdex];
1343:       if (addv == INSERT_VALUES) {
1344:         for (i=0; i<n; i++) {
1345:           idx       = lindices[i];
1346:           yv[idx]   = val[0];
1347:           yv[idx+1] = val[1];
1348:           yv[idx+2] = val[2];
1349:           yv[idx+3] = val[3];
1350:           val      += 4;
1351:         }
1352:       } else if (addv == ADD_VALUES) {
1353:         for (i=0; i<n; i++) {
1354:           idx       = lindices[i];
1355:           yv[idx]   += val[0];
1356:           yv[idx+1] += val[1];
1357:           yv[idx+2] += val[2];
1358:           yv[idx+3] += val[3];
1359:           val       += 4;
1360:         }
1361: #if !defined(PETSC_USE_COMPLEX)
1362:       } else if (addv == MAX_VALUES) {
1363:         for (i=0; i<n; i++) {
1364:           idx       = lindices[i];
1365:           yv[idx]   = PetscMax(yv[idx],val[0]);
1366:           yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1367:           yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1368:           yv[idx+3] = PetscMax(yv[idx+3],val[3]);
1369:           val       += 4;
1370:         }
1371: #endif
1372:       }  else {SETERRQ(1,"Wrong insert option");}
1373:       count--;
1374:     }
1375:   }

1377:   /* wait on sends */
1378:   if (nsends) {
1379:     MPI_Waitall(nsends,swaits,sstatus);
1380:   }
1381:   VecRestoreArray(yin,&yv);
1382:   return(0);
1383: }

1385: /* --------------------------------------------------------------------------------------*/

1387: int VecScatterBegin_PtoP_3(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1388: {
1389:   VecScatter_MPI_General *gen_to,*gen_from;
1390:   Scalar                 *xv,*yv,*val,*svalues;
1391:   MPI_Request            *rwaits,*swaits;
1392:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

1395:   VecGetArray(xin,&xv);
1396:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}

1398:   if (mode & SCATTER_REVERSE) {
1399:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1400:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1401:     rwaits   = gen_from->rev_requests;
1402:     swaits   = gen_to->rev_requests;
1403:   } else {
1404:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1405:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1406:     rwaits   = gen_from->requests;
1407:     swaits   = gen_to->requests;
1408:   }
1409:   svalues  = gen_to->values;
1410:   nrecvs   = gen_from->n;
1411:   nsends   = gen_to->n;
1412:   indices  = gen_to->indices;
1413:   sstarts  = gen_to->starts;

1415:   if (!(mode & SCATTER_LOCAL)) {

1417:     if (gen_to->sendfirst) {
1418:       /* this version packs and sends one at a time */
1419:       val  = svalues;
1420:       for (i=0; i<nsends; i++) {
1421:         iend = sstarts[i+1]-sstarts[i];

1423:         for (j=0; j<iend; j++) {
1424:           idx     = *indices++;
1425:           val[0] = xv[idx];
1426:           val[1] = xv[idx+1];
1427:           val[2] = xv[idx+2];
1428:           val    += 3;
1429:         }
1430:         MPI_Start_isend(3*iend,swaits+i);
1431:       }
1432:     }

1434:     if (!gen_from->use_readyreceiver) {
1435:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1436:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1437:     }

1439:     if (!gen_to->sendfirst) {
1440:       /* this version packs all the messages together and sends */
1441:       /*
1442:       len  = 3*sstarts[nsends];
1443:       val  = svalues;
1444:       for (i=0; i<len; i += 3) {
1445:         idx     = *indices++;
1446:         val[0] = xv[idx];
1447:         val[1] = xv[idx+1];
1448:         val[2] = xv[idx+2];
1449:         val      += 3;
1450:       }
1451:       MPI_Startall_isend(len,nsends,swaits);
1452:       */

1454:       /* this version packs and sends one at a time */
1455:       val  = svalues;
1456:       for (i=0; i<nsends; i++) {
1457:         iend = sstarts[i+1]-sstarts[i];

1459:         for (j=0; j<iend; j++) {
1460:           idx     = *indices++;
1461:           val[0] = xv[idx];
1462:           val[1] = xv[idx+1];
1463:           val[2] = xv[idx+2];
1464:           val    += 3;
1465:         }
1466:         MPI_Start_isend(3*iend,swaits+i);
1467:       }
1468:     }
1469:   }

1471:   /* take care of local scatters */
1472:   if (gen_to->local.n) {
1473:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1474:     int n       = gen_to->local.n,il,ir;
1475:     if (addv == INSERT_VALUES) {
1476:       if (gen_to->local.is_copy) {
1477:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1478:       } else {
1479:         for (i=0; i<n; i++) {
1480:           il = fslots[i]; ir = tslots[i];
1481:           yv[il]   = xv[ir];
1482:           yv[il+1] = xv[ir+1];
1483:           yv[il+2] = xv[ir+2];
1485:         }
1486:       }
1487:     }  else if (addv == ADD_VALUES) {
1488:       for (i=0; i<n; i++) {
1489:         il = fslots[i]; ir = tslots[i];
1490:         yv[il]   += xv[ir];
1491:         yv[il+1] += xv[ir+1];
1492:         yv[il+2] += xv[ir+2];
1494:       }
1495: #if !defined(PETSC_USE_COMPLEX)
1496:     }  else if (addv == MAX_VALUES) {
1497:       for (i=0; i<n; i++) {
1498:         il = fslots[i]; ir = tslots[i];
1499:         yv[il]   = PetscMax(yv[il],xv[ir]);
1500:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1501:         yv[il+2] = PetscMax(yv[il+2],xv[ir+2]);
1503:       }
1504: #endif
1505:     }  else {SETERRQ(1,"Wrong insert option");}
1506:   }
1507:   VecRestoreArray(xin,&xv);
1508:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1509:   return(0);
1510: }

1512: /* --------------------------------------------------------------------------------------*/

1514: int VecScatterEnd_PtoP_3(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1515: {
1516:   VecScatter_MPI_General *gen_to,*gen_from;
1517:   Scalar                 *rvalues,*yv,*val;
1518:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1519:   MPI_Request            *rwaits,*swaits;
1520:   MPI_Status             rstatus,*sstatus;

1523:   if (mode & SCATTER_LOCAL) return(0);
1524:   VecGetArray(yin,&yv);

1526:   if (mode & SCATTER_REVERSE) {
1527:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1528:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1529:     rwaits   = gen_from->rev_requests;
1530:     swaits   = gen_to->rev_requests;
1531:     sstatus  = gen_from->sstatus;
1532:   } else {
1533:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1534:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1535:     rwaits   = gen_from->requests;
1536:     swaits   = gen_to->requests;
1537:     sstatus  = gen_to->sstatus;
1538:   }
1539:   rvalues  = gen_from->values;
1540:   nrecvs   = gen_from->n;
1541:   nsends   = gen_to->n;
1542:   indices  = gen_from->indices;
1543:   rstarts  = gen_from->starts;
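        /*
           Each message is unpacked as it arrives (MPI_Waitany): for every received index
           the 3 values of the block are inserted, added, or maxed into yv at 3 consecutive
           locations starting at that index, depending on addv.
        */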

1545:   /*  wait on receives */
1546:   count = nrecvs;
1547:   while (count) {
1548:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1549:     /* unpack receives into our local space */
1550:     val      = rvalues + 3*rstarts[imdex];
1551:     lindices = indices + rstarts[imdex];
1552:     n        = rstarts[imdex+1] - rstarts[imdex];
1553:     if (addv == INSERT_VALUES) {
1554:       for (i=0; i<n; i++) {
1555:         idx       = lindices[i];
1556:         yv[idx]   = val[0];
1557:         yv[idx+1] = val[1];
1558:         yv[idx+2] = val[2];
1559:         val      += 3;
1560:       }
1561:     } else if (addv == ADD_VALUES) {
1562:       for (i=0; i<n; i++) {
1563:         idx       = lindices[i];
1564:         yv[idx]   += val[0];
1565:         yv[idx+1] += val[1];
1566:         yv[idx+2] += val[2];
1567:         val       += 3;
1568:       }
1569: #if !defined(PETSC_USE_COMPLEX)
1570:     } else if (addv == MAX_VALUES) {
1571:       for (i=0; i<n; i++) {
1572:         idx       = lindices[i];
1573:         yv[idx]   = PetscMax(yv[idx],val[0]);
1574:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1575:         yv[idx+2] = PetscMax(yv[idx+2],val[2]);
1576:         val       += 3;
1577:       }
1578: #endif
1579:     }  else {SETERRQ(1,"Wrong insert option");}
1580:     count--;
1581:   }
1582:   /* wait on sends */
1583:   if (nsends) {
1584:     MPI_Waitall(nsends,swaits,sstatus);
1585:   }
1586:   VecRestoreArray(yin,&yv);
1587:   return(0);
1588: }

1590: /* --------------------------------------------------------------------------------------*/

1592: int VecScatterBegin_PtoP_2(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1593: {
1594:   VecScatter_MPI_General *gen_to,*gen_from;
1595:   Scalar                 *xv,*yv,*val,*svalues;
1596:   MPI_Request            *rwaits,*swaits;
1597:   int                    ierr,i,*indices,*sstarts,iend,j,nrecvs,nsends,idx;

1600:   VecGetArray(xin,&xv);
1601:   if (xin != yin) {VecGetArray(yin,&yv);} else {yv = xv;}
1602:   if (mode & SCATTER_REVERSE) {
1603:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1604:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1605:     rwaits   = gen_from->rev_requests;
1606:     swaits   = gen_to->rev_requests;
1607:   } else {
1608:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1609:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1610:     rwaits   = gen_from->requests;
1611:     swaits   = gen_to->requests;
1612:   }
1613:   svalues  = gen_to->values;
1614:   nrecvs   = gen_from->n;
1615:   nsends   = gen_to->n;
1616:   indices  = gen_to->indices;
1617:   sstarts  = gen_to->starts;

1619:   if (!(mode & SCATTER_LOCAL)) {

1621:     if (gen_to->sendfirst) {
1622:       /* this version packs and sends one at a time */
1623:       val  = svalues;
1624:       for (i=0; i<nsends; i++) {
1625:         iend = sstarts[i+1]-sstarts[i];

1627:         for (j=0; j<iend; j++) {
1628:           idx     = *indices++;
1629:           val[0] = xv[idx];
1630:           val[1] = xv[idx+1];
1631:           val    += 2;
1632:         }
1633:         MPI_Start_isend(2*iend,swaits+i);
1634:       }
1635:     }

1637:     if (!gen_from->use_readyreceiver) {
1638:       /* post receives since they were not posted in VecScatterPostRecvs()   */
1639:       MPI_Startall_irecv(gen_from->starts[nrecvs]*gen_from->bs,nrecvs,rwaits);
1640:     }

1642:     if (!gen_to->sendfirst) {
1643:       /* this version packs all the messages together and sends */
1644:       /*
1645:       len  = 2*sstarts[nsends];
1646:       val  = svalues;
1647:       for (i=0; i<len; i += 2) {
1648:         idx     = *indices++;
1649:         val[0] = xv[idx];
1650:         val[1] = xv[idx+1];
1651:         val      += 2;
1652:       }
1653:       MPI_Startall_isend(len,nsends,swaits);
1654:       */

1656:       /* this version packs and sends one at a time */
1657:       val  = svalues;
1658:       for (i=0; i<nsends; i++) {
1659:         iend = sstarts[i+1]-sstarts[i];

1661:         for (j=0; j<iend; j++) {
1662:           idx     = *indices++;
1663:           val[0] = xv[idx];
1664:           val[1] = xv[idx+1];
1665:           val    += 2;
1666:         }
1667:         MPI_Start_isend(2*iend,swaits+i);
1668:       }
1669:     }
1670:   }

1672:   /* take care of local scatters */
1673:   if (gen_to->local.n) {
1674:     int *tslots = gen_to->local.slots,*fslots = gen_from->local.slots;
1675:     int n       = gen_to->local.n,il,ir;
1676:     if (addv == INSERT_VALUES) {
1677:       if (gen_to->local.is_copy) {
1678:         PetscMemcpy(yv + gen_from->local.copy_start,xv + gen_to->local.copy_start,gen_to->local.copy_length);
1679:       } else {
1680:         for (i=0; i<n; i++) {
1681:           il = fslots[i]; ir = tslots[i];
1682:           yv[il]   = xv[ir];
1683:           yv[il+1] = xv[ir+1];
1684:         }
1685:       }
1686:     }  else if (addv == ADD_VALUES) {
1687:       for (i=0; i<n; i++) {
1688:         il = fslots[i]; ir = tslots[i];
1689:         yv[il]   += xv[ir];
1690:         yv[il+1] += xv[ir+1];
1691:       }
1692: #if !defined(PETSC_USE_COMPLEX)
1693:     }  else if (addv == MAX_VALUES) {
1694:       for (i=0; i<n; i++) {
1695:         il = fslots[i]; ir = tslots[i];
1696:         yv[il]   = PetscMax(yv[il],xv[ir]);
1697:         yv[il+1] = PetscMax(yv[il+1],xv[ir+1]);
1698:       }
1699: #endif
1700:     }  else {SETERRQ(1,"Wrong insert option");}
1701:   }
1702:   VecRestoreArray(xin,&xv);
1703:   if (xin != yin) {VecRestoreArray(yin,&yv);}
1704:   return(0);
1705: }

1707: /* --------------------------------------------------------------------------------------*/

1709: int VecScatterEnd_PtoP_2(Vec xin,Vec yin,InsertMode addv,ScatterMode mode,VecScatter ctx)
1710: {
1711:   VecScatter_MPI_General *gen_to,*gen_from;
1712:   Scalar                 *rvalues,*yv,*val;
1713:   int                    ierr,nrecvs,nsends,i,*indices,count,imdex,n,*rstarts,*lindices,idx;
1714:   MPI_Request            *rwaits,*swaits;
1715:   MPI_Status             rstatus,*sstatus;

1718:   if (mode & SCATTER_LOCAL) return(0);
1719:   VecGetArray(yin,&yv);

1721:   if (mode & SCATTER_REVERSE) {
1722:     gen_to   = (VecScatter_MPI_General*)ctx->fromdata;
1723:     gen_from = (VecScatter_MPI_General*)ctx->todata;
1724:     rwaits   = gen_from->rev_requests;
1725:     swaits   = gen_to->rev_requests;
1726:     sstatus  = gen_from->sstatus;
1727:   } else {
1728:     gen_to   = (VecScatter_MPI_General*)ctx->todata;
1729:     gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1730:     rwaits   = gen_from->requests;
1731:     swaits   = gen_to->requests;
1732:     sstatus  = gen_to->sstatus;
1733:   }
1734:   rvalues  = gen_from->values;
1735:   nrecvs   = gen_from->n;
1736:   nsends   = gen_to->n;
1737:   indices  = gen_from->indices;
1738:   rstarts  = gen_from->starts;

1740:   /*  wait on receives */
1741:   count = nrecvs;
1742:   while (count) {
1743:     MPI_Waitany(nrecvs,rwaits,&imdex,&rstatus);
1744:     /* unpack receives into our local space */
1745:     val      = rvalues + 2*rstarts[imdex];
1746:     lindices = indices + rstarts[imdex];
1747:     n        = rstarts[imdex+1] - rstarts[imdex];
1748:     if (addv == INSERT_VALUES) {
1749:       for (i=0; i<n; i++) {
1750:         idx       = lindices[i];
1751:         yv[idx]   = val[0];
1752:         yv[idx+1] = val[1];
1753:         val      += 2;
1754:       }
1755:     } else if (addv == ADD_VALUES) {
1756:       for (i=0; i<n; i++) {
1757:         idx       = lindices[i];
1758:         yv[idx]   += val[0];
1759:         yv[idx+1] += val[1];
1760:         val       += 2;
1761:       }
1762: #if !defined(PETSC_USE_COMPLEX)
1763:     } else if (addv == MAX_VALUES) {
1764:       for (i=0; i<n; i++) {
1765:         idx       = lindices[i];
1766:         yv[idx]   = PetscMax(yv[idx],val[0]);
1767:         yv[idx+1] = PetscMax(yv[idx+1],val[1]);
1768:         val       += 2;
1769:       }
1770: #endif
1771:     }  else {SETERRQ(1,"Wrong insert option");}
1772:     count--;
1773:   }
1774:   /* wait on sends */
1775:   if (nsends) {
1776:     MPI_Waitall(nsends,swaits,sstatus);
1777:   }
1778:   VecRestoreArray(yin,&yv);
1779:   return(0);
1780: }

1782: /* ---------------------------------------------------------------------------------*/

1784: int VecScatterDestroy_PtoP_X(VecScatter ctx)
1785: {
1786:   VecScatter_MPI_General *gen_to   = (VecScatter_MPI_General*)ctx->todata;
1787:   VecScatter_MPI_General *gen_from = (VecScatter_MPI_General*)ctx->fromdata;
1788:   int                    i,ierr;

1791:   if (gen_to->use_readyreceiver) {
1792:     /*
1793:        Since we have already posted sends we must cancel them before freeing 
1794:        the requests
1795:     */
1796:     for (i=0; i<gen_from->n; i++) {
1797:       MPI_Cancel(gen_from->requests+i);
1798:     }
1799:   }

1801:   if (gen_to->local.slots) {PetscFree(gen_to->local.slots);}
1802:   if (gen_from->local.slots) {PetscFree(gen_from->local.slots);}
1803:   if (gen_to->local.slots_nonmatching) {PetscFree(gen_to->local.slots_nonmatching);}
1804:   if (gen_from->local.slots_nonmatching) {PetscFree(gen_from->local.slots_nonmatching);}

1806:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
1807:   /* 
 1808:      IBM's PE version of MPI has a bug where freeing these requests corrupts later
 1809:      message passing, so they are not freed in that case.
1810:   */
1811: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
1812:   for (i=0; i<gen_to->n; i++) {
1813:     MPI_Request_free(gen_to->requests + i);
1814:     MPI_Request_free(gen_to->rev_requests + i);
1815:   }

1817:   /*
 1818:       MPICH could not properly cancel requests, so with ready-receiver mode we
 1819:     cannot free the requests. If that has since been fixed, the loop below can be
 1820:     placed inside an if (!gen_to->use_readyreceiver) { } block.
1821:   */
1822:   for (i=0; i<gen_from->n; i++) {
1823:     MPI_Request_free(gen_from->requests + i);
1824:     MPI_Request_free(gen_from->rev_requests + i);
1825:   }
1826: #endif
1827: 
1828:   PetscFree(gen_to->sstatus);
1829:   PetscFree(gen_to->values);
1830:   PetscFree(gen_to->rev_requests);
1831:   PetscFree(gen_to);
1832:   PetscFree(gen_from->values);
1833:   PetscFree(gen_from->rev_requests);
1834:   PetscFree(gen_from);
1835:   PetscLogObjectDestroy(ctx);
1836:   PetscHeaderDestroy(ctx);
1837:   return(0);
1838: }

1840: /* ==========================================================================================*/

1842: /*              create parallel to sequential scatter context                           */
1843: /*
1844:    bs indicates how many elements there are in each block. Normally
1845:    this would be 1.
1846: */
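      /*
         For bs > 1 each index refers to the first entry of a block; for example, with
         bs = 3 an index of 12 moves the three entries x[12], x[13], and x[14] together.
      */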
1847: int VecScatterCreate_PtoS(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,int bs,VecScatter ctx)
1848: {
1849:   VecScatter_MPI_General *from,*to;
1850:   int                    *source,*lens,rank,*owners;
1851:   int                    size,*lowner,*start,lengthy;
1852:   int                    *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work;
1853:   int                    *owner,*starts,count,tag,slen,ierr;
1854:   int                    *rvalues,*svalues,base,imdex,nmax,*values,len,*indx,nprocslocal;
1855:   MPI_Comm               comm;
1856:   MPI_Request            *send_waits,*recv_waits;
1857:   MPI_Status             recv_status,*send_status;
1858:   Map                    map;
1859:   PetscTruth             found;
1860: 
1862:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1863:   PetscObjectGetComm((PetscObject)xin,&comm);
1864:   MPI_Comm_rank(comm,&rank);
1865:   MPI_Comm_size(comm,&size);
1866:   VecGetMap(xin,&map);
1867:   MapGetGlobalRange(map,&owners);

1869:   VecGetSize(yin,&lengthy);
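        /*
           Overview: each process determines from the ownership ranges of xin which rank
           owns every requested index in inidx, tells those owners how many indices it
           wants, and then ships the index lists.  The lists received here become the "to"
           context (what this process packs from xin and sends in a forward scatter); the
           corresponding inidy entries become the "from" context (where the received
           values are placed in yin).
        */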

1871:   /*  first count number of contributors to each processor */
1872:   PetscMalloc(2*size*sizeof(int),&nprocs);
1873:   ierr   = PetscMemzero(nprocs,2*size*sizeof(int));
1874:   procs  = nprocs + size;
1875:   PetscMalloc((nx+1)*sizeof(int),&owner);
1876:   for (i=0; i<nx; i++) {
1877:     idx = inidx[i];
1878:     found = PETSC_FALSE;
1879:     for (j=0; j<size; j++) {
1880:       if (idx >= owners[j] && idx < owners[j+1]) {
1881:         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
1882:       }
1883:     }
1884:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
1885:   }
1886:   nprocslocal  = nprocs[rank];
1887:   nprocs[rank] = procs[rank] = 0;
1888:   nsends       = 0;  for (i=0; i<size; i++) { nsends += procs[i];}

1890:   /* inform other processors of number of messages and max length*/
1891:   PetscMalloc(2*size*sizeof(int),&work);
1892:   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
1893:   nmax   = work[rank];
1894:   nrecvs = work[size+rank];
1895:   ierr   = PetscFree(work);
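        /*
           After the PetscMaxSum_Op reduction, nmax bounds the length of any single message
           this process will receive and nrecvs is the number of processes that will send
           to it, so each receive below can be posted with nmax integers from MPI_ANY_SOURCE.
        */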

1897:   /* post receives:   */
1898:   PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
1899:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
1900:   for (i=0; i<nrecvs; i++) {
1901:     MPI_Irecv((rvalues+nmax*i),nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1902:   }

1904:   /* do sends:
1905:       1) starts[i] gives the starting index in svalues for stuff going to 
1906:          the ith processor
1907:   */
1908:   PetscMalloc((nx+1)*sizeof(int),&svalues);
1909:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
1910:   PetscMalloc((size+1)*sizeof(int),&starts);
1911:   starts[0]  = 0;
1912:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1913:   for (i=0; i<nx; i++) {
1914:     if (owner[i] != rank) {
1915:       svalues[starts[owner[i]]++] = inidx[i];
1916:     }
1917:   }

1919:   starts[0] = 0;
1920:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
1921:   count = 0;
1922:   for (i=0; i<size; i++) {
1923:     if (procs[i]) {
1924:       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count++);
1925:     }
1926:   }
1927:   PetscFree(starts);

1929:   base = owners[rank];

1931:   /*  wait on receives */
1932:   ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
1933:   source = lens + nrecvs;
1934:   count  = nrecvs;
1935:   slen   = 0;
1936:   while (count) {
1937:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1938:     /* unpack receives into our local space */
1939:     MPI_Get_count(&recv_status,MPI_INT,&n);
1940:     source[imdex]  = recv_status.MPI_SOURCE;
1941:     lens[imdex]    = n;
1942:     slen          += n;
1943:     count--;
1944:   }
1945:   PetscFree(recv_waits);
1946: 
1947:   /* allocate entire send scatter context */
1948:   PetscNew(VecScatter_MPI_General,&to);
 1949:   PetscMemzero(to,sizeof(VecScatter_MPI_General));
 1950:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1951:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
1952:   len = slen*sizeof(int) + bs*slen*sizeof(Scalar) + (nrecvs+1)*sizeof(int) +
1953:         nrecvs*(sizeof(int) + sizeof(MPI_Request));
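        /*
           A single allocation of len bytes backs the whole "to" context: bs*slen Scalars
           of message space, then nrecvs MPI_Requests, slen indices, nrecvs+1 starts, and
           nrecvs process ids.
        */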
1954:   to->n         = nrecvs;
1955:   PetscMalloc(len,&to->values);
1956:   PetscLogObjectMemory(ctx,len);
1957:   to->requests  = (MPI_Request*)(to->values + bs*slen);
1958:   to->indices   = (int*)(to->requests + nrecvs);
1959:   to->starts    = (int*)(to->indices + slen);
1960:   to->procs     = (int*)(to->starts + nrecvs + 1);
1961:   ierr          = PetscMalloc(2*(PetscMax(nrecvs,nsends)+1)*sizeof(MPI_Status),&to->sstatus);
1962:   to->rstatus   = to->sstatus + PetscMax(nrecvs,nsends) + 1;
1963:   PetscLogObjectMemory(ctx,2*(PetscMax(nrecvs,nsends)+1)*sizeof(MPI_Status));
1964:   ctx->todata   = (void*)to;
1965:   to->starts[0] = 0;

1967:   if (nrecvs) {
1968:     PetscMalloc(nrecvs*sizeof(int),&indx);
1969:     for (i=0; i<nrecvs; i++) indx[i] = i;
1970:     PetscSortIntWithPermutation(nrecvs,source,indx);
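          /* sort the incoming index lists by source rank so the send scatter is laid out
             in increasing rank order */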

1972:     /* move the data into the send scatter */
1973:     for (i=0; i<nrecvs; i++) {
1974:       to->starts[i+1] = to->starts[i] + lens[indx[i]];
1975:       to->procs[i]    = source[indx[i]];
1976:       values = rvalues + indx[i]*nmax;
1977:       for (j=0; j<lens[indx[i]]; j++) {
1978:         to->indices[to->starts[i] + j] = values[j] - base;
1979:       }
1980:     }
1981:     PetscFree(indx);
1982:   }
1983:   PetscFree(rvalues);
1984:   PetscFree(lens);
1985: 
1986:   /* allocate entire receive scatter context */
1987:   PetscNew(VecScatter_MPI_General,&from);
 1988:   PetscMemzero(from,sizeof(VecScatter_MPI_General));
 1989:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
1990:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
1991:   len = ny*sizeof(int) + ny*bs*sizeof(Scalar) + (nsends+1)*sizeof(int) +
1992:         nsends*(sizeof(int) + sizeof(MPI_Request));
1993:   from->n        = nsends;
1994:   PetscMalloc(len,&from->values);
1995:   PetscLogObjectMemory(ctx,len);
1996:   from->requests = (MPI_Request*)(from->values + bs*ny);
1997:   from->indices  = (int*)(from->requests + nsends);
1998:   from->starts   = (int*)(from->indices + ny);
1999:   from->procs    = (int*)(from->starts + nsends + 1);
2000:   ctx->fromdata  = (void*)from;

2002:   /* move data into receive scatter */
2003:   PetscMalloc((size+nsends+1)*sizeof(int),&lowner);
2004:   start = lowner + size;
2005:   count = 0; from->starts[0] = start[0] = 0;
2006:   for (i=0; i<size; i++) {
2007:     if (procs[i]) {
2008:       lowner[i]            = count;
2009:       from->procs[count++] = i;
2010:       from->starts[count]  = start[count] = start[count-1] + nprocs[i];
2011:     }
2012:   }
2013:   for (i=0; i<nx; i++) {
2014:     if (owner[i] != rank) {
2015:       from->indices[start[lowner[owner[i]]]++] = inidy[i];
2016:       if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2017:     }
2018:   }
2019:   PetscFree(lowner);
2020:   PetscFree(owner);
2021:   PetscFree(nprocs);
2022: 
2023:   /* wait on sends */
2024:   if (nsends) {
2025:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2026:     MPI_Waitall(nsends,send_waits,send_status);
2027:     PetscFree(send_status);
2028:   }
2029:   PetscFree(send_waits);
2030:   PetscFree(svalues);

2032:   if (nprocslocal) {
2033:     int nt;
2034:     /* we have a scatter to ourselves */
2035:     from->local.n     = to->local.n = nt = nprocslocal;
2036:     PetscMalloc(nt*sizeof(int),&from->local.slots);
2037:     PetscMalloc(nt*sizeof(int),&to->local.slots);
2038:     PetscLogObjectMemory(ctx,2*nt*sizeof(int));
2039:     nt = 0;
2040:     for (i=0; i<nx; i++) {
2041:       idx = inidx[i];
2042:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2043:         to->local.slots[nt]     = idx - owners[rank];
2044:         from->local.slots[nt++] = inidy[i];
2045:         if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2046:       }
2047:     }
2048:   } else {
2049:     from->local.n     = 0;
2050:     from->local.slots = 0;
2051:     to->local.n       = 0;
2052:     to->local.slots   = 0;
2053:   }
2054:   from->local.nonmatching_computed = PETSC_FALSE;
2055:   from->local.n_nonmatching        = 0;
2056:   from->local.slots_nonmatching    = 0;
2057:   to->local.nonmatching_computed   = PETSC_FALSE;
2058:   to->local.n_nonmatching          = 0;
2059:   to->local.slots_nonmatching      = 0;

2061:   to->type   = VEC_SCATTER_MPI_GENERAL;
2062:   from->type = VEC_SCATTER_MPI_GENERAL;

2064:   from->bs = bs;
2065:   to->bs   = bs;
2066:   if (bs > 1) {
2067:     PetscTruth  flg,flgs = PETSC_FALSE;
2068:     int         *sstarts = to->starts,  *rstarts = from->starts;
2069:     int         *sprocs  = to->procs,   *rprocs  = from->procs;
2070:     MPI_Request *swaits  = to->requests,*rwaits  = from->requests;
2071:     MPI_Request *rev_swaits,*rev_rwaits;
2072:     Scalar      *Ssvalues = to->values, *Srvalues = from->values;

2074:     ctx->destroy   = VecScatterDestroy_PtoP_X;
2075:     ctx->copy      = VecScatterCopy_PtoP_X;
2076:     ctx->view      = VecScatterView_MPI;
2077: 
2078:     tag      = ctx->tag;
2079:     comm     = ctx->comm;

2081:     /* allocate additional wait variables for the "reverse" scatter */
2082:     PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&rev_rwaits);
2083:     to->rev_requests   = rev_rwaits;
2084:     PetscMalloc((nsends+1)*sizeof(MPI_Request),&rev_swaits);
2085:     from->rev_requests = rev_swaits;
2086:     PetscLogObjectMemory(ctx,(nsends+nrecvs+2)*sizeof(MPI_Request));
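          /*
             The reverse scatter (SCATTER_REVERSE) reuses the same buffers with the roles
             of sender and receiver exchanged: rev_swaits send from the receive buffers
             and rev_rwaits receive into the send buffers.
          */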

2088:     /* Register the receives that you will use later (sends for scatter reverse) */
2089:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flgs);
2090:     if (flgs) {
 2091:       PetscLogInfo(0,"VecScatterCreate_PtoS:Using VecScatter Ssend mode\n");
2092:     }
2093:     for (i=0; i<from->n; i++) {
2094:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2095:                            comm,rwaits+i);
2096:       if (!flgs) {
2097:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2098:                              comm,rev_swaits+i);
2099:       } else {
2100:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,
2101:                               comm,rev_swaits+i);
2102:       }
2103:     }

2105:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rr",&flg);
2106:     if (flg) {
2107:       ctx->postrecvs           = VecScatterPostRecvs_PtoP_X;
2108:       to->use_readyreceiver    = PETSC_TRUE;
2109:       from->use_readyreceiver  = PETSC_TRUE;
2110:       for (i=0; i<to->n; i++) {
2111:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2112:                               comm,swaits+i);
2113:       }
 2114:       PetscLogInfo(0,"VecScatterCreate_PtoS:Using VecScatter ready receiver mode\n");
2115:     } else {
2116:       ctx->postrecvs           = 0;
2117:       to->use_readyreceiver    = PETSC_FALSE;
2118:       from->use_readyreceiver  = PETSC_FALSE;
2119:       for (i=0; i<to->n; i++) {
2120:         if (!flgs) {
2121:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2122:                                comm,swaits+i);
2123:         } else {
2124:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2125:                                comm,swaits+i);
2126:         }
2127:       }
2128:     }
2129:     /* Register receives for scatter reverse */
2130:     for (i=0; i<to->n; i++) {
2131:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,
2132:                            comm,rev_rwaits+i);
2133:     }

 2135:     PetscLogInfo(0,"VecScatterCreate_PtoS:Using blocksize %d scatter\n",bs);
2136:     switch (bs) {
2137:     case 12:
2138:       ctx->begin     = VecScatterBegin_PtoP_12;
2139:       ctx->end       = VecScatterEnd_PtoP_12;
2140:       break;
2141:     case 5:
2142:       ctx->begin     = VecScatterBegin_PtoP_5;
2143:       ctx->end       = VecScatterEnd_PtoP_5;
2144:       break;
2145:     case 4:
2146:       ctx->begin     = VecScatterBegin_PtoP_4;
2147:       ctx->end       = VecScatterEnd_PtoP_4;
2148:       break;
2149:     case 3:
2150:       ctx->begin     = VecScatterBegin_PtoP_3;
2151:       ctx->end       = VecScatterEnd_PtoP_3;
2152:       break;
2153:     case 2:
2154:       ctx->begin     = VecScatterBegin_PtoP_2;
2155:       ctx->end       = VecScatterEnd_PtoP_2;
2156:       break;
2157:     default:
2158:       SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
2159:     }
2160:   } else {
2161:     ctx->postrecvs = 0;
2162:     ctx->destroy   = VecScatterDestroy_PtoP;
2163:     ctx->begin     = VecScatterBegin_PtoP;
2164:     ctx->end       = VecScatterEnd_PtoP;
2165:     ctx->copy      = VecScatterCopy_PtoP;
2166:     ctx->view      = VecScatterView_MPI;
2167:   }

2169:   /* Check if the local scatter is actually a copy; important special case */
2170:   if (nprocslocal) {
2171:     VecScatterLocalOptimizeCopy_Private(&to->local,&from->local,bs);
2172:   }

2174:   return(0);
2175: }

2177: /* ------------------------------------------------------------------------------------*/
2178: /*
2179:          Scatter from local Seq vectors to a parallel vector. 
2180: */
2181: int VecScatterCreate_StoP(int nx,int *inidx,int ny,int *inidy,Vec yin,VecScatter ctx)
2182: {
2183:   Vec_MPI                *y = (Vec_MPI *)yin->data;
2184:   VecScatter_MPI_General *from,*to;
2185:   int                    *source,nprocslocal,*lens,rank = y->rank,*owners = yin->map->range;
2186:   int                    ierr,size = y->size,*lowner,*start;
2187:   int                    *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work;
2188:   int                    *owner,*starts,count,tag,slen;
2189:   int                    *rvalues,*svalues,base,imdex,nmax,*values,len;
2190:   PetscTruth             found;
2191:   MPI_Comm               comm = yin->comm;
2192:   MPI_Request            *send_waits,*recv_waits;
2193:   MPI_Status             recv_status,*send_status;

2196:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2197:   /*  first count number of contributors to each processor */
2198:   PetscMalloc(2*size*sizeof(int),&nprocs);
2199:   ierr   = PetscMemzero(nprocs,2*size*sizeof(int));
2200:   procs  = nprocs + size;
2201:   PetscMalloc((nx+1)*sizeof(int),&owner);
2202:   for (i=0; i<nx; i++) {
2203:     idx = inidy[i];
2204:     found = PETSC_FALSE;
2205:     for (j=0; j<size; j++) {
2206:       if (idx >= owners[j] && idx < owners[j+1]) {
2207:         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
2208:       }
2209:     }
2210:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
2211:   }
2212:   nprocslocal  = nprocs[rank];
2213:   nprocs[rank] = procs[rank] = 0;
2214:   nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}

2216:   /* inform other processors of number of messages and max length*/
2217:   ierr   = PetscMalloc(2*size*sizeof(int),&work);
2218:   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
2219:   nmax   = work[rank];
2220:   nrecvs = work[size+rank];
2221:   ierr   = PetscFree(work);
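        /* as in VecScatterCreate_PtoS(): nmax bounds the longest incoming message and
           nrecvs is the number of processes that will send to this one */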

2223:   /* post receives:   */
2224:   PetscMalloc((nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
2225:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
2226:   for (i=0; i<nrecvs; i++) {
2227:     MPI_Irecv(rvalues+nmax*i,nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2228:   }

2230:   /* do sends:
2231:       1) starts[i] gives the starting index in svalues for stuff going to 
2232:          the ith processor
2233:   */
2234:   PetscMalloc((nx+1)*sizeof(int),&svalues);
2235:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
2236:   PetscMalloc((size+1)*sizeof(int),&starts);
2237:   starts[0]  = 0;
2238:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2239:   for (i=0; i<nx; i++) {
2240:     if (owner[i] != rank) {
2241:       svalues[starts[owner[i]]++] = inidy[i];
2242:     }
2243:   }

2245:   starts[0] = 0;
2246:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2247:   count = 0;
2248:   for (i=0; i<size; i++) {
2249:     if (procs[i]) {
2250:       MPI_Isend(svalues+starts[i],nprocs[i],MPI_INT,i,tag,comm,send_waits+count);
2251:       count++;
2252:     }
2253:   }
2254:   PetscFree(starts);

2256:   /* allocate entire send scatter context */
2257:   PetscNew(VecScatter_MPI_General,&to);
2258:   PetscMemzero(to,sizeof(VecScatter_MPI_General));
2259:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2260:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
2261:   len  = ny*(sizeof(int) + sizeof(Scalar)) + (nsends+1)*sizeof(int) +
2262:         nsends*(sizeof(int) + sizeof(MPI_Request));
2263:   to->n        = nsends;
2264:   PetscMalloc(len,&to->values);
2265:   PetscLogObjectMemory(ctx,len);
2266:   to->requests = (MPI_Request*)(to->values + ny);
2267:   to->indices  = (int*)(to->requests + nsends);
2268:   to->starts   = (int*)(to->indices + ny);
2269:   to->procs    = (int*)(to->starts + nsends + 1);
 2270:   ierr         = PetscMalloc(2*(PetscMax(nsends,nrecvs) + 1)*sizeof(MPI_Status),&to->sstatus);
2271:   to->rstatus  = to->sstatus + PetscMax(nsends,nrecvs) + 1;
2272:   PetscLogObjectMemory(ctx,2*(PetscMax(nsends,nrecvs) + 1)*sizeof(MPI_Status));
2273:   ctx->todata  = (void*)to;

2275:   /* move data into send scatter context */
2276:   PetscMalloc((size+nsends+1)*sizeof(int),&lowner);
2277:   start         = lowner + size;
2278:   count         = 0;
2279:   to->starts[0] = start[0] = 0;
2280:   for (i=0; i<size; i++) {
2281:     if (procs[i]) {
2282:       lowner[i]          = count;
2283:       to->procs[count++] = i;
2284:       to->starts[count]  = start[count] = start[count-1] + nprocs[i];
2285:     }
2286:   }
2287:   for (i=0; i<nx; i++) {
2288:     if (owner[i] != rank) {
2289:       to->indices[start[lowner[owner[i]]]++] = inidx[i];
2290:     }
2291:   }
2292:   PetscFree(lowner);
2293:   PetscFree(owner);
2294:   PetscFree(nprocs);

2296:   base = owners[rank];

2298:   /*  wait on receives */
2299:   ierr   = PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
2300:   source = lens + nrecvs;
2301:   count  = nrecvs; slen = 0;
2302:   while (count) {
2303:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2304:     /* unpack receives into our local space */
2305:     MPI_Get_count(&recv_status,MPI_INT,&n);
2306:     source[imdex]  = recv_status.MPI_SOURCE;
2307:     lens[imdex]    = n;
2308:     slen          += n;
2309:     count--;
2310:   }
2311:   PetscFree(recv_waits);
2312: 
2313:   /* allocate entire receive scatter context */
2314:   PetscNew(VecScatter_MPI_General,&from);
2315:   PetscMemzero(from,sizeof(VecScatter_MPI_General));
2316:   PetscLogObjectMemory(ctx,sizeof(VecScatter_MPI_General));
2317:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&from->sendfirst);
2318:   len  = slen*(sizeof(int) + sizeof(Scalar)) + (nrecvs+1)*sizeof(int) +
2319:           nrecvs*(sizeof(int) + sizeof(MPI_Request));
2320:   from->n        = nrecvs;
2321:   PetscMalloc(len,&from->values);
2322:   PetscLogObjectMemory(ctx,len);
2323:   from->requests = (MPI_Request*)(from->values + slen);
2324:   from->indices  = (int*)(from->requests + nrecvs);
2325:   from->starts   = (int*)(from->indices + slen);
2326:   from->procs    = (int*)(from->starts + nrecvs + 1);
2327:   ctx->fromdata  = (void*)from;

2329:   /* move the data into the receive scatter context*/
2330:   from->starts[0] = 0;
2331:   for (i=0; i<nrecvs; i++) {
2332:     from->starts[i+1] = from->starts[i] + lens[i];
2333:     from->procs[i]    = source[i];
2334:     values            = rvalues + i*nmax;
2335:     for (j=0; j<lens[i]; j++) {
2336:       from->indices[from->starts[i] + j] = values[j] - base;
2337:     }
2338:   }
2339:   PetscFree(rvalues);
2340:   PetscFree(lens);
2341: 
2342:   /* wait on sends */
2343:   if (nsends) {
2344:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2345:     MPI_Waitall(nsends,send_waits,send_status);
2346:     PetscFree(send_status);
2347:   }
2348:   PetscFree(send_waits);
2349:   PetscFree(svalues);

2351:   if (nprocslocal) {
2352:     int nt;
2353:     /* we have a scatter to ourselves */
2354:     from->local.n     = to->local.n = nt = nprocslocal;
2355:     PetscMalloc(nt*sizeof(int),&from->local.slots);
2356:     PetscMalloc(nt*sizeof(int),&to->local.slots);
2357:     PetscLogObjectMemory(ctx,2*nt*sizeof(int));
2358:     nt = 0;
2359:     for (i=0; i<ny; i++) {
2360:       idx = inidy[i];
2361:       if (idx >= owners[rank] && idx < owners[rank+1]) {
2362:         from->local.slots[nt] = idx - owners[rank];
2363:         to->local.slots[nt++] = inidx[i];
2364:       }
2365:     }
2366:   } else {
2367:     from->local.n     = 0;
2368:     from->local.slots = 0;
2369:     to->local.n       = 0;
2370:     to->local.slots   = 0;

2372:   }
2373:   from->local.nonmatching_computed = PETSC_FALSE;
2374:   from->local.n_nonmatching        = 0;
2375:   from->local.slots_nonmatching    = 0;
2376:   to->local.nonmatching_computed   = PETSC_FALSE;
2377:   to->local.n_nonmatching          = 0;
2378:   to->local.slots_nonmatching      = 0;

2380:   to->type   = VEC_SCATTER_MPI_GENERAL;
2381:   from->type = VEC_SCATTER_MPI_GENERAL;

2383:   ctx->destroy    = VecScatterDestroy_PtoP;
2384:   ctx->postrecvs  = 0;
2385:   ctx->begin      = VecScatterBegin_PtoP;
2386:   ctx->end        = VecScatterEnd_PtoP;
2387:   ctx->copy       = 0;
2388:   ctx->view       = VecScatterView_MPI;

2390:   to->bs   = 1;
2391:   from->bs = 1;
2392:   return(0);
2393: }

2395: /* ---------------------------------------------------------------------------------*/
2396: int VecScatterCreate_PtoP(int nx,int *inidx,int ny,int *inidy,Vec xin,Vec yin,VecScatter ctx)
2397: {
2398:   int         *lens,rank,*owners = xin->map->range,size;
2399:   int         *nprocs,i,j,n,idx,*procs,nsends,nrecvs,*work,*local_inidx,*local_inidy;
2400:   int         *owner,*starts,count,tag,slen,ierr;
2401:   int         *rvalues,*svalues,base,imdex,nmax,*values;
2402:   MPI_Comm    comm;
2403:   MPI_Request *send_waits,*recv_waits;
2404:   MPI_Status  recv_status;
2405:   PetscTruth  duplicate = PETSC_FALSE,found;

2408:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2409:   PetscObjectGetComm((PetscObject)xin,&comm);
2410:   MPI_Comm_size(comm,&size);
2411:   MPI_Comm_rank(comm,&rank);
2412:   if (size == 1) {
2413:     VecScatterCreate_StoP(nx,inidx,ny,inidy,yin,ctx);
2414:     return(0);
2415:   }

2417:   /*
 2418:      Each processor ships its (inidx[j],inidy[j]) pairs off to the processor that owns inidx[j];
 2419:      each then calls VecScatterCreate_StoP() with the pairs it received.
2420:   */
2421:   /*  first count number of contributors to each processor */
2422:   ierr  = PetscMalloc(2*size*sizeof(int),&nprocs);
2423:   procs = nprocs + size;
2424:   ierr  = PetscMemzero(nprocs,2*size*sizeof(int));
2425:   ierr  = PetscMalloc((nx+1)*sizeof(int),&owner);
2426:   for (i=0; i<nx; i++) {
2427:     idx   = inidx[i];
2428:     found = PETSC_FALSE;
2429:     for (j=0; j<size; j++) {
2430:       if (idx >= owners[j] && idx < owners[j+1]) {
2431:         nprocs[j]++; procs[j] = 1; owner[i] = j; found = PETSC_TRUE; break;
2432:       }
2433:     }
2434:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %d out of range",idx);
2435:   }
2436:   nsends = 0;  for (i=0; i<size; i++) { nsends += procs[i];}

2438:   /* inform other processors of number of messages and max length*/
2439:   PetscMalloc(2*size*sizeof(int),&work);
2440:   ierr   = MPI_Allreduce(nprocs,work,2*size,MPI_INT,PetscMaxSum_Op,comm);
2441:   nmax   = work[rank];
2442:   nrecvs = work[size+rank];
2443:   ierr   = PetscFree(work);

2445:   /* post receives:   */
2446:   PetscMalloc(2*(nrecvs+1)*(nmax+1)*sizeof(int),&rvalues);
2447:   PetscMalloc((nrecvs+1)*sizeof(MPI_Request),&recv_waits);
2448:   for (i=0; i<nrecvs; i++) {
2449:     MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPI_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2450:   }

2452:   /* do sends:
2453:       1) starts[i] gives the starting index in svalues for stuff going to 
2454:          the ith processor
2455:   */
2456:   PetscMalloc(2*(nx+1)*sizeof(int),&svalues);
2457:   PetscMalloc((nsends+1)*sizeof(MPI_Request),&send_waits);
2458:   PetscMalloc((size+1)*sizeof(int),&starts);
2459:   starts[0]  = 0;
2460:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2461:   for (i=0; i<nx; i++) {
2462:     svalues[2*starts[owner[i]]]       = inidx[i];
2463:     svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2464:   }
2465:   PetscFree(owner);

2467:   starts[0] = 0;
2468:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[i-1];}
2469:   count = 0;
2470:   for (i=0; i<size; i++) {
2471:     if (procs[i]) {
2472:       MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPI_INT,i,tag,comm,send_waits+count);
2473:       count++;
2474:     }
2475:   }
2476:   PetscFree(starts);
2477:   PetscFree(nprocs);

2479:   base = owners[rank];

2481:   /*  wait on receives */
2482:   PetscMalloc(2*(nrecvs+1)*sizeof(int),&lens);
2483:   count = nrecvs;
2484:   slen  = 0;
2485:   while (count) {
2486:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2487:     /* unpack receives into our local space */
2488:     MPI_Get_count(&recv_status,MPI_INT,&n);
2489:     lens[imdex]  =  n/2;
2490:     slen         += n/2;
2491:     count--;
2492:   }
2493:   PetscFree(recv_waits);
2494: 
2495:   PetscMalloc(2*(slen+1)*sizeof(int),&local_inidx);
2496:   local_inidy = local_inidx + slen;
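        /* local_inidy shares the allocation with local_inidx, which is why only
           local_inidx is freed at the end of this routine */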

2498:   count = 0;
2499:   for (i=0; i<nrecvs; i++) {
2500:     values = rvalues + 2*i*nmax;
2501:     for (j=0; j<lens[i]; j++) {
2502:       local_inidx[count]   = values[2*j] - base;
2503:       local_inidy[count++] = values[2*j+1];
2504:     }
2505:   }
2506:   PetscFree(rvalues);
2507:   PetscFree(lens);

2509:   /* wait on sends */
2510:   if (nsends) {
2511:     MPI_Status *send_status;
2512:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
2513:     MPI_Waitall(nsends,send_waits,send_status);
2514:     PetscFree(send_status);
2515:   }
2516:   PetscFree(send_waits);
2517:   PetscFree(svalues);

2519:   /*
2520:      should sort and remove duplicates from local_inidx,local_inidy 
2521:   */

2523: #if defined(do_it_slow)
2524:   /* sort on the from index */
2525:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2526:   start = 0;
2527:   while (start < slen) {
2528:     count = start+1;
2529:     last  = local_inidx[start];
2530:     while (count < slen && last == local_inidx[count]) count++;
2531:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2532:       /* sort on to index */
2533:       PetscSortInt(count-start,local_inidy+start);
2534:     }
2535:     /* remove duplicates; not most efficient way, but probably good enough */
2536:     i = start;
2537:     while (i < count-1) {
2538:       if (local_inidy[i] != local_inidy[i+1]) {
2539:         i++;
2540:       } else { /* found a duplicate */
2541:         duplicate = PETSC_TRUE;
2542:         for (j=i; j<slen-1; j++) {
2543:           local_inidx[j] = local_inidx[j+1];
2544:           local_inidy[j] = local_inidy[j+1];
2545:         }
2546:         slen--;
2547:         count--;
 2548:         /* printf("found dup %d %d\n",local_inidx[i],local_inidy[i]);*/
2549:       }
2550:     }
2551:     start = count;
2552:   }
2553: #endif
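        /*
           The duplicate-removal pass above is compiled only when do_it_slow is defined, so
           by default duplicate stays PETSC_FALSE and the pairs go to VecScatterCreate_StoP()
           unchanged; note that start and last are not declared in this routine, so the
           block would need those declarations if it were enabled.
        */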
2554:   if (duplicate) {
 2555:     PetscLogInfo(0,"VecScatterCreate_PtoP:Duplicate from/to index pairs passed to VecScatterCreate(); they are ignored\n");
2556:   }
2557:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,yin,ctx);
2558:   PetscFree(local_inidx);
2559:   return(0);
2560: }