Actual source code: vpscat.c

  1: #define PETSCVEC_DLL

  3: /*
  4:     Defines parallel vector scatters.
  5: */

 7:  #include "include/private/isimpl.h"
 8:  #include "include/private/vecimpl.h"
 9:  #include "src/vec/vec/impls/dvecimpl.h"
 10:  #include "src/vec/vec/impls/mpi/pvecimpl.h"
 11:  #include "petscsys.h"

 15: PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
 16: {
 17:   VecScatter_MPI_General *to=(VecScatter_MPI_General*)ctx->todata;
 18:   VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
 19:   PetscErrorCode         ierr;
 20:   PetscInt               i;
 21:   PetscMPIInt            rank;
 22:   PetscViewerFormat      format;
 23:   PetscTruth             iascii;

 26:   PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);
 27:   if (iascii) {
 28:     MPI_Comm_rank(((PetscObject)ctx)->comm,&rank);
 29:     PetscViewerGetFormat(viewer,&format);
 30:     if (format ==  PETSC_VIEWER_ASCII_INFO) {
 31:       PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;

 33:       MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 34:       MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 35:       itmp = to->starts[to->n];
 36:       MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 37:       itmp = from->starts[from->n];
 38:       MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,((PetscObject)ctx)->comm);
 39:       MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,((PetscObject)ctx)->comm);

 41:       PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");
 42:       PetscViewerASCIIPrintf(viewer,"  Maximum number sends %D\n",nsend_max);
 43:       PetscViewerASCIIPrintf(viewer,"  Maximum number receives %D\n",nrecv_max);
 44:       PetscViewerASCIIPrintf(viewer,"  Maximum data sent %D\n",(PetscInt)(lensend_max*to->bs*sizeof(PetscScalar)));
 45:       PetscViewerASCIIPrintf(viewer,"  Maximum data received %D\n",(PetscInt)(lenrecv_max*to->bs*sizeof(PetscScalar)));
 46:       PetscViewerASCIIPrintf(viewer,"  Total data sent %D\n",(PetscInt)(alldata*to->bs*sizeof(PetscScalar)));

 48:     } else {
 49:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);
 50:       if (to->n) {
 51:         for (i=0; i<to->n; i++){
 52:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]   %D length = %D to whom %D\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);
 53:         }
 54:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote sends (in order by process sent to)\n");
 55:         for (i=0; i<to->starts[to->n]; i++){
 56:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,to->indices[i]);
 57:         }
 58:       }

 60:       PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);
 61:       if (from->n) {
 62:         for (i=0; i<from->n; i++){
 63:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %D\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);
 64:         }

 66:         PetscViewerASCIISynchronizedPrintf(viewer,"Now the indices for all remote receives (in order by process received from)\n");
 67:         for (i=0; i<from->starts[from->n]; i++){
 68:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]%D \n",rank,from->indices[i]);
 69:         }
 70:       }
 71:       if (to->local.n) {
 72:         PetscViewerASCIISynchronizedPrintf(viewer,"[%d]Indices for local part of scatter\n",rank);
 73:         for (i=0; i<to->local.n; i++){
 74:           PetscViewerASCIISynchronizedPrintf(viewer,"[%d]From %D to %D \n",rank,from->local.vslots[i],to->local.vslots[i]);
 75:         }
 76:       }

 78:       PetscViewerFlush(viewer);
 79:     }
 80:   } else {
 81:     SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for this scatter",((PetscObject)viewer)->type_name);
 82:   }
 83:   return(0);
 84: }

 86: /* -----------------------------------------------------------------------------------*/
 87: /*
 88:       The next routine determines which part of the local portion of the scatter is an
 89:   exact copy of values into their current locations. We check this here and
 90:   then know that we need not perform that portion of the scatter when the vector is
 91:   scattering to itself with INSERT_VALUES.

 93:      This is currently not used, but it would speed up, for example, DALocalToLocalBegin/End().

 95: */
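/*
   Editorial illustration (not part of the original source): with
       to->vslots   = {0, 1, 5, 6}
       from->vslots = {0, 1, 4, 6}
   only position 2 differs, so n_nonmatching = 1 and the nonmatching slot lists
   become to = {5}, from = {4}; the matching entries 0, 1 and 6 need not be
   moved when the vector scatters onto itself with INSERT_VALUES.
*/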
 98: PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
 99: {
100:   PetscInt       n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
102:   PetscInt       *nto_slots,*nfrom_slots,j = 0;
103: 
105:   for (i=0; i<n; i++) {
106:     if (to_slots[i] != from_slots[i]) n_nonmatching++;
107:   }

109:   if (!n_nonmatching) {
110:     to->nonmatching_computed = PETSC_TRUE;
111:     to->n_nonmatching        = from->n_nonmatching = 0;
112:     PetscInfo1(scatter,"Reduced %D to 0\n", n);
113:   } else if (n_nonmatching == n) {
114:     to->nonmatching_computed = PETSC_FALSE;
115:     PetscInfo(scatter,"All values non-matching\n");
116:   } else {
117:     to->nonmatching_computed= PETSC_TRUE;
118:     to->n_nonmatching       = from->n_nonmatching = n_nonmatching;
119:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nto_slots);
120:     PetscMalloc(n_nonmatching*sizeof(PetscInt),&nfrom_slots);
121:     to->slots_nonmatching   = nto_slots;
122:     from->slots_nonmatching = nfrom_slots;
123:     for (i=0; i<n; i++) {
124:       if (to_slots[i] != from_slots[i]) {
125:         nto_slots[j]   = to_slots[i];
126:         nfrom_slots[j] = from_slots[i];
127:         j++;
128:       }
129:     }
130:     PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);
131:   }
132:   return(0);
133: }

135: /* --------------------------------------------------------------------------------------*/

137: /* -------------------------------------------------------------------------------------*/
140: PetscErrorCode VecScatterDestroy_PtoP(VecScatter ctx)
141: {
142:   VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
143:   VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
144:   PetscErrorCode         ierr;
145:   PetscInt               i;

148:   if (to->use_readyreceiver) {
 149:     /*
 150:        Since we have already posted the receives we must cancel them before freeing
 151:        the requests
 152:     */
153:     for (i=0; i<from->n; i++) {
154:       MPI_Cancel(from->requests+i);
155:     }
156:     for (i=0; i<to->n; i++) {
157:       MPI_Cancel(to->rev_requests+i);
158:     }
159:   }

161: #if defined(PETSC_HAVE_MPI_ALLTOALLW)
162:   if (to->use_alltoallw) {
163:     PetscFree3(to->wcounts,to->wdispls,to->types);
164:     PetscFree3(from->wcounts,from->wdispls,from->types);
165:   }
166: #endif

168: #if defined(PETSC_HAVE_MPI_WINDOW)
169:   if (to->use_window) {
170:     MPI_Win_free(&from->window);
171:     MPI_Win_free(&to->window);
172:   }
173: #endif

175:   if (to->use_alltoallv) {
176:     PetscFree2(to->counts,to->displs);
177:     PetscFree2(from->counts,from->displs);
178:   }

180:   /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
 181:   /* 
 182:      IBM's PE version of MPI has a bug where freeing these requests corrupts later
 183:      message passing.
 184:   */
185: #if !defined(PETSC_HAVE_BROKEN_REQUEST_FREE)
186:   if (!to->use_alltoallv) {   /* currently the to->requests etc are ALWAYS allocated even if not used */
187:     if (to->requests) {
188:       for (i=0; i<to->n; i++) {
189:         MPI_Request_free(to->requests + i);
190:       }
191:     }
192:     if (to->rev_requests) {
193:       for (i=0; i<to->n; i++) {
194:         MPI_Request_free(to->rev_requests + i);
195:       }
196:     }
197:   }
 198:   /*
 199:       MPICH could not properly cancel requests, thus with ready-receiver mode we
 200:     cannot free the requests. This may be fixed now; if not, put the following
 201:     code inside an if (!to->use_readyreceiver) { } block.
 202:   */
203:   if (!to->use_alltoallv) {    /* currently the from->requests etc are ALWAYS allocated even if not used */
204:     if (from->requests) {
205:       for (i=0; i<from->n; i++) {
206:         MPI_Request_free(from->requests + i);
207:       }
208:     }

210:     if (from->rev_requests) {
211:       for (i=0; i<from->n; i++) {
212:         MPI_Request_free(from->rev_requests + i);
213:       }
214:     }
215:   }
216: #endif

218:   PetscFree(to->local.vslots);
219:   PetscFree(from->local.vslots);
220:   PetscFree2(to->counts,to->displs);
221:   PetscFree2(from->counts,from->displs);
222:   PetscFree(to->local.slots_nonmatching);
223:   PetscFree(from->local.slots_nonmatching);
224:   PetscFree(to->rev_requests);
225:   PetscFree(from->rev_requests);
226:   PetscFree(to->requests);
227:   PetscFree(from->requests);
228:   PetscFree4(to->values,to->indices,to->starts,to->procs);
229:   PetscFree2(to->sstatus,to->rstatus);
230:   PetscFree4(from->values,from->indices,from->starts,from->procs);
231:   PetscFree(from);
232:   PetscFree(to);
233:   PetscHeaderDestroy(ctx);
234:   return(0);
235: }



239: /* --------------------------------------------------------------------------------------*/
 240: /*
 241:     Special optimization to detect whether the local part of the scatter is actually
 242:     a contiguous copy; if so the scatter routines call PetscMemcpy() instead of
 243:     indexing entry by entry.
 244: */
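/*
   Editorial illustration (not part of the original source): with bs = 2 and
       to->vslots   = {4, 6, 8}
       from->vslots = {10, 12, 14}
   both slot lists advance by exactly bs at each step, so the local scatter is a
   single copy of 3*bs scalars starting at offset 10 of the source and offset 4
   of the destination, which is what is_copy/copy_start/copy_length record.
*/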
247: PetscErrorCode VecScatterLocalOptimizeCopy_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from,PetscInt bs)
248: {
249:   PetscInt       n = to->n,i,*to_slots = to->vslots,*from_slots = from->vslots;
250:   PetscInt       to_start,from_start;

254:   to_start   = to_slots[0];
255:   from_start = from_slots[0];

257:   for (i=1; i<n; i++) {
258:     to_start   += bs;
259:     from_start += bs;
260:     if (to_slots[i]   != to_start)   return(0);
261:     if (from_slots[i] != from_start) return(0);
262:   }
263:   to->is_copy       = PETSC_TRUE;
264:   to->copy_start    = to_slots[0];
265:   to->copy_length   = bs*sizeof(PetscScalar)*n;
266:   from->is_copy     = PETSC_TRUE;
267:   from->copy_start  = from_slots[0];
268:   from->copy_length = bs*sizeof(PetscScalar)*n;
269:   PetscInfo(scatter,"Local scatter is a copy, optimizing for it\n");
270:   return(0);
271: }

273: /* --------------------------------------------------------------------------------------*/

277: PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
278: {
279:   VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
280:   VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
281:   PetscErrorCode         ierr;
282:   PetscInt               ny,bs = in_from->bs;

285:   out->begin     = in->begin;
286:   out->end       = in->end;
287:   out->copy      = in->copy;
288:   out->destroy   = in->destroy;
289:   out->view      = in->view;

291:   /* allocate entire send scatter context */
292:   PetscNewLog(out,VecScatter_MPI_General,&out_to);
293:   PetscNewLog(out,VecScatter_MPI_General,&out_from);

295:   ny                = in_to->starts[in_to->n];
296:   out_to->n         = in_to->n;
297:   out_to->type      = in_to->type;
298:   out_to->sendfirst = in_to->sendfirst;

300:   PetscMalloc(out_to->n*sizeof(MPI_Request),&out_to->requests);
301:   PetscMalloc4(bs*ny,PetscScalar,&out_to->values,ny,PetscInt,&out_to->indices,out_to->n+1,PetscInt,&out_to->starts,
302:                       out_to->n,PetscMPIInt,&out_to->procs);
303:   PetscMalloc2(PetscMax(in_to->n,in_from->n),MPI_Status,&out_to->sstatus,PetscMax(in_to->n,in_from->n),MPI_Status,
304:                       &out_to->rstatus);
305:   PetscMemcpy(out_to->indices,in_to->indices,ny*sizeof(PetscInt));
306:   PetscMemcpy(out_to->starts,in_to->starts,(out_to->n+1)*sizeof(PetscInt));
307:   PetscMemcpy(out_to->procs,in_to->procs,(out_to->n)*sizeof(PetscMPIInt));
308: 
309:   out->todata       = (void*)out_to;
310:   out_to->local.n   = in_to->local.n;
311:   out_to->local.nonmatching_computed = PETSC_FALSE;
312:   out_to->local.n_nonmatching        = 0;
313:   out_to->local.slots_nonmatching    = 0;
314:   if (in_to->local.n) {
315:     PetscMalloc(in_to->local.n*sizeof(PetscInt),&out_to->local.vslots);
316:     PetscMalloc(in_from->local.n*sizeof(PetscInt),&out_from->local.vslots);
317:     PetscMemcpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n*sizeof(PetscInt));
318:     PetscMemcpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n*sizeof(PetscInt));
319:   } else {
320:     out_to->local.vslots   = 0;
321:     out_from->local.vslots = 0;
322:   }

324:   /* allocate entire receive context */
325:   out_from->type      = in_from->type;
326:   ny                  = in_from->starts[in_from->n];
327:   out_from->n         = in_from->n;
328:   out_from->sendfirst = in_from->sendfirst;

330:   PetscMalloc(out_from->n*sizeof(MPI_Request),&out_from->requests);
331:   PetscMalloc4(ny*bs,PetscScalar,&out_from->values,ny,PetscInt,&out_from->indices,
332:                       out_from->n+1,PetscInt,&out_from->starts,out_from->n,PetscMPIInt,&out_from->procs);
333:   PetscMemcpy(out_from->indices,in_from->indices,ny*sizeof(PetscInt));
334:   PetscMemcpy(out_from->starts,in_from->starts,(out_from->n+1)*sizeof(PetscInt));
335:   PetscMemcpy(out_from->procs,in_from->procs,(out_from->n)*sizeof(PetscMPIInt));
336:   out->fromdata       = (void*)out_from;
337:   out_from->local.n   = in_from->local.n;
338:   out_from->local.nonmatching_computed = PETSC_FALSE;
339:   out_from->local.n_nonmatching        = 0;
340:   out_from->local.slots_nonmatching    = 0;

 342:   /* 
 343:       set up the persistent request arrays for use with MPI_Send_init() and MPI_Recv_init()
 344:   */
345:   {
346:     PetscMPIInt tag;
347:     MPI_Comm    comm;
348:     PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
349:     PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
350:     PetscInt    i;
351:     PetscTruth  flg;
352:     MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
353:     MPI_Request *rev_swaits,*rev_rwaits;
354:     PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

356:     PetscMalloc(in_to->n*sizeof(MPI_Request),&out_to->rev_requests);
357:     PetscMalloc(in_from->n*sizeof(MPI_Request),&out_from->rev_requests);

359:     rev_rwaits = out_to->rev_requests;
360:     rev_swaits = out_from->rev_requests;

362:     out_from->bs = out_to->bs = bs;
363:     tag     = ((PetscObject)out)->tag;
364:     comm    = ((PetscObject)out)->comm;

366:     /* Register the receives that you will use later (sends for scatter reverse) */
367:     for (i=0; i<out_from->n; i++) {
368:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
369:       MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);
370:     }

372:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&flg);
373:     if (flg) {
374:       out_to->use_readyreceiver    = PETSC_TRUE;
375:       out_from->use_readyreceiver  = PETSC_TRUE;
376:       for (i=0; i<out_to->n; i++) {
377:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
378:       }
379:       if (out_from->n) {MPI_Startall_irecv(out_from->starts[out_from->n]*out_from->bs,out_from->n,out_from->requests);}
380:       MPI_Barrier(comm);
381:       PetscInfo(in,"Using VecScatter ready receiver mode\n");
382:     } else {
383:       out_to->use_readyreceiver    = PETSC_FALSE;
384:       out_from->use_readyreceiver  = PETSC_FALSE;
385:       PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&flg);
386:       if (flg) {
387:         PetscInfo(in,"Using VecScatter Ssend mode\n");
388:       }
389:       for (i=0; i<out_to->n; i++) {
390:         if (!flg) {
391:           MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
392:         } else {
393:           MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
394:         }
395:       }
396:     }
397:     /* Register receives for scatter reverse */
398:     for (i=0; i<out_to->n; i++) {
399:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);
400:     }
401:   }

403:   return(0);
404: }
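/*
   Editorial sketch (not part of the original source): the persistent-request
   pattern registered above.  MPI_Send_init()/MPI_Recv_init() only create the
   request; each scatter later fires it with MPI_Start()/MPI_Startall() and
   completes it with MPI_Wait()/MPI_Waitall(), and the same request is reused
   until MPI_Request_free().  The helper name and arguments are hypothetical.
*/
PETSC_STATIC_INLINE void Sketch_PersistentRecvRound(PetscScalar *buf,PetscMPIInt count,PetscMPIInt peer,PetscMPIInt tag,MPI_Comm comm)
{
  MPI_Request req;
  MPI_Status  status;

  MPI_Recv_init(buf,count,MPIU_SCALAR,peer,tag,comm,&req); /* register the receive once     */
  MPI_Start(&req);                                         /* post it for one scatter       */
  MPI_Wait(&req,&status);                                  /* complete that scatter         */
  MPI_Request_free(&req);                                  /* release when no longer reused */
}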
 405: /* --------------------------------------------------------------------------------------------------
 406:     Packs message data into send buffers and unpacks it from receive buffers.

 408:     These could be generated automatically.

 410:     Fortran kernels etc. could be used.
 411: */
412: PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
413: {
414:   PetscInt i;
415:   for (i=0; i<n; i++) {
416:     y[i]  = x[indicesx[i]];
417:   }
418: }
419: PETSC_STATIC_INLINE void UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
420: {
421:   PetscInt i;
422:   switch (addv) {
423:   case INSERT_VALUES:
424:     for (i=0; i<n; i++) {
425:       y[indicesy[i]] = x[i];
426:     }
427:     break;
428:   case ADD_VALUES:
429:     for (i=0; i<n; i++) {
430:       y[indicesy[i]] += x[i];
431:     }
432:     break;
433: #if !defined(PETSC_USE_COMPLEX)
434:   case MAX_VALUES:
435:     for (i=0; i<n; i++) {
436:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
437:     }
438: #else
439:   case MAX_VALUES:
440: #endif
441:   case NOT_SET_VALUES:
442:     break;
443:   }
444: }

446: PETSC_STATIC_INLINE void Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
447: {
448:   PetscInt i;
449:   switch (addv) {
450:   case INSERT_VALUES:
451:     for (i=0; i<n; i++) {
452:       y[indicesy[i]] = x[indicesx[i]];
453:     }
454:     break;
455:   case ADD_VALUES:
456:     for (i=0; i<n; i++) {
457:       y[indicesy[i]] += x[indicesx[i]];
458:     }
459:     break;
460: #if !defined(PETSC_USE_COMPLEX)
461:   case MAX_VALUES:
462:     for (i=0; i<n; i++) {
463:       y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
464:     }
465: #else
466:   case MAX_VALUES:
467: #endif
468:   case NOT_SET_VALUES:
469:     break;
470:   }
471: }
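/*
   Editorial sketch (not part of the original source): how the bs = 1 kernels
   above fit together.  Pack_1() gathers indexed entries of x into a contiguous
   send buffer, UnPack_1() spreads a contiguous receive buffer into indexed
   entries of y, and Scatter_1() does both in one pass for the purely local
   part.  The function name is hypothetical and purely illustrative.
*/
PETSC_STATIC_INLINE void Sketch_PackUnpack_1(void)
{
  PetscScalar x[4],buf[2],y[4];
  PetscInt    indicesx[2],indicesy[2];

  x[0] = 10.0; x[1] = 11.0; x[2] = 12.0; x[3] = 13.0;
  y[0] = y[1] = y[2] = y[3] = 0.0;
  indicesx[0] = 1; indicesx[1] = 3;                 /* take x[1] and x[3]        */
  indicesy[0] = 0; indicesy[1] = 2;                 /* place them in y[0], y[2]  */

  Pack_1(2,indicesx,x,buf);                         /* buf = {11, 13}            */
  UnPack_1(2,buf,indicesy,y,INSERT_VALUES);         /* y   = {11, 0, 13, 0}      */
  Scatter_1(2,indicesx,x,indicesy,y,INSERT_VALUES); /* same result, no buffer    */
}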

473:   /* ----------------------------------------------------------------------------------------------- */
474: PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
475: {
476:   PetscInt i,idx;

478:   for (i=0; i<n; i++) {
479:     idx   = *indicesx++;
480:     y[0]  = x[idx];
481:     y[1]  = x[idx+1];
482:     y    += 2;
483:   }
484: }
485: PETSC_STATIC_INLINE void UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
486: {
487:   PetscInt i,idy;

489:   switch (addv) {
490:   case INSERT_VALUES:
491:     for (i=0; i<n; i++) {
492:       idy       = *indicesy++;
493:       y[idy]    = x[0];
494:       y[idy+1]  = x[1];
495:       x    += 2;
496:     }
497:     break;
498:   case ADD_VALUES:
499:     for (i=0; i<n; i++) {
500:       idy       = *indicesy++;
501:       y[idy]    += x[0];
502:       y[idy+1]  += x[1];
503:       x    += 2;
504:     }
505:     break;
506: #if !defined(PETSC_USE_COMPLEX)
507:   case MAX_VALUES:
508:     for (i=0; i<n; i++) {
509:       idy       = *indicesy++;
510:       y[idy]    = PetscMax(y[idy],x[0]);
511:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
512:       x    += 2;
513:     }
514: #else
515:   case MAX_VALUES:
516: #endif
517:   case NOT_SET_VALUES:
518:     break;
519:   }
520: }

522: PETSC_STATIC_INLINE void Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
523: {
524:   PetscInt i,idx,idy;

526:   switch (addv) {
527:   case INSERT_VALUES:
528:     for (i=0; i<n; i++) {
529:       idx       = *indicesx++;
530:       idy       = *indicesy++;
531:       y[idy]    = x[idx];
532:       y[idy+1]  = x[idx+1];
533:     }
534:     break;
535:   case ADD_VALUES:
536:     for (i=0; i<n; i++) {
537:       idx       = *indicesx++;
538:       idy       = *indicesy++;
539:       y[idy]    += x[idx];
540:       y[idy+1]  += x[idx+1];
541:     }
542:     break;
543: #if !defined(PETSC_USE_COMPLEX)
544:   case MAX_VALUES:
545:     for (i=0; i<n; i++) {
546:       idx       = *indicesx++;
547:       idy       = *indicesy++;
548:       y[idy]    = PetscMax(y[idy],x[idx]);
549:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
550:     }
551: #else
552:   case MAX_VALUES:
553: #endif
554:   case NOT_SET_VALUES:
555:     break;
556:   }
557: }
558:   /* ----------------------------------------------------------------------------------------------- */
559: PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
560: {
561:   PetscInt i,idx;

563:   for (i=0; i<n; i++) {
564:     idx   = *indicesx++;
565:     y[0]  = x[idx];
566:     y[1]  = x[idx+1];
567:     y[2]  = x[idx+2];
568:     y    += 3;
569:   }
570: }
571: PETSC_STATIC_INLINE void UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
572: {
573:   PetscInt i,idy;

575:   switch (addv) {
576:   case INSERT_VALUES:
577:     for (i=0; i<n; i++) {
578:       idy       = *indicesy++;
579:       y[idy]    = x[0];
580:       y[idy+1]  = x[1];
581:       y[idy+2]  = x[2];
582:       x    += 3;
583:     }
584:     break;
585:   case ADD_VALUES:
586:     for (i=0; i<n; i++) {
587:       idy       = *indicesy++;
588:       y[idy]    += x[0];
589:       y[idy+1]  += x[1];
590:       y[idy+2]  += x[2];
591:       x    += 3;
592:     }
593:     break;
594: #if !defined(PETSC_USE_COMPLEX)
595:   case MAX_VALUES:
596:     for (i=0; i<n; i++) {
597:       idy       = *indicesy++;
598:       y[idy]    = PetscMax(y[idy],x[0]);
599:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
600:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
601:       x    += 3;
602:     }
603: #else
604:   case MAX_VALUES:
605: #endif
606:   case NOT_SET_VALUES:
607:     break;
608:   }
609: }

611: PETSC_STATIC_INLINE void Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
612: {
613:   PetscInt i,idx,idy;

615:   switch (addv) {
616:   case INSERT_VALUES:
617:     for (i=0; i<n; i++) {
618:       idx       = *indicesx++;
619:       idy       = *indicesy++;
620:       y[idy]    = x[idx];
621:       y[idy+1]  = x[idx+1];
622:       y[idy+2]  = x[idx+2];
623:     }
624:     break;
625:   case ADD_VALUES:
626:     for (i=0; i<n; i++) {
627:       idx       = *indicesx++;
628:       idy       = *indicesy++;
629:       y[idy]    += x[idx];
630:       y[idy+1]  += x[idx+1];
631:       y[idy+2]  += x[idx+2];
632:     }
633:     break;
634: #if !defined(PETSC_USE_COMPLEX)
635:   case MAX_VALUES:
636:     for (i=0; i<n; i++) {
637:       idx       = *indicesx++;
638:       idy       = *indicesy++;
639:       y[idy]    = PetscMax(y[idy],x[idx]);
640:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
641:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
642:     }
643: #else
644:   case MAX_VALUES:
645: #endif
646:   case NOT_SET_VALUES:
647:     break;
648:   }
649: }
650:   /* ----------------------------------------------------------------------------------------------- */
651: PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
652: {
653:   PetscInt i,idx;

655:   for (i=0; i<n; i++) {
656:     idx   = *indicesx++;
657:     y[0]  = x[idx];
658:     y[1]  = x[idx+1];
659:     y[2]  = x[idx+2];
660:     y[3]  = x[idx+3];
661:     y    += 4;
662:   }
663: }
664: PETSC_STATIC_INLINE void UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
665: {
666:   PetscInt i,idy;

668:   switch (addv) {
669:   case INSERT_VALUES:
670:     for (i=0; i<n; i++) {
671:       idy       = *indicesy++;
672:       y[idy]    = x[0];
673:       y[idy+1]  = x[1];
674:       y[idy+2]  = x[2];
675:       y[idy+3]  = x[3];
676:       x    += 4;
677:     }
678:     break;
679:   case ADD_VALUES:
680:     for (i=0; i<n; i++) {
681:       idy       = *indicesy++;
682:       y[idy]    += x[0];
683:       y[idy+1]  += x[1];
684:       y[idy+2]  += x[2];
685:       y[idy+3]  += x[3];
686:       x    += 4;
687:     }
688:     break;
689: #if !defined(PETSC_USE_COMPLEX)
690:   case MAX_VALUES:
691:     for (i=0; i<n; i++) {
692:       idy       = *indicesy++;
693:       y[idy]    = PetscMax(y[idy],x[0]);
694:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
695:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
696:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
697:       x    += 4;
698:     }
699: #else
700:   case MAX_VALUES:
701: #endif
702:   case NOT_SET_VALUES:
703:     break;
704:   }
705: }

707: PETSC_STATIC_INLINE void Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
708: {
709:   PetscInt i,idx,idy;

711:   switch (addv) {
712:   case INSERT_VALUES:
713:     for (i=0; i<n; i++) {
714:       idx       = *indicesx++;
715:       idy       = *indicesy++;
716:       y[idy]    = x[idx];
717:       y[idy+1]  = x[idx+1];
718:       y[idy+2]  = x[idx+2];
719:       y[idy+3]  = x[idx+3];
720:     }
721:     break;
722:   case ADD_VALUES:
723:     for (i=0; i<n; i++) {
724:       idx       = *indicesx++;
725:       idy       = *indicesy++;
726:       y[idy]    += x[idx];
727:       y[idy+1]  += x[idx+1];
728:       y[idy+2]  += x[idx+2];
729:       y[idy+3]  += x[idx+3];
730:     }
731:     break;
732: #if !defined(PETSC_USE_COMPLEX)
733:   case MAX_VALUES:
734:     for (i=0; i<n; i++) {
735:       idx       = *indicesx++;
736:       idy       = *indicesy++;
737:       y[idy]    = PetscMax(y[idy],x[idx]);
738:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
739:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
740:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
741:     }
742: #else
743:   case MAX_VALUES:
744: #endif
745:   case NOT_SET_VALUES:
746:     break;
747:   }
748: }
749:   /* ----------------------------------------------------------------------------------------------- */
750: PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
751: {
752:   PetscInt i,idx;

754:   for (i=0; i<n; i++) {
755:     idx   = *indicesx++;
756:     y[0]  = x[idx];
757:     y[1]  = x[idx+1];
758:     y[2]  = x[idx+2];
759:     y[3]  = x[idx+3];
760:     y[4]  = x[idx+4];
761:     y    += 5;
762:   }
763: }
764: PETSC_STATIC_INLINE void UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
765: {
766:   PetscInt i,idy;

768:   switch (addv) {
769:   case INSERT_VALUES:
770:     for (i=0; i<n; i++) {
771:       idy       = *indicesy++;
772:       y[idy]    = x[0];
773:       y[idy+1]  = x[1];
774:       y[idy+2]  = x[2];
775:       y[idy+3]  = x[3];
776:       y[idy+4]  = x[4];
777:       x    += 5;
778:     }
779:     break;
780:   case ADD_VALUES:
781:     for (i=0; i<n; i++) {
782:       idy       = *indicesy++;
783:       y[idy]    += x[0];
784:       y[idy+1]  += x[1];
785:       y[idy+2]  += x[2];
786:       y[idy+3]  += x[3];
787:       y[idy+4]  += x[4];
788:       x    += 5;
789:     }
790:     break;
791: #if !defined(PETSC_USE_COMPLEX)
792:   case MAX_VALUES:
793:     for (i=0; i<n; i++) {
794:       idy       = *indicesy++;
795:       y[idy]    = PetscMax(y[idy],x[0]);
796:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
797:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
798:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
799:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
800:       x    += 5;
801:     }
802: #else
803:   case MAX_VALUES:
804: #endif
805:   case NOT_SET_VALUES:
806:     break;
807:   }
808: }

810: PETSC_STATIC_INLINE void Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
811: {
812:   PetscInt i,idx,idy;

814:   switch (addv) {
815:   case INSERT_VALUES:
816:     for (i=0; i<n; i++) {
817:       idx       = *indicesx++;
818:       idy       = *indicesy++;
819:       y[idy]    = x[idx];
820:       y[idy+1]  = x[idx+1];
821:       y[idy+2]  = x[idx+2];
822:       y[idy+3]  = x[idx+3];
823:       y[idy+4]  = x[idx+4];
824:     }
825:     break;
826:   case ADD_VALUES:
827:     for (i=0; i<n; i++) {
828:       idx       = *indicesx++;
829:       idy       = *indicesy++;
830:       y[idy]    += x[idx];
831:       y[idy+1]  += x[idx+1];
832:       y[idy+2]  += x[idx+2];
833:       y[idy+3]  += x[idx+3];
834:       y[idy+4]  += x[idx+4];
835:     }
836:     break;
837: #if !defined(PETSC_USE_COMPLEX)
838:   case MAX_VALUES:
839:     for (i=0; i<n; i++) {
840:       idx       = *indicesx++;
841:       idy       = *indicesy++;
842:       y[idy]    = PetscMax(y[idy],x[idx]);
843:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
844:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
845:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
846:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
847:     }
848: #else
849:   case MAX_VALUES:
850: #endif
851:   case NOT_SET_VALUES:
852:     break;
853:   }
854: }
855:   /* ----------------------------------------------------------------------------------------------- */
856: PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
857: {
858:   PetscInt i,idx;

860:   for (i=0; i<n; i++) {
861:     idx   = *indicesx++;
862:     y[0]  = x[idx];
863:     y[1]  = x[idx+1];
864:     y[2]  = x[idx+2];
865:     y[3]  = x[idx+3];
866:     y[4]  = x[idx+4];
867:     y[5]  = x[idx+5];
868:     y    += 6;
869:   }
870: }
871: PETSC_STATIC_INLINE void UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
872: {
873:   PetscInt i,idy;

875:   switch (addv) {
876:   case INSERT_VALUES:
877:     for (i=0; i<n; i++) {
878:       idy       = *indicesy++;
879:       y[idy]    = x[0];
880:       y[idy+1]  = x[1];
881:       y[idy+2]  = x[2];
882:       y[idy+3]  = x[3];
883:       y[idy+4]  = x[4];
884:       y[idy+5]  = x[5];
885:       x    += 6;
886:     }
887:     break;
888:   case ADD_VALUES:
889:     for (i=0; i<n; i++) {
890:       idy       = *indicesy++;
891:       y[idy]    += x[0];
892:       y[idy+1]  += x[1];
893:       y[idy+2]  += x[2];
894:       y[idy+3]  += x[3];
895:       y[idy+4]  += x[4];
896:       y[idy+5]  += x[5];
897:       x    += 6;
898:     }
899:     break;
900: #if !defined(PETSC_USE_COMPLEX)
901:   case MAX_VALUES:
902:     for (i=0; i<n; i++) {
903:       idy       = *indicesy++;
904:       y[idy]    = PetscMax(y[idy],x[0]);
905:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
906:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
907:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
908:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
909:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
910:       x    += 6;
911:     }
912: #else
913:   case MAX_VALUES:
914: #endif
915:   case NOT_SET_VALUES:
916:     break;
917:   }
918: }

920: PETSC_STATIC_INLINE void Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
921: {
922:   PetscInt i,idx,idy;

924:   switch (addv) {
925:   case INSERT_VALUES:
926:     for (i=0; i<n; i++) {
927:       idx       = *indicesx++;
928:       idy       = *indicesy++;
929:       y[idy]    = x[idx];
930:       y[idy+1]  = x[idx+1];
931:       y[idy+2]  = x[idx+2];
932:       y[idy+3]  = x[idx+3];
933:       y[idy+4]  = x[idx+4];
934:       y[idy+5]  = x[idx+5];
935:     }
936:     break;
937:   case ADD_VALUES:
938:     for (i=0; i<n; i++) {
939:       idx       = *indicesx++;
940:       idy       = *indicesy++;
941:       y[idy]    += x[idx];
942:       y[idy+1]  += x[idx+1];
943:       y[idy+2]  += x[idx+2];
944:       y[idy+3]  += x[idx+3];
945:       y[idy+4]  += x[idx+4];
946:       y[idy+5]  += x[idx+5];
947:     }
948:     break;
949: #if !defined(PETSC_USE_COMPLEX)
950:   case MAX_VALUES:
951:     for (i=0; i<n; i++) {
952:       idx       = *indicesx++;
953:       idy       = *indicesy++;
954:       y[idy]    = PetscMax(y[idy],x[idx]);
955:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
956:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
957:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
958:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
959:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
960:     }
961: #else
962:   case MAX_VALUES:
963: #endif
964:   case NOT_SET_VALUES:
965:     break;
966:   }
967: }
968:   /* ----------------------------------------------------------------------------------------------- */
969: PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
970: {
971:   PetscInt i,idx;

973:   for (i=0; i<n; i++) {
974:     idx   = *indicesx++;
975:     y[0]  = x[idx];
976:     y[1]  = x[idx+1];
977:     y[2]  = x[idx+2];
978:     y[3]  = x[idx+3];
979:     y[4]  = x[idx+4];
980:     y[5]  = x[idx+5];
981:     y[6]  = x[idx+6];
982:     y    += 7;
983:   }
984: }
985: PETSC_STATIC_INLINE void UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
986: {
987:   PetscInt i,idy;

989:   switch (addv) {
990:   case INSERT_VALUES:
991:     for (i=0; i<n; i++) {
992:       idy       = *indicesy++;
993:       y[idy]    = x[0];
994:       y[idy+1]  = x[1];
995:       y[idy+2]  = x[2];
996:       y[idy+3]  = x[3];
997:       y[idy+4]  = x[4];
998:       y[idy+5]  = x[5];
999:       y[idy+6]  = x[6];
1000:       x    += 7;
1001:     }
1002:     break;
1003:   case ADD_VALUES:
1004:     for (i=0; i<n; i++) {
1005:       idy       = *indicesy++;
1006:       y[idy]    += x[0];
1007:       y[idy+1]  += x[1];
1008:       y[idy+2]  += x[2];
1009:       y[idy+3]  += x[3];
1010:       y[idy+4]  += x[4];
1011:       y[idy+5]  += x[5];
1012:       y[idy+6]  += x[6];
1013:       x    += 7;
1014:     }
1015:     break;
1016: #if !defined(PETSC_USE_COMPLEX)
1017:   case MAX_VALUES:
1018:     for (i=0; i<n; i++) {
1019:       idy       = *indicesy++;
1020:       y[idy]    = PetscMax(y[idy],x[0]);
1021:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1022:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1023:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1024:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1025:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1026:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1027:       x    += 7;
1028:     }
1029: #else
1030:   case MAX_VALUES:
1031: #endif
1032:   case NOT_SET_VALUES:
1033:     break;
1034:   }
1035: }

1037: PETSC_STATIC_INLINE void Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1038: {
1039:   PetscInt i,idx,idy;

1041:   switch (addv) {
1042:   case INSERT_VALUES:
1043:     for (i=0; i<n; i++) {
1044:       idx       = *indicesx++;
1045:       idy       = *indicesy++;
1046:       y[idy]    = x[idx];
1047:       y[idy+1]  = x[idx+1];
1048:       y[idy+2]  = x[idx+2];
1049:       y[idy+3]  = x[idx+3];
1050:       y[idy+4]  = x[idx+4];
1051:       y[idy+5]  = x[idx+5];
1052:       y[idy+6]  = x[idx+6];
1053:     }
1054:     break;
1055:   case ADD_VALUES:
1056:     for (i=0; i<n; i++) {
1057:       idx       = *indicesx++;
1058:       idy       = *indicesy++;
1059:       y[idy]    += x[idx];
1060:       y[idy+1]  += x[idx+1];
1061:       y[idy+2]  += x[idx+2];
1062:       y[idy+3]  += x[idx+3];
1063:       y[idy+4]  += x[idx+4];
1064:       y[idy+5]  += x[idx+5];
1065:       y[idy+6]  += x[idx+6];
1066:     }
1067:     break;
1068: #if !defined(PETSC_USE_COMPLEX)
1069:   case MAX_VALUES:
1070:     for (i=0; i<n; i++) {
1071:       idx       = *indicesx++;
1072:       idy       = *indicesy++;
1073:       y[idy]    = PetscMax(y[idy],x[idx]);
1074:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1075:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1076:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1077:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1078:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1079:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1080:     }
1081: #else
1082:   case MAX_VALUES:
1083: #endif
1084:   case NOT_SET_VALUES:
1085:     break;
1086:   }
1087: }
1088:   /* ----------------------------------------------------------------------------------------------- */
1089: PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1090: {
1091:   PetscInt i,idx;

1093:   for (i=0; i<n; i++) {
1094:     idx   = *indicesx++;
1095:     y[0]  = x[idx];
1096:     y[1]  = x[idx+1];
1097:     y[2]  = x[idx+2];
1098:     y[3]  = x[idx+3];
1099:     y[4]  = x[idx+4];
1100:     y[5]  = x[idx+5];
1101:     y[6]  = x[idx+6];
1102:     y[7]  = x[idx+7];
1103:     y    += 8;
1104:   }
1105: }
1106: PETSC_STATIC_INLINE void UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1107: {
1108:   PetscInt i,idy;

1110:   switch (addv) {
1111:   case INSERT_VALUES:
1112:     for (i=0; i<n; i++) {
1113:       idy       = *indicesy++;
1114:       y[idy]    = x[0];
1115:       y[idy+1]  = x[1];
1116:       y[idy+2]  = x[2];
1117:       y[idy+3]  = x[3];
1118:       y[idy+4]  = x[4];
1119:       y[idy+5]  = x[5];
1120:       y[idy+6]  = x[6];
1121:       y[idy+7]  = x[7];
1122:       x    += 8;
1123:     }
1124:     break;
1125:   case ADD_VALUES:
1126:     for (i=0; i<n; i++) {
1127:       idy       = *indicesy++;
1128:       y[idy]    += x[0];
1129:       y[idy+1]  += x[1];
1130:       y[idy+2]  += x[2];
1131:       y[idy+3]  += x[3];
1132:       y[idy+4]  += x[4];
1133:       y[idy+5]  += x[5];
1134:       y[idy+6]  += x[6];
1135:       y[idy+7]  += x[7];
1136:       x    += 8;
1137:     }
1138:     break;
1139: #if !defined(PETSC_USE_COMPLEX)
1140:   case MAX_VALUES:
1141:     for (i=0; i<n; i++) {
1142:       idy       = *indicesy++;
1143:       y[idy]    = PetscMax(y[idy],x[0]);
1144:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1145:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1146:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1147:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1148:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1149:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1150:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1151:       x    += 8;
1152:     }
1153: #else
1154:   case MAX_VALUES:
1155: #endif
1156:   case NOT_SET_VALUES:
1157:     break;
1158:   }
1159: }

1161: PETSC_STATIC_INLINE void Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1162: {
1163:   PetscInt i,idx,idy;

1165:   switch (addv) {
1166:   case INSERT_VALUES:
1167:     for (i=0; i<n; i++) {
1168:       idx       = *indicesx++;
1169:       idy       = *indicesy++;
1170:       y[idy]    = x[idx];
1171:       y[idy+1]  = x[idx+1];
1172:       y[idy+2]  = x[idx+2];
1173:       y[idy+3]  = x[idx+3];
1174:       y[idy+4]  = x[idx+4];
1175:       y[idy+5]  = x[idx+5];
1176:       y[idy+6]  = x[idx+6];
1177:       y[idy+7]  = x[idx+7];
1178:     }
1179:     break;
1180:   case ADD_VALUES:
1181:     for (i=0; i<n; i++) {
1182:       idx       = *indicesx++;
1183:       idy       = *indicesy++;
1184:       y[idy]    += x[idx];
1185:       y[idy+1]  += x[idx+1];
1186:       y[idy+2]  += x[idx+2];
1187:       y[idy+3]  += x[idx+3];
1188:       y[idy+4]  += x[idx+4];
1189:       y[idy+5]  += x[idx+5];
1190:       y[idy+6]  += x[idx+6];
1191:       y[idy+7]  += x[idx+7];
1192:     }
1193:     break;
1194: #if !defined(PETSC_USE_COMPLEX)
1195:   case MAX_VALUES:
1196:     for (i=0; i<n; i++) {
1197:       idx       = *indicesx++;
1198:       idy       = *indicesy++;
1199:       y[idy]    = PetscMax(y[idy],x[idx]);
1200:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1201:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1202:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1203:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1204:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1205:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1206:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1207:     }
1208: #else
1209:   case MAX_VALUES:
1210: #endif
1211:   case NOT_SET_VALUES:
1212:     break;
1213:   }
1214: }

1216:   /* ----------------------------------------------------------------------------------------------- */
1217: PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y)
1218: {
1219:   PetscInt i,idx;

1221:   for (i=0; i<n; i++) {
1222:     idx   = *indicesx++;
1223:     y[0]  = x[idx];
1224:     y[1]  = x[idx+1];
1225:     y[2]  = x[idx+2];
1226:     y[3]  = x[idx+3];
1227:     y[4]  = x[idx+4];
1228:     y[5]  = x[idx+5];
1229:     y[6]  = x[idx+6];
1230:     y[7]  = x[idx+7];
1231:     y[8]  = x[idx+8];
1232:     y[9]  = x[idx+9];
1233:     y[10] = x[idx+10];
1234:     y[11] = x[idx+11];
1235:     y    += 12;
1236:   }
1237: }
1238: PETSC_STATIC_INLINE void UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1239: {
1240:   PetscInt i,idy;

1242:   switch (addv) {
1243:   case INSERT_VALUES:
1244:     for (i=0; i<n; i++) {
1245:       idy       = *indicesy++;
1246:       y[idy]    = x[0];
1247:       y[idy+1]  = x[1];
1248:       y[idy+2]  = x[2];
1249:       y[idy+3]  = x[3];
1250:       y[idy+4]  = x[4];
1251:       y[idy+5]  = x[5];
1252:       y[idy+6]  = x[6];
1253:       y[idy+7]  = x[7];
1254:       y[idy+8]  = x[8];
1255:       y[idy+9]  = x[9];
1256:       y[idy+10] = x[10];
1257:       y[idy+11] = x[11];
1258:       x    += 12;
1259:     }
1260:     break;
1261:   case ADD_VALUES:
1262:     for (i=0; i<n; i++) {
1263:       idy       = *indicesy++;
1264:       y[idy]    += x[0];
1265:       y[idy+1]  += x[1];
1266:       y[idy+2]  += x[2];
1267:       y[idy+3]  += x[3];
1268:       y[idy+4]  += x[4];
1269:       y[idy+5]  += x[5];
1270:       y[idy+6]  += x[6];
1271:       y[idy+7]  += x[7];
1272:       y[idy+8]  += x[8];
1273:       y[idy+9]  += x[9];
1274:       y[idy+10] += x[10];
1275:       y[idy+11] += x[11];
1276:       x    += 12;
1277:     }
1278:     break;
1279: #if !defined(PETSC_USE_COMPLEX)
1280:   case MAX_VALUES:
1281:     for (i=0; i<n; i++) {
1282:       idy       = *indicesy++;
1283:       y[idy]    = PetscMax(y[idy],x[0]);
1284:       y[idy+1]  = PetscMax(y[idy+1],x[1]);
1285:       y[idy+2]  = PetscMax(y[idy+2],x[2]);
1286:       y[idy+3]  = PetscMax(y[idy+3],x[3]);
1287:       y[idy+4]  = PetscMax(y[idy+4],x[4]);
1288:       y[idy+5]  = PetscMax(y[idy+5],x[5]);
1289:       y[idy+6]  = PetscMax(y[idy+6],x[6]);
1290:       y[idy+7]  = PetscMax(y[idy+7],x[7]);
1291:       y[idy+8]  = PetscMax(y[idy+8],x[8]);
1292:       y[idy+9]  = PetscMax(y[idy+9],x[9]);
1293:       y[idy+10] = PetscMax(y[idy+10],x[10]);
1294:       y[idy+11] = PetscMax(y[idy+11],x[11]);
1295:       x    += 12;
1296:     }
1297: #else
1298:   case MAX_VALUES:
1299: #endif
1300:   case NOT_SET_VALUES:
1301:     break;
1302:   }
1303: }

1305: PETSC_STATIC_INLINE void Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv)
1306: {
1307:   PetscInt i,idx,idy;

1309:   switch (addv) {
1310:   case INSERT_VALUES:
1311:     for (i=0; i<n; i++) {
1312:       idx       = *indicesx++;
1313:       idy       = *indicesy++;
1314:       y[idy]    = x[idx];
1315:       y[idy+1]  = x[idx+1];
1316:       y[idy+2]  = x[idx+2];
1317:       y[idy+3]  = x[idx+3];
1318:       y[idy+4]  = x[idx+4];
1319:       y[idy+5]  = x[idx+5];
1320:       y[idy+6]  = x[idx+6];
1321:       y[idy+7]  = x[idx+7];
1322:       y[idy+8]  = x[idx+8];
1323:       y[idy+9]  = x[idx+9];
1324:       y[idy+10] = x[idx+10];
1325:       y[idy+11] = x[idx+11];
1326:     }
1327:     break;
1328:   case ADD_VALUES:
1329:     for (i=0; i<n; i++) {
1330:       idx       = *indicesx++;
1331:       idy       = *indicesy++;
1332:       y[idy]    += x[idx];
1333:       y[idy+1]  += x[idx+1];
1334:       y[idy+2]  += x[idx+2];
1335:       y[idy+3]  += x[idx+3];
1336:       y[idy+4]  += x[idx+4];
1337:       y[idy+5]  += x[idx+5];
1338:       y[idy+6]  += x[idx+6];
1339:       y[idy+7]  += x[idx+7];
1340:       y[idy+8]  += x[idx+8];
1341:       y[idy+9]  += x[idx+9];
1342:       y[idy+10] += x[idx+10];
1343:       y[idy+11] += x[idx+11];
1344:     }
1345:     break;
1346: #if !defined(PETSC_USE_COMPLEX)
1347:   case MAX_VALUES:
1348:     for (i=0; i<n; i++) {
1349:       idx       = *indicesx++;
1350:       idy       = *indicesy++;
1351:       y[idy]    = PetscMax(y[idy],x[idx]);
1352:       y[idy+1]  = PetscMax(y[idy+1],x[idx+1]);
1353:       y[idy+2]  = PetscMax(y[idy+2],x[idx+2]);
1354:       y[idy+3]  = PetscMax(y[idy+3],x[idx+3]);
1355:       y[idy+4]  = PetscMax(y[idy+4],x[idx+4]);
1356:       y[idy+5]  = PetscMax(y[idy+5],x[idx+5]);
1357:       y[idy+6]  = PetscMax(y[idy+6],x[idx+6]);
1358:       y[idy+7]  = PetscMax(y[idy+7],x[idx+7]);
1359:       y[idy+8]  = PetscMax(y[idy+8],x[idx+8]);
1360:       y[idy+9]  = PetscMax(y[idy+9],x[idx+9]);
1361:       y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
1362:       y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
1363:     }
1364: #else
1365:   case MAX_VALUES:
1366: #endif
1367:   case NOT_SET_VALUES:
1368:     break;
1369:   }
1370: }

1372: /* Create the VecScatterBegin/End_P for our chosen block sizes */
 1373: #define BS 1
 1374:  #include "src/vec/vec/utils/vpscat.h"
 1375: #define BS 2
 1376:  #include "src/vec/vec/utils/vpscat.h"
 1377: #define BS 3
 1378:  #include "src/vec/vec/utils/vpscat.h"
 1379: #define BS 4
 1380:  #include "src/vec/vec/utils/vpscat.h"
 1381: #define BS 5
 1382:  #include "src/vec/vec/utils/vpscat.h"
 1383: #define BS 6
 1384:  #include "src/vec/vec/utils/vpscat.h"
 1385: #define BS 7
 1386:  #include "src/vec/vec/utils/vpscat.h"
 1387: #define BS 8
 1388:  #include "src/vec/vec/utils/vpscat.h"
 1389: #define BS 12
 1390:  #include "src/vec/vec/utils/vpscat.h"
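/*
   Editorial note (not part of the original source): vpscat.h is an include
   template.  Each "#define BS n" followed by the #include instantiates the
   VecScatterBegin/End (and related) routines specialized for that block size,
   and the header is expected to #undef BS so it can be included again.  A
   minimal standalone sketch of the same idiom, with hypothetical file and
   macro names:

       mykernels.h:
           static void EXPAND(AddBlocks_,BS)(const PetscScalar *x,PetscScalar *y)
           {
             PetscInt i;
             for (i=0; i<BS; i++) y[i] += x[i];
           }
           #undef BS

       user file:
           #define PASTE(a,b)  a ## b
           #define EXPAND(a,b) PASTE(a,b)
           #define BS 3
           #include "mykernels.h"        generates AddBlocks_3()
           #define BS 4
           #include "mykernels.h"        generates AddBlocks_4()
*/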

1392: /* ==========================================================================================*/

1394: /*              create parallel to sequential scatter context                           */

1396: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *,VecScatter_MPI_General *,VecScatter);

 1400: /*@
 1401:      VecScatterCreateLocal - Creates a VecScatter from a list of messages it must send and receive.

 1403:      Collective on VecScatter

 1405:    Input Parameters:
 1406: +     ctx - the scatter context, obtained with VecScatterCreateEmpty()
 1407: .     nsends - number of messages this process sends
 1408: .     sendSizes - number of entries in each send message
 1409: .     sendProcs - ranks the send messages go to
 1410: .     sendIdx - indices where the sent entries are obtained from (in local, on-process numbering); this is one long array of size \sum_{i=0,i<nsends} sendSizes[i]
 1411: .     nrecvs - number of receives to expect
 1412: .     recvSizes - number of entries in each receive message
 1413: .     recvProcs - ranks that are sending to me
 1414: .     recvIdx - indices where the received entries are to be put (in local, on-process numbering); this is one long array of size \sum_{i=0,i<nrecvs} recvSizes[i]
 1415: -     bs - block size

 1417:      Notes:  sendSizes[] and recvSizes[] cannot have any 0 entries. If you want to support 0 entries you need
 1418:       to change the code below to "compress out" the sendProcs[] and recvProcs[] entries that have 0 entries.

 1420:        Probably does not handle sends to self properly. It should remove those from the counts that are used
 1421:       in allocating space inside of the from struct.

 1423: @*/
1424: PetscErrorCode VecScatterCreateLocal(VecScatter ctx,PetscInt nsends,const PetscInt sendSizes[],const PetscInt sendProcs[],const PetscInt sendIdx[],PetscInt nrecvs,const PetscInt recvSizes[],const PetscInt recvProcs[],const PetscInt recvIdx[],PetscInt bs)
1425: {
1426:   VecScatter_MPI_General *from, *to;
1427:   PetscInt               sendSize, recvSize;
1428:   PetscInt               n, i;
1429:   PetscErrorCode         ierr;

1431:   /* allocate entire send scatter context */
1432:   PetscNewLog(ctx,VecScatter_MPI_General,&to);
1433:   to->n = nsends;
1434:   for(n = 0, sendSize = 0; n < to->n; n++) {sendSize += sendSizes[n];}
1435:   PetscMalloc(to->n*sizeof(MPI_Request),&to->requests);
1436:   PetscMalloc4(bs*sendSize,PetscScalar,&to->values,sendSize,PetscInt,&to->indices,to->n+1,PetscInt,&to->starts,to->n,PetscMPIInt,&to->procs);
1437:   PetscMalloc2(PetscMax(to->n,nrecvs),MPI_Status,&to->sstatus,PetscMax(to->n,nrecvs),MPI_Status,&to->rstatus);
1438:   to->starts[0] = 0;
1439:   for(n = 0; n < to->n; n++) {
1440:     if (sendSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"sendSizes[n=%D] = %D cannot be less than 1",n,sendSizes[n]);
1441:     to->starts[n+1] = to->starts[n] + sendSizes[n];
1442:     to->procs[n] = sendProcs[n];
1443:     for(i = to->starts[n]; i < to->starts[n]+sendSizes[n]; i++) {
1444:       to->indices[i] = sendIdx[i];
1445:     }
1446:   }
1447:   ctx->todata = (void *) to;

1449:   /* allocate entire receive scatter context */
1450:   PetscNewLog(ctx,VecScatter_MPI_General,&from);
1451:   from->n = nrecvs;
1452:   for(n = 0, recvSize = 0; n < from->n; n++) {recvSize += recvSizes[n];}
1453:   PetscMalloc(from->n*sizeof(MPI_Request),&from->requests);
1454:   PetscMalloc4(bs*recvSize,PetscScalar,&from->values,recvSize,PetscInt,&from->indices,from->n+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1455:   from->starts[0] = 0;
1456:   for(n = 0; n < from->n; n++) {
1457:     if (recvSizes[n] <=0 ) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE,"recvSizes[n=%D] = %D cannot be less than 1",n,recvSizes[n]);
1458:     from->starts[n+1] = from->starts[n] + recvSizes[n];
1459:     from->procs[n] = recvProcs[n];
1460:     for(i = from->starts[n]; i < from->starts[n]+recvSizes[n]; i++) {
1461:       from->indices[i] = recvIdx[i];
1462:     }
1463:   }
1464:   ctx->fromdata = (void *)from;

1466:   /* No local scatter optimization */
1467:   from->local.n      = 0;
1468:   from->local.vslots = 0;
1469:   to->local.n        = 0;
1470:   to->local.vslots   = 0;
1471:   from->local.nonmatching_computed = PETSC_FALSE;
1472:   from->local.n_nonmatching        = 0;
1473:   from->local.slots_nonmatching    = 0;
1474:   to->local.nonmatching_computed   = PETSC_FALSE;
1475:   to->local.n_nonmatching          = 0;
1476:   to->local.slots_nonmatching      = 0;

1478:   from->type = VEC_SCATTER_MPI_GENERAL;
1479:   to->type   = VEC_SCATTER_MPI_GENERAL;
1480:   from->bs = bs;
1481:   to->bs   = bs;

1483:   VecScatterCreateCommon_PtoS(from, to, ctx);
1484:   return(0);
1485: }
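/*
   Editorial sketch (not part of the original source): a hypothetical helper
   showing how the arguments of VecScatterCreateLocal() fit together.  Each
   interior rank sends its first two local entries to rank+1 and receives two
   entries from rank-1 into local slots 2 and 3; ctx is assumed to have been
   obtained from VecScatterCreateEmpty() as the manual page above states.
*/
static PetscErrorCode Sketch_CreateLocal(VecScatter ctx,PetscMPIInt rank)
{
  PetscInt sendSizes[1],sendProcs[1],sendIdx[2],recvSizes[1],recvProcs[1],recvIdx[2];

  sendSizes[0] = 2; sendProcs[0] = rank+1;   /* one message of two entries to rank+1   */
  sendIdx[0]   = 0; sendIdx[1]   = 1;        /* taken from local positions 0 and 1     */
  recvSizes[0] = 2; recvProcs[0] = rank-1;   /* one message of two entries from rank-1 */
  recvIdx[0]   = 2; recvIdx[1]   = 3;        /* placed into local positions 2 and 3    */
  return VecScatterCreateLocal(ctx,1,sendSizes,sendProcs,sendIdx,1,recvSizes,recvProcs,recvIdx,1);
}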

1487: /*
1488:    bs indicates how many elements there are in each block. Normally this would be 1.
1489: */
1492: PetscErrorCode VecScatterCreate_PtoS(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1493: {
1494:   VecScatter_MPI_General *from,*to;
1495:   PetscMPIInt            size,rank,imdex,tag,n;
1496:   PetscInt               *source = PETSC_NULL,*lens = PETSC_NULL,*owners = PETSC_NULL;
1497:   PetscInt               *lowner = PETSC_NULL,*start = PETSC_NULL,lengthy,lengthx;
1498:   PetscInt               *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs;
1499:   PetscInt               *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1500:   PetscInt               *rvalues,*svalues,base,nmax,*values,*indx,nprocslocal;
1501:   MPI_Comm               comm;
1502:   MPI_Request            *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1503:   MPI_Status             recv_status,*send_status;
1504:   PetscErrorCode         ierr;

1507:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
1508:   PetscObjectGetComm((PetscObject)xin,&comm);
1509:   MPI_Comm_rank(comm,&rank);
1510:   MPI_Comm_size(comm,&size);
1511:   owners = xin->map.range;
1512:   VecGetSize(yin,&lengthy);
1513:   VecGetSize(xin,&lengthx);

1515:   /*  first count number of contributors to each processor */
1516:   PetscMalloc2(2*size,PetscInt,&nprocs,nx,PetscInt,&owner);
1517:   PetscMemzero(nprocs,2*size*sizeof(PetscInt));
1518:   j      = 0;
1519:   nsends = 0;
1520:   for (i=0; i<nx; i++) {
1521:     idx = inidx[i];
1522:     if (idx < owners[j]) j = 0;
1523:     for (; j<size; j++) {
1524:       if (idx < owners[j+1]) {
1525:         if (!nprocs[2*j]++) nsends++;
1526:         nprocs[2*j+1] = 1;
1527:         owner[i] = j;
1528:         break;
1529:       }
1530:     }
1531:   }
1532:   nprocslocal    = nprocs[2*rank];
1533:   nprocs[2*rank] = nprocs[2*rank+1] = 0;
1534:   if (nprocslocal) nsends--;
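  /*
     Editorial note (not part of the original source): nprocs[] is interleaved;
     nprocs[2*j] counts the entries this process must send to rank j and
     nprocs[2*j+1] is a 0/1 flag marking whether any message goes to rank j at
     all, while owner[i] records the owning rank of inidx[i].  Entries we own
     ourselves are removed from the counts here and handled by the "local"
     scatter set up at the end of this routine.
  */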

1536:   /* inform other processors of number of messages and max length*/
1537:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);

1539:   /* post receives:   */
1540:   PetscMalloc4(nrecvs*nmax,PetscInt,&rvalues,nrecvs,PetscInt,&lens,nrecvs,PetscInt,&source,nrecvs,MPI_Request,&recv_waits);
1541:   for (i=0; i<nrecvs; i++) {
1542:     MPI_Irecv((rvalues+nmax*i),nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
1543:   }

1545:   /* do sends:
1546:      1) starts[i] gives the starting index in svalues for stuff going to 
1547:      the ith processor
1548:   */
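  /*
     Editorial illustration (not part of the original source): with 3 ranks and
     send counts {0, 2, 1}, the prefix sums give starts = {0, 0, 2} (and
     {0, 0, 2, 3} once the final total is appended).  The first loop below uses
     starts[] as running write cursors into svalues, which is why it is rebuilt
     before the MPI_Isend() calls are posted.
  */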
1549:   PetscMalloc3(nx,PetscInt,&svalues,nsends,MPI_Request,&send_waits,size+1,PetscInt,&starts);
1550:   starts[0]  = 0;
1551:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1552:   for (i=0; i<nx; i++) {
1553:     if (owner[i] != rank) {
1554:       svalues[starts[owner[i]]++] = inidx[i];
1555:     }
1556:   }

1558:   starts[0] = 0;
1559:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
1560:   count = 0;
1561:   for (i=0; i<size; i++) {
1562:     if (nprocs[2*i+1]) {
1563:       MPI_Isend(svalues+starts[i],nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count++);
1564:     }
1565:   }

1567:   /*  wait on receives */
1568:   count  = nrecvs;
1569:   slen   = 0;
1570:   while (count) {
1571:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
1572:     /* unpack receives into our local space */
1573:     MPI_Get_count(&recv_status,MPIU_INT,&n);
1574:     source[imdex]  = recv_status.MPI_SOURCE;
1575:     lens[imdex]    = n;
1576:     slen          += n;
1577:     count--;
1578:   }
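  /*
     Editorial note (not part of the original source): each receive was posted
     with the worst-case length nmax and MPI_ANY_SOURCE, so MPI_Get_count()
     recovers how many indices actually arrived and recv_status.MPI_SOURCE
     identifies the sender.
  */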
1579: 
1580:   /* allocate entire send scatter context */
1581:   PetscNewLog(ctx,VecScatter_MPI_General,&to);
1582:   to->n = nrecvs;
1583:   PetscMalloc(nrecvs*sizeof(MPI_Request),&to->requests);
1584:   PetscMalloc4(bs*slen,PetscScalar,&to->values,slen,PetscInt,&to->indices,nrecvs+1,PetscInt,&to->starts,
1585:                        nrecvs,PetscMPIInt,&to->procs);
1586:   PetscMalloc2(PetscMax(to->n,nsends),MPI_Status,&to->sstatus,PetscMax(to->n,nsends),MPI_Status,
1587:                        &to->rstatus);
1588:   ctx->todata   = (void*)to;
1589:   to->starts[0] = 0;

1591:   if (nrecvs) {
1592:     PetscMalloc(nrecvs*sizeof(PetscInt),&indx);
1593:     for (i=0; i<nrecvs; i++) indx[i] = i;
1594:     PetscSortIntWithPermutation(nrecvs,source,indx);

1596:     /* move the data into the send scatter */
1597:     base = owners[rank];
1598:     for (i=0; i<nrecvs; i++) {
1599:       to->starts[i+1] = to->starts[i] + lens[indx[i]];
1600:       to->procs[i]    = source[indx[i]];
1601:       values = rvalues + indx[i]*nmax;
1602:       for (j=0; j<lens[indx[i]]; j++) {
1603:         to->indices[to->starts[i] + j] = values[j] - base;
1604:       }
1605:     }
1606:     PetscFree(indx);
1607:   }
1608:   PetscFree4(rvalues,lens,source,recv_waits);
1609: 
1610:   /* allocate entire receive scatter context */
1611:   PetscNewLog(ctx,VecScatter_MPI_General,&from);
1612:   from->n = nsends;

1614:   PetscMalloc(nsends*sizeof(MPI_Request),&from->requests);
1615:   PetscMalloc4((ny-nprocslocal)*bs,PetscScalar,&from->values,ny-nprocslocal,PetscInt,&from->indices,
1616:                       nsends+1,PetscInt,&from->starts,from->n,PetscMPIInt,&from->procs);
1617:   ctx->fromdata  = (void*)from;

1619:   /* move data into receive scatter */
1620:   PetscMalloc2(size,PetscInt,&lowner,nsends+1,PetscInt,&start);
1621:   count = 0; from->starts[0] = start[0] = 0;
1622:   for (i=0; i<size; i++) {
1623:     if (nprocs[2*i+1]) {
1624:       lowner[i]            = count;
1625:       from->procs[count++] = i;
1626:       from->starts[count]  = start[count] = start[count-1] + nprocs[2*i];
1627:     }
1628:   }
1629:   for (i=0; i<nx; i++) {
1630:     if (owner[i] != rank) {
1631:       from->indices[start[lowner[owner[i]]]++] = inidy[i];
1632:       if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1633:     }
1634:   }
1635:   PetscFree2(lowner,start);
1636:   PetscFree2(nprocs,owner);
1637: 
1638:   /* wait on sends */
1639:   if (nsends) {
1640:     PetscMalloc(nsends*sizeof(MPI_Status),&send_status);
1641:     MPI_Waitall(nsends,send_waits,send_status);
1642:     PetscFree(send_status);
1643:   }
1644:   PetscFree3(svalues,send_waits,starts);

1646:   if (nprocslocal) {
1647:     PetscInt nt = from->local.n = to->local.n = nprocslocal;
1648:     /* we have a scatter to ourselves */
1649:     PetscMalloc(nt*sizeof(PetscInt),&to->local.vslots);
1650:     PetscMalloc(nt*sizeof(PetscInt),&from->local.vslots);
1651:     nt   = 0;
1652:     for (i=0; i<nx; i++) {
1653:       idx = inidx[i];
1654:       if (idx >= owners[rank] && idx < owners[rank+1]) {
1655:         to->local.vslots[nt]     = idx - owners[rank];
1656:         from->local.vslots[nt++] = inidy[i];
1657:         if (inidy[i] >= lengthy) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
1658:       }
1659:     }
1660:   } else {
1661:     from->local.n      = 0;
1662:     from->local.vslots = 0;
1663:     to->local.n        = 0;
1664:     to->local.vslots   = 0;
1665:   }
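  /* The self-to-self portion never touches MPI: entries at to->local.vslots[] of the source
     are copied straight to from->local.vslots[] of the destination when the scatter is run. */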
1666:   from->local.nonmatching_computed = PETSC_FALSE;
1667:   from->local.n_nonmatching        = 0;
1668:   from->local.slots_nonmatching    = 0;
1669:   to->local.nonmatching_computed   = PETSC_FALSE;
1670:   to->local.n_nonmatching          = 0;
1671:   to->local.slots_nonmatching      = 0;

1673:   from->type = VEC_SCATTER_MPI_GENERAL;
1674:   to->type   = VEC_SCATTER_MPI_GENERAL;
1675:   from->bs = bs;
1676:   to->bs   = bs;

1678:   VecScatterCreateCommon_PtoS(from,to,ctx);
1679:   return(0);
1680: }

1682: /*
1683:    bs indicates how many elements there are in each block. Normally this would be 1.
1684: */
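/* For example, with bs = 3 one scatter index stands for three consecutive scalars, so block
   index i covers vector entries 3*i, 3*i+1 and 3*i+2; the starts[] arrays below count blocks
   and are multiplied by bs when the actual MPI messages are posted. */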
1687: PetscErrorCode VecScatterCreateCommon_PtoS(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
1688: {
1689:   MPI_Comm       comm = ((PetscObject)ctx)->comm;
1690:   PetscMPIInt    tag  = ((PetscObject)ctx)->tag, tagr;
1691:   PetscInt       bs   = to->bs;
1692:   PetscMPIInt    size;
1693:   PetscInt       i, n;
1695: 
1697:   PetscObjectGetNewTag((PetscObject)ctx,&tagr);
1698:   ctx->destroy = VecScatterDestroy_PtoP;

1700:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_sendfirst",&to->sendfirst);
1701:   from->sendfirst = to->sendfirst;

1703:   MPI_Comm_size(comm,&size);
1704:   /* check if the receives are ALL going into contiguous locations; if so, indexing can be skipped */
1705:   to->contiq = PETSC_FALSE;
1706:   n = from->starts[from->n];
1707:   from->contiq = PETSC_TRUE;
1708:   for (i=1; i<n; i++) {
1709:     if (from->indices[i] != from->indices[i-1] + bs) {
1710:       from->contiq = PETSC_FALSE;
1711:       break;
1712:     }
1713:   }

1715:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_alltoall",&to->use_alltoallv);
1716:   from->use_alltoallv = to->use_alltoallv;
1717:   if (from->use_alltoallv) PetscInfo(ctx,"Using MPI_Alltoallv() for scatter\n");
1718: #if defined(PETSC_HAVE_MPI_ALLTOALLW) 
1719:   if (to->use_alltoallv) {
1720:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_nopack",&to->use_alltoallw);
1721:   }
1722:   from->use_alltoallw = to->use_alltoallw;
1723:   if (from->use_alltoallw) PetscInfo(ctx,"Using MPI_Alltoallw() for scatter\n");
1724: #endif

1726: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1727:   PetscOptionsHasName(PETSC_NULL,"-vecscatter_window",&to->use_window);
1728:   from->use_window = to->use_window;
1729: #endif
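  /* These modes are chosen at runtime, e.g. (hypothetical executable name):
       mpiexec -n 8 ./app -vecscatter_alltoall                      use MPI_Alltoallv
       mpiexec -n 8 ./app -vecscatter_alltoall -vecscatter_nopack   also use MPI_Alltoallw
       mpiexec -n 8 ./app -vecscatter_window                        use one-sided windows
     the default below is persistent point-to-point sends and receives. */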

1731:   if (to->use_alltoallv) {

1733:     PetscMalloc2(size,PetscMPIInt,&to->counts,size,PetscMPIInt,&to->displs);
1734:     PetscMemzero(to->counts,size*sizeof(PetscMPIInt));
1735:     for (i=0; i<to->n; i++) {
1736:       to->counts[to->procs[i]] = bs*(to->starts[i+1] - to->starts[i]);
1737:     }
1738:     to->displs[0] = 0;
1739:     for (i=1; i<size; i++) {
1740:       to->displs[i] = to->displs[i-1] + to->counts[i-1];
1741:     }

1743:     PetscMalloc2(size,PetscMPIInt,&from->counts,size,PetscMPIInt,&from->displs);
1744:     PetscMemzero(from->counts,size*sizeof(PetscMPIInt));
1745:     for (i=0; i<from->n; i++) {
1746:       from->counts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1747:     }
1748:     from->displs[0] = 0;
1749:     for (i=1; i<size; i++) {
1750:       from->displs[i] = from->displs[i-1] + from->counts[i-1];
1751:     }
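    /* Worked example (made-up counts): with size = 4 and counts = {0,6,0,2} the prefix sum
       gives displs = {0,0,6,6}, i.e. rank 1's entries start at scalar 0 and rank 3's at
       scalar 6 of the contiguous pack buffer. */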
1752: #if defined(PETSC_HAVE_MPI_ALLTOALLW) 
1753:     if (to->use_alltoallw) {
1754:       PetscMPIInt mpibs = (PetscMPIInt)bs, mpilen;
1755:       ctx->packtogether = PETSC_FALSE;
1756:       PetscMalloc3(size,PetscMPIInt,&to->wcounts,size,PetscMPIInt,&to->wdispls,size,MPI_Datatype,&to->types);
1757:       PetscMemzero(to->wcounts,size*sizeof(PetscMPIInt));
1758:       PetscMemzero(to->wdispls,size*sizeof(PetscMPIInt));
1759:       for (i=0; i<size; i++) {
1760:         to->types[i] = MPIU_SCALAR;
1761:       }

1763:       for (i=0; i<to->n; i++) {
1764:         to->wcounts[to->procs[i]] = 1;
1765:         mpilen = (PetscMPIInt)(to->starts[i+1] - to->starts[i]);
1766:         MPI_Type_create_indexed_block(mpilen,mpibs,to->indices+to->starts[i],MPIU_SCALAR,to->types+to->procs[i]);
1767:         MPI_Type_commit(to->types+to->procs[i]);
1768:       }
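      /* Illustrative values: MPI_Type_create_indexed_block(3,2,{0,6,14},MPIU_SCALAR,&type)
         describes the scalars {0,1, 6,7, 14,15} of the vector array, so MPI_Alltoallw can
         ship them without an explicit packing pass (hence -vecscatter_nopack). */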
1769:       PetscMalloc3(size,PetscMPIInt,&from->wcounts,size,PetscMPIInt,&from->wdispls,size,MPI_Datatype,&from->types);
1770:       PetscMemzero(from->wcounts,size*sizeof(PetscMPIInt));
1771:       PetscMemzero(from->wdispls,size*sizeof(PetscMPIInt));
1772:       for (i=0; i<size; i++) {
1773:         from->types[i] = MPIU_SCALAR;
1774:       }
1775:       if (from->contiq) {
1776:         PetscInfo(ctx,"Scattered vector entries are stored contiguously, taking advantage of this with -vecscatter_alltoall\n");
1777:         for (i=0; i<from->n; i++) {
1778:           from->wcounts[from->procs[i]] = bs*(from->starts[i+1] - from->starts[i]);
1779:         }
1780:         if (from->n) from->wdispls[from->procs[0]] = sizeof(PetscScalar)*from->indices[0];
1781:         for (i=1; i<from->n; i++) {
1782:           from->wdispls[from->procs[i]] = from->wdispls[from->procs[i-1]] + sizeof(PetscScalar)*from->wcounts[from->procs[i-1]];
1783:         }
1784:       } else {
1785:         for (i=0; i<from->n; i++) {
1786:           from->wcounts[from->procs[i]] = 1;
1787:           mpilen = (PetscMPIInt)(from->starts[i+1] - from->starts[i]);
1788:           MPI_Type_create_indexed_block(mpilen,mpibs,from->indices+from->starts[i],MPIU_SCALAR,from->types+from->procs[i]);
1789:           MPI_Type_commit(from->types+from->procs[i]);
1790:         }
1791:       }
1792:     }
1793: #else 
1794:     to->use_alltoallw   = PETSC_FALSE;
1795:     from->use_alltoallw = PETSC_FALSE;
1796: #endif
1797: #if defined(PETSC_HAVE_MPI_WIN_CREATE)
1798:   } else if (to->use_window) {
1799:     PetscMPIInt temptag,winsize;
1800:     MPI_Request *request;
1801:     MPI_Status  *status;
1802: 
1803:     PetscObjectGetNewTag((PetscObject)ctx,&temptag);
1804:     winsize = (to->n ? to->starts[to->n] : 0)*sizeof(PetscScalar);
1805:     MPI_Win_create(to->values ? to->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&to->window);
1806:     PetscMalloc(to->n*sizeof(PetscInt),&to->winstarts);
1807:     PetscMalloc2(to->n,MPI_Request,&request,to->n,MPI_Status,&status);
1808:     for (i=0; i<to->n; i++) {
1809:       MPI_Irecv(to->winstarts+i,1,MPIU_INT,to->procs[i],temptag,comm,request+i);
1810:     }
1811:     for (i=0; i<from->n; i++) {
1812:       MPI_Send(from->starts+i,1,MPIU_INT,from->procs[i],temptag,comm);
1813:     }
1814:     MPI_Waitall(to->n,request,status);
1815:     PetscFree2(request,status);

1817:     winsize = (from->n ? from->starts[from->n] : 0)*sizeof(PetscScalar);
1818:     MPI_Win_create(from->values ? from->values : MPI_BOTTOM,winsize,sizeof(PetscScalar),MPI_INFO_NULL,comm,&from->window);
1819:     PetscMalloc(from->n*sizeof(PetscInt),&from->winstarts);
1820:     PetscMalloc2(from->n,MPI_Request,&request,from->n,MPI_Status,&status);
1821:     for (i=0; i<from->n; i++) {
1822:       MPI_Irecv(from->winstarts+i,1,MPIU_INT,from->procs[i],temptag,comm,request+i);
1823:     }
1824:     for (i=0; i<to->n; i++) {
1825:       MPI_Send(to->starts+i,1,MPIU_INT,to->procs[i],temptag,comm);
1826:     }
1827:     MPI_Waitall(from->n,request,status);
1828:     PetscFree2(request,status);
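    /* After this exchange every process knows, per partner, the offset (winstarts[]) of its
       slot in the partner's window, which is what the one-sided transfers used by the window
       scatter need in order to address the remote buffer. */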
1829: #endif
1830:   } else {
1831:     PetscTruth  use_rsend = PETSC_FALSE, use_ssend = PETSC_FALSE;
1832:     PetscInt    *sstarts = to->starts,  *rstarts = from->starts;
1833:     PetscMPIInt *sprocs  = to->procs,   *rprocs  = from->procs;
1834:     MPI_Request *swaits  = to->requests,*rwaits  = from->requests;
1835:     MPI_Request *rev_swaits,*rev_rwaits;
1836:     PetscScalar *Ssvalues = to->values, *Srvalues = from->values;

1838:     /* allocate additional wait variables for the "reverse" scatter */
1839:     PetscMalloc(to->n*sizeof(MPI_Request),&rev_rwaits);
1840:     PetscMalloc(from->n*sizeof(MPI_Request),&rev_swaits);
1841:     to->rev_requests   = rev_rwaits;
1842:     from->rev_requests = rev_swaits;

1844:     /* Register the receives that will be used later (they double as the sends of the reverse scatter) */
1845:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_rsend",&use_rsend);
1846:     PetscOptionsHasName(PETSC_NULL,"-vecscatter_ssend",&use_ssend);
1847:     if (use_rsend) {
1848:       PetscInfo(ctx,"Using VecScatter ready receiver mode\n");
1849:       to->use_readyreceiver    = PETSC_TRUE;
1850:       from->use_readyreceiver  = PETSC_TRUE;
1851:     } else {
1852:       to->use_readyreceiver    = PETSC_FALSE;
1853:       from->use_readyreceiver  = PETSC_FALSE;
1854:     }
1855:     if (use_ssend) {
1856:       PetscInfo(ctx,"Using VecScatter Ssend mode\n");
1857:     }

1859:     for (i=0; i<from->n; i++) {
1860:       if (use_rsend) {
1861:         MPI_Rsend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1862:       } else if (use_ssend) {
1863:         MPI_Ssend_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1864:       } else {
1865:         MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);
1866:       }
1867:     }

1869:     for (i=0; i<to->n; i++) {
1870:       if (use_rsend) {
1871:         MPI_Rsend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1872:       } else if (use_ssend) {
1873:         MPI_Ssend_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1874:       } else {
1875:         MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
1876:       }
1877:     }
1878:     /* Register receives for scatter and reverse */
1879:     for (i=0; i<from->n; i++) {
1880:       MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
1881:     }
1882:     for (i=0; i<to->n; i++) {
1883:       MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);
1884:     }
1885:     if (use_rsend) {
1886:       if (to->n)   {MPI_Startall_irecv(to->starts[to->n]*to->bs,to->n,to->rev_requests);}
1887:       if (from->n) {MPI_Startall_irecv(from->starts[from->n]*from->bs,from->n,from->requests);}
1888:       MPI_Barrier(comm);
1889:     }
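    /* Ready sends are only legal if the matching receive is already posted when MPI_Rsend
       starts, hence the receives are pre-started here and the barrier ensures every rank has
       done so before any scatter can begin. */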

1891:     ctx->copy      = VecScatterCopy_PtoP_X;
1892:   }
1893:   PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);
1894:   switch (bs) {
1895:   case 12:
1896:     ctx->begin     = VecScatterBegin_12;
1897:     ctx->end       = VecScatterEnd_12;
1898:     break;
1899:   case 8:
1900:     ctx->begin     = VecScatterBegin_8;
1901:     ctx->end       = VecScatterEnd_8;
1902:     break;
1903:   case 7:
1904:     ctx->begin     = VecScatterBegin_7;
1905:     ctx->end       = VecScatterEnd_7;
1906:     break;
1907:   case 6:
1908:     ctx->begin     = VecScatterBegin_6;
1909:     ctx->end       = VecScatterEnd_6;
1910:     break;
1911:   case 5:
1912:     ctx->begin     = VecScatterBegin_5;
1913:     ctx->end       = VecScatterEnd_5;
1914:     break;
1915:   case 4:
1916:     ctx->begin     = VecScatterBegin_4;
1917:     ctx->end       = VecScatterEnd_4;
1918:     break;
1919:   case 3:
1920:     ctx->begin     = VecScatterBegin_3;
1921:     ctx->end       = VecScatterEnd_3;
1922:     break;
1923:   case 2:
1924:     ctx->begin     = VecScatterBegin_2;
1925:     ctx->end       = VecScatterEnd_2;
1926:     break;
1927:   case 1:
1928:     ctx->begin     = VecScatterBegin_1;
1929:     ctx->end       = VecScatterEnd_1;
1930:     break;
1931:   default:
1932:     SETERRQ(PETSC_ERR_SUP,"Blocksize not supported");
1933:   }
1934:   ctx->view      = VecScatterView_MPI;

1936:   /* Check if the local scatter is actually a copy; important special case */
1937:   if (to->local.n) {
1938:     VecScatterLocalOptimizeCopy_Private(ctx,&to->local,&from->local,bs);
1939:   }
1940:   return(0);
1941: }
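/* A minimal usage sketch of the context assembled above (assuming the ctx-first calling
   sequence of the public interface):

     VecScatterCreate(x,isx,y,isy,&ctx);
     VecScatterBegin(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);   posts the sends/receives (ctx->begin)
     VecScatterEnd(ctx,x,y,INSERT_VALUES,SCATTER_FORWARD);     waits and unpacks        (ctx->end)

   where ctx->begin/ctx->end are the blocksize-specific routines selected in the switch above. */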



1945: /* ------------------------------------------------------------------------------------*/
1946: /*
1947:          Scatter from local Seq vectors to a parallel vector. 
1948:          Reverses the order of the arguments, calls VecScatterCreate_PtoS() then
1949:          reverses the result.
1950: */
1953: PetscErrorCode VecScatterCreate_StoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
1954: {
1955:   PetscErrorCode         ierr;
1956:   MPI_Request            *waits;
1957:   VecScatter_MPI_General *to,*from;

1960:   VecScatterCreate_PtoS(ny,inidy,nx,inidx,yin,xin,bs,ctx);
1961:   to            = (VecScatter_MPI_General*)ctx->fromdata;
1962:   from          = (VecScatter_MPI_General*)ctx->todata;
1963:   ctx->todata   = (void*)to;
1964:   ctx->fromdata = (void*)from;
1965:   /* these two are special: they are ALWAYS stored in the 'to' struct */
1966:   to->sstatus   = from->sstatus;
1967:   to->rstatus   = from->rstatus;

1969:   from->sstatus = 0;
1970:   from->rstatus = 0;

1972:   waits              = from->rev_requests;
1973:   from->rev_requests = from->requests;
1974:   from->requests     = waits;
1975:   waits              = to->rev_requests;
1976:   to->rev_requests   = to->requests;
1977:   to->requests       = waits;
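  /* The swap above exchanges the forward and reverse persistent requests: once the to/from
     roles have been flipped, what was a forward-scatter send must act as a reverse-scatter
     send and vice versa. */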
1978:   return(0);
1979: }

1981: /* ---------------------------------------------------------------------------------*/
1984: PetscErrorCode VecScatterCreate_PtoP(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,VecScatter ctx)
1985: {
1987:   PetscMPIInt    size,rank,tag,imdex,n;
1988:   PetscInt       *lens = PETSC_NULL,*owners = xin->map.range;
1989:   PetscInt       *nprocs = PETSC_NULL,i,j,idx,nsends,nrecvs,*local_inidx = PETSC_NULL,*local_inidy = PETSC_NULL;
1990:   PetscInt       *owner = PETSC_NULL,*starts = PETSC_NULL,count,slen;
1991:   PetscInt       *rvalues = PETSC_NULL,*svalues = PETSC_NULL,base,nmax,*values = PETSC_NULL,lastidx;
1992:   MPI_Comm       comm;
1993:   MPI_Request    *send_waits = PETSC_NULL,*recv_waits = PETSC_NULL;
1994:   MPI_Status     recv_status,*send_status = PETSC_NULL;
1995:   PetscTruth     duplicate = PETSC_FALSE;
1996: #if defined(PETSC_USE_DEBUG)
1997:   PetscTruth     found = PETSC_FALSE;
1998: #endif

2001:   PetscObjectGetNewTag((PetscObject)ctx,&tag);
2002:   PetscObjectGetComm((PetscObject)xin,&comm);
2003:   MPI_Comm_size(comm,&size);
2004:   MPI_Comm_rank(comm,&rank);
2005:   if (size == 1) {
2006:     VecScatterCreate_StoP(nx,inidx,ny,inidy,xin,yin,1,ctx);
2007:     return(0);
2008:   }

2010:   /*
2011:      Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
2012:      and then VecScatterCreate_StoP() is called on the redistributed index lists.
2013:   */
2014:   /*  first count number of contributors to each processor */
2015:   PetscMalloc3(2*size,PetscInt,&nprocs,nx,PetscInt,&owner,(size+1),PetscInt,&starts);
2016:   PetscMemzero(nprocs,2*size*sizeof(PetscInt));
2017:   lastidx = -1;
2018:   j       = 0;
2019:   for (i=0; i<nx; i++) {
2020:     /* if indices are NOT locally sorted, need to start search at the beginning */
2021:     if (lastidx > (idx = inidx[i])) j = 0;
2022:     lastidx = idx;
2023:     for (; j<size; j++) {
2024:       if (idx >= owners[j] && idx < owners[j+1]) {
2025:         nprocs[2*j]++;
2026:         nprocs[2*j+1] = 1;
2027:         owner[i] = j;
2028: #if defined(PETSC_USE_DEBUG)
2029:         found = PETSC_TRUE;
2030: #endif
2031:         break;
2032:       }
2033:     }
2034: #if defined(PETSC_USE_DEBUG)
2035:     if (!found) SETERRQ1(PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2036:     found = PETSC_FALSE;
2037: #endif
2038:   }
2039:   nsends = 0;  for (i=0; i<size; i++) { nsends += nprocs[2*i+1];}
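  /* nprocs[] interleaves a count and a flag per destination: nprocs[2*j] is how many of this
     process's indices live on rank j and nprocs[2*j+1] is 1 if anything goes there at all.
     For example (made-up counts), nprocs = {3,1, 0,0, 2,1} yields nsends = 2 messages. */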

2041:   /* inform other processors of number of messages and max length*/
2042:   PetscMaxSum(comm,nprocs,&nmax,&nrecvs);
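  /* PetscMaxSum() reduces those pairs over the communicator: nmax is the largest count any
     single process sends to this one (used to size each receive buffer) and nrecvs is the
     number of processes that will send here. */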

2044:   /* post receives:   */
2045:   PetscMalloc6(2*nrecvs*nmax,PetscInt,&rvalues,2*nx,PetscInt,&svalues,2*nrecvs,PetscInt,&lens,nrecvs,MPI_Request,&recv_waits,nsends,MPI_Request,&send_waits,nsends,MPI_Status,&send_status);

2047:   for (i=0; i<nrecvs; i++) {
2048:     MPI_Irecv(rvalues+2*nmax*i,2*nmax,MPIU_INT,MPI_ANY_SOURCE,tag,comm,recv_waits+i);
2049:   }

2051:   /* do sends:
2052:       1) starts[i] gives the starting index in svalues for stuff going to 
2053:          the ith processor
2054:   */
2055:   starts[0]= 0;
2056:   for (i=1; i<size; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2057:   for (i=0; i<nx; i++) {
2058:     svalues[2*starts[owner[i]]]       = inidx[i];
2059:     svalues[1 + 2*starts[owner[i]]++] = inidy[i];
2060:   }

2062:   starts[0] = 0;
2063:   for (i=1; i<size+1; i++) { starts[i] = starts[i-1] + nprocs[2*i-2];}
2064:   count = 0;
2065:   for (i=0; i<size; i++) {
2066:     if (nprocs[2*i+1]) {
2067:       MPI_Isend(svalues+2*starts[i],2*nprocs[2*i],MPIU_INT,i,tag,comm,send_waits+count);
2068:       count++;
2069:     }
2070:   }
2071:   PetscFree3(nprocs,owner,starts);

2073:   /*  wait on receives */
2074:   count = nrecvs;
2075:   slen  = 0;
2076:   while (count) {
2077:     MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);
2078:     /* unpack receives into our local space */
2079:     MPI_Get_count(&recv_status,MPIU_INT,&n);
2080:     lens[imdex]  =  n/2;
2081:     slen         += n/2;
2082:     count--;
2083:   }
2084: 
2085:   PetscMalloc2(slen,PetscInt,&local_inidx,slen,PetscInt,&local_inidy);
2086:   base  = owners[rank];
2087:   count = 0;
2088:   for (i=0; i<nrecvs; i++) {
2089:     values = rvalues + 2*i*nmax;
2090:     for (j=0; j<lens[i]; j++) {
2091:       local_inidx[count]   = values[2*j] - base;
2092:       local_inidy[count++] = values[2*j+1];
2093:     }
2094:   }
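  /* Each received pair is split back into an x index (shifted to this process's local
     numbering by subtracting base) and an untouched y index; these redistributed lists are
     exactly what VecScatterCreate_StoP() below consumes. */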

2096:   /* wait on sends */
2097:   if (nsends) {MPI_Waitall(nsends,send_waits,send_status);}
2098:   PetscFree6(rvalues,svalues,lens,recv_waits,send_waits,send_status);

2100:   /*
2101:      should sort and remove duplicates from local_inidx,local_inidy 
2102:   */

2104: #if defined(do_it_slow)
2105:   /* sort on the from index */
2106:   PetscSortIntWithArray(slen,local_inidx,local_inidy);
2107:   start = 0;
2108:   while (start < slen) {
2109:     count = start+1;
2110:     last  = local_inidx[start];
2111:     while (count < slen && last == local_inidx[count]) count++;
2112:     if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2113:       /* sort on to index */
2114:       PetscSortInt(count-start,local_inidy+start);
2115:     }
2116:     /* remove duplicates; not most efficient way, but probably good enough */
2117:     i = start;
2118:     while (i < count-1) {
2119:       if (local_inidy[i] != local_inidy[i+1]) {
2120:         i++;
2121:       } else { /* found a duplicate */
2122:         duplicate = PETSC_TRUE;
2123:         for (j=i; j<slen-1; j++) {
2124:           local_inidx[j] = local_inidx[j+1];
2125:           local_inidy[j] = local_inidy[j+1];
2126:         }
2127:         slen--;
2128:         count--;
2129:       }
2130:     }
2131:     start = count;
2132:   }
2133: #endif
2134:   if (duplicate) {
2135:     PetscInfo(ctx,"Duplicate from/to indices passed to VecScatterCreate(); they are ignored\n");
2136:   }
2137:   VecScatterCreate_StoP(slen,local_inidx,slen,local_inidy,xin,yin,1,ctx);
2138:   PetscFree2(local_inidx,local_inidy);
2139:   return(0);
2140: }