Actual source code: gs.c

  2: /***********************************gs.c***************************************
  3: SPARSE GATHER-SCATTER PACKAGE: bss_malloc bss_malloc ivec error comm gs queue

  5: Author: Henry M. Tufo III

  7: e-mail: hmt@cs.brown.edu

  9: snail-mail:
 10: Division of Applied Mathematics
 11: Brown University
 12: Providence, RI 02912

 14: Last Modification: 
 15: 6.21.97
 16: ************************************gs.c**************************************/

 18: /***********************************gs.c***************************************
 19: File Description:
 20: -----------------

 22: ************************************gs.c**************************************/

 24:  #include petsc.h
 25: #if defined(PETSC_HAVE_STRINGS_H)
 26: #include <strings.h>
 27: #endif
 28: #if defined(PETSC_HAVE_STRING_H)
 29: #include <string.h>
 30: #endif

 32: #include <float.h>
 33: #include <limits.h>

 35:  #include const.h
 36:  #include types.h
 37:  #include comm.h
 38:  #include ivec.h
 39: #include "bss_malloc.h"
 40: #include "bit_mask.h"
 41:  #include error.h
 42:  #include queue.h
 43:  #include blas.h
 44:  #include gs.h

 46: /* default length of number of items via tree - doubles if exceeded */
 47: #define TREE_BUF_SZ 2048;
 48: #define GS_VEC_SZ   1



 52: /***********************************gs.c***************************************
 53: Type: struct gather_scatter_id 
 54: ------------------------------

 56: ************************************gs.c**************************************/
 57: typedef struct gather_scatter_id {
 58:   int id;
 59:   int nel_min;
 60:   int nel_max;
 61:   int nel_sum;
 62:   int negl;
 63:   int gl_max;
 64:   int gl_min;
 65:   int repeats;
 66:   int ordered;
 67:   int positive;
 68:   REAL *vals;

 70:   /* bit mask info */
 71:   int *my_proc_mask;
 72:   int mask_sz;
 73:   int *ngh_buf;
 74:   int ngh_buf_sz;
 75:   int *nghs;
 76:   int num_nghs;
 77:   int max_nghs;
 78:   int *pw_nghs;
 79:   int num_pw_nghs;
 80:   int *tree_nghs;
 81:   int num_tree_nghs;

 83:   int num_loads;

 85:   /* repeats == true -> local info */
 86:   int nel;         /* number of unique elememts */
 87:   int *elms;       /* of size nel */
 88:   int nel_total;
 89:   int *local_elms; /* of size nel_total */
 90:   int *companion;  /* of size nel_total */

 92:   /* local info */
 93:   int num_local_total;
 94:   int local_strength;
 95:   int num_local;
 96:   int *num_local_reduce;
 97:   int **local_reduce;
 98:   int num_local_gop;
 99:   int *num_gop_local_reduce;
100:   int **gop_local_reduce;

102:   /* pairwise info */
103:   int level;
104:   int num_pairs;
105:   int max_pairs;
106:   int loc_node_pairs;
107:   int max_node_pairs;
108:   int min_node_pairs;
109:   int avg_node_pairs;
110:   int *pair_list;
111:   int *msg_sizes;
112:   int **node_list;
113:   int len_pw_list;
114:   int *pw_elm_list;
115:   REAL *pw_vals;

117:   MPI_Request *msg_ids_in;
118:   MPI_Request *msg_ids_out;

120:   REAL *out;
121:   REAL *in;
122:   int msg_total;

124:   /* tree - crystal accumulator info */
125:   int max_left_over;
126:   int *pre;
127:   int *in_num;
128:   int *out_num;
129:   int **in_list;
130:   int **out_list;

132:   /* new tree work*/
133:   int  tree_nel;
134:   int *tree_elms;
135:   REAL *tree_buf;
136:   REAL *tree_work;

138:   int  tree_map_sz;
139:   int *tree_map_in;
140:   int *tree_map_out;

142:   /* current memory status */
143:   int gl_bss_min;
144:   int gl_perm_min;

146:   /* max segment size for gs_gop_vec() */
147:   int vec_sz;

149:   /* hack to make paul happy */
150:   MPI_Comm gs_comm;

152: } gs_id;


155: /* to be made public */
156: #if defined(not_used)
157: static PetscErrorCode  gs_dump_ngh(gs_id *id, int loc_num, int *num, int *ngh_list);
158: static void gsi_via_int_list(gs_id *gs);
159: static PetscErrorCode in_sub_tree(int *ptr3, int p_mask_size, int *buf2, int buf_size);
160: #endif

162: /* PRIVATE - and definitely not exported */
163: /*static void gs_print_template(register gs_id* gs, int who);*/
164: /*static void gs_print_stemplate(register gs_id* gs, int who);*/

166: static gs_id *gsi_check_args(int *elms, int nel, int level);
167: static void gsi_via_bit_mask(gs_id *gs);
168: static void get_ngh_buf(gs_id *gs);
169: static void set_pairwise(gs_id *gs);
170: static gs_id * gsi_new(void);
171: static void set_tree(gs_id *gs);

173: /* same for all but vector flavor */
174: static void gs_gop_local_out(gs_id *gs, REAL *vals);
175: /* vector flavor */
176: static void gs_gop_vec_local_out(gs_id *gs, REAL *vals, int step);

178: static void gs_gop_vec_plus(gs_id *gs, REAL *in_vals, int step);
179: static void gs_gop_vec_pairwise_plus(gs_id *gs, REAL *in_vals, int step);
180: static void gs_gop_vec_local_plus(gs_id *gs, REAL *vals, int step);
181: static void gs_gop_vec_local_in_plus(gs_id *gs, REAL *vals, int step);
182: static void gs_gop_vec_tree_plus(gs_id *gs, REAL *vals, int step);


185: static void gs_gop_plus(gs_id *gs, REAL *in_vals);
186: static void gs_gop_pairwise_plus(gs_id *gs, REAL *in_vals);
187: static void gs_gop_local_plus(gs_id *gs, REAL *vals);
188: static void gs_gop_local_in_plus(gs_id *gs, REAL *vals);
189: static void gs_gop_tree_plus(gs_id *gs, REAL *vals);

191: static void gs_gop_plus_hc(gs_id *gs, REAL *in_vals, int dim);
192: static void gs_gop_pairwise_plus_hc(gs_id *gs, REAL *in_vals, int dim);
193: static void gs_gop_tree_plus_hc(gs_id *gs, REAL *vals, int dim);

195: static void gs_gop_times(gs_id *gs, REAL *in_vals);
196: static void gs_gop_pairwise_times(gs_id *gs, REAL *in_vals);
197: static void gs_gop_local_times(gs_id *gs, REAL *vals);
198: static void gs_gop_local_in_times(gs_id *gs, REAL *vals);
199: static void gs_gop_tree_times(gs_id *gs, REAL *vals);

201: static void gs_gop_min(gs_id *gs, REAL *in_vals);
202: static void gs_gop_pairwise_min(gs_id *gs, REAL *in_vals);
203: static void gs_gop_local_min(gs_id *gs, REAL *vals);
204: static void gs_gop_local_in_min(gs_id *gs, REAL *vals);
205: static void gs_gop_tree_min(gs_id *gs, REAL *vals);

207: static void gs_gop_min_abs(gs_id *gs, REAL *in_vals);
208: static void gs_gop_pairwise_min_abs(gs_id *gs, REAL *in_vals);
209: static void gs_gop_local_min_abs(gs_id *gs, REAL *vals);
210: static void gs_gop_local_in_min_abs(gs_id *gs, REAL *vals);
211: static void gs_gop_tree_min_abs(gs_id *gs, REAL *vals);

213: static void gs_gop_max(gs_id *gs, REAL *in_vals);
214: static void gs_gop_pairwise_max(gs_id *gs, REAL *in_vals);
215: static void gs_gop_local_max(gs_id *gs, REAL *vals);
216: static void gs_gop_local_in_max(gs_id *gs, REAL *vals);
217: static void gs_gop_tree_max(gs_id *gs, REAL *vals);

219: static void gs_gop_max_abs(gs_id *gs, REAL *in_vals);
220: static void gs_gop_pairwise_max_abs(gs_id *gs, REAL *in_vals);
221: static void gs_gop_local_max_abs(gs_id *gs, REAL *vals);
222: static void gs_gop_local_in_max_abs(gs_id *gs, REAL *vals);
223: static void gs_gop_tree_max_abs(gs_id *gs, REAL *vals);

225: static void gs_gop_exists(gs_id *gs, REAL *in_vals);
226: static void gs_gop_pairwise_exists(gs_id *gs, REAL *in_vals);
227: static void gs_gop_local_exists(gs_id *gs, REAL *vals);
228: static void gs_gop_local_in_exists(gs_id *gs, REAL *vals);
229: static void gs_gop_tree_exists(gs_id *gs, REAL *vals);

231: static void gs_gop_pairwise_binary(gs_id *gs, REAL *in_vals, rbfp fct);
232: static void gs_gop_local_binary(gs_id *gs, REAL *vals, rbfp fct);
233: static void gs_gop_local_in_binary(gs_id *gs, REAL *vals, rbfp fct);
234: static void gs_gop_tree_binary(gs_id *gs, REAL *vals, rbfp fct);



238: /* global vars */
239: /* from comm.c module */

241: /* module state inf and fortran interface */
242: static int num_gs_ids = 0;

244: /* should make this dynamic ... later */
245: /*static queue_ADT elms_q, mask_q;*/
246: static int msg_buf=MAX_MSG_BUF;
247: /*static int msg_ch=FALSE;*/

249: static int vec_sz=GS_VEC_SZ;
250: /*static int vec_ch=FALSE; */

252: static int *tree_buf=NULL;
253: static int tree_buf_sz=0;
254: static int ntree=0;


257: /******************************************************************************
258: Function: gs_init_()

260: Input : 
261: Output: 
262: Return: 
263: Description:  
264: ******************************************************************************/
265: void gs_init_vec_sz(int size)
266: {
267:   /*  vec_ch = TRUE; */

269:   vec_sz = size;
270: }

272: /******************************************************************************
273: Function: gs_init_()

275: Input : 
276: Output: 
277: Return: 
278: Description:  
279: ******************************************************************************/
280: void gs_init_msg_buf_sz(int buf_size)
281: {
282:   /*  msg_ch = TRUE; */

284:   msg_buf = buf_size;
285: }

287: /******************************************************************************
288: Function: gs_init()

290: Input : 

292: Output: 

294: RETURN: 

296: Description:  
297: ******************************************************************************/
298: gs_id *
299: gs_init(register int *elms, int nel, int level)
300: {
301:   register gs_id *gs;
302: #ifdef INFO1
303:   int i;
304: #endif
305:   MPI_Group gs_group;
306:   MPI_Comm  gs_comm;


309:   bss_init();
310:   perm_init();

312: #ifdef DEBUG  
313:   error_msg_warning("gs_init() start w/(%d,%d)\n",my_id,num_nodes);
314: #endif
315: 
316:   /* ensure that communication package has been initialized */
317:   comm_init();

319: #ifdef INFO1
320:   bss_stats();
321:   perm_stats();
322: #endif

324:   /* determines if we have enough dynamic/semi-static memory */
325:   /* checks input, allocs and sets gd_id template            */
326:   gs = gsi_check_args(elms,nel,level);

328:   /* only bit mask version up and working for the moment    */
329:   /* LATER :: get int list version working for sparse pblms */
330:   gsi_via_bit_mask(gs);

332: #ifdef INFO1
333:   /* print out P0's template as well as malloc stats */
334:   gs_print_template(gs,0);

336:   MPI_Barrier(MPI_COMM_WORLD);

338:   for (i=1;i<num_nodes;i++)
339:     {
340:       gs_print_stemplate(gs,i);
341:       MPI_Barrier(MPI_COMM_WORLD);
342:     }
343: #endif

345: #ifdef DEBUG  
346:   error_msg_warning("gs_init() end w/(%d,%d)\n",my_id,num_nodes);
347: #endif

349: #ifdef INFO
350:   bss_stats();
351:   perm_stats();
352: #endif

354:   MPI_Comm_group(MPI_COMM_WORLD,&gs_group);
355:   MPI_Comm_create(MPI_COMM_WORLD,gs_group,&gs_comm);
356:   gs->gs_comm=gs_comm;

358:   return(gs);
359: }



363: /******************************************************************************
364: Function: gsi_new()

366: Input : 
367: Output: 
368: Return: 
369: Description: 

371: elm list must >= 0!!!
372: elm repeats allowed
373: ******************************************************************************/
374: static
375: gs_id *
376: gsi_new(void)
377: {
378:   int size=sizeof(gs_id);
379:   gs_id *gs;

381: 
382: #ifdef DEBUG
383:   error_msg_warning("gsi_new() :: size=%D\n",size);
384: #endif

386:   gs = (gs_id *) perm_malloc(size);
387: 
388:   if (!(size%REAL_LEN))
389:     {rvec_zero((REAL *)gs,size/REAL_LEN);}
390:   else if (!(size%INT_LEN))
391:     {ivec_zero((INT*)gs,size/INT_LEN);}
392:   else
393:     {memset((char *)gs,0,size/sizeof(char));}

395:   return(gs);
396: }



400: /******************************************************************************
401: Function: gsi_check_args()

403: Input : 
404: Output: 
405: Return: 
406: Description: 

408: elm list must >= 0!!!
409: elm repeats allowed
410: local working copy of elms is sorted
411: ******************************************************************************/
412: static
413: gs_id *
414: gsi_check_args(int *in_elms, int nel, int level)
415: {
416:   register int i, j, k, t2;
417:   int *companion, *elms, *unique, *iptr;
418:   int num_local=0, *num_to_reduce, **local_reduce;
419:   int oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
420:   int vals[sizeof(oprs)/sizeof(oprs[0])-1];
421:   int work[sizeof(oprs)/sizeof(oprs[0])-1];
422:   gs_id *gs;


425: #ifdef DEBUG  
426:   error_msg_warning("gs_check_args() begin w/(%d,%d)\n",my_id,num_nodes);
427: #endif

429: #ifdef SAFE
430:   if (!in_elms)
431:     {error_msg_fatal("elms point to nothing!!!\n");}

433:   if (nel<0)
434:     {error_msg_fatal("can't have fewer than 0 elms!!!\n");}

436:   if (nel==0)
437:     {error_msg_warning("I don't have any elements!!!\n");}
438: #endif

440:   /* get space for gs template */
441:   gs = gsi_new();
442:   gs->id = ++num_gs_ids;

444:   /* hmt 6.4.99                                            */
445:   /* caller can set global ids that don't participate to 0 */
446:   /* gs_init ignores all zeros in elm list                 */
447:   /* negative global ids are still invalid                 */
448:   for (i=j=0;i<nel;i++)
449:     {if (in_elms[i]!=0) {j++;}}

451:   k=nel; nel=j;

453:   /* copy over in_elms list and create inverse map */
454:   elms = (int*) bss_malloc((nel+1)*INT_LEN);
455:   companion = (int*) bss_malloc(nel*INT_LEN);
456:   /* ivec_c_index(companion,nel); */
457:   /* ivec_copy(elms,in_elms,nel); */
458:   for (i=j=0;i<k;i++)
459:     {
460:       if (in_elms[i]!=0)
461:         {elms[j] = in_elms[i]; companion[j++] = i;}
462:     }

464:   if (j!=nel)
465:     {error_msg_fatal("nel j mismatch!\n");}

467: #ifdef SAFE
468:   /* pre-pass ... check to see if sorted */
469:   elms[nel] = INT_MAX;
470:   iptr = elms;
471:   unique = elms+1;
472:   j=0;
473:   while (*iptr!=INT_MAX)
474:     {
475:       if (*iptr++>*unique++)
476:         {j=1; break;}
477:     }

479:   /* set up inverse map */
480:   if (j)
481:     {
482:       error_msg_warning("gsi_check_args() :: elm list *not* sorted!\n");
483:       SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);
484:     }
485:   else
486:     {error_msg_warning("gsi_check_args() :: elm list sorted!\n");}
487: #else
488:   SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);
489: #endif
490:   elms[nel] = INT_MIN;

492:   /* first pass */
493:   /* determine number of unique elements, check pd */
494:   for (i=k=0;i<nel;i+=j)
495:     {
496:       t2 = elms[i];
497:       j=++i;
498: 
499:       /* clump 'em for now */
500:       while (elms[j]==t2) {j++;}
501: 
502:       /* how many together and num local */
503:       if (j-=i)
504:         {num_local++; k+=j;}
505:     }

507:   /* how many unique elements? */
508:   gs->repeats=k;
509:   gs->nel = nel-k;


512:   /* number of repeats? */
513:   gs->num_local = num_local;
514:   num_local+=2;
515:   gs->local_reduce=local_reduce=(int **)perm_malloc(num_local*INT_PTR_LEN);
516:   gs->num_local_reduce=num_to_reduce=(int*) perm_malloc(num_local*INT_LEN);

518:   unique = (int*) bss_malloc((gs->nel+1)*INT_LEN);
519:   gs->elms = unique;
520:   gs->nel_total = nel;
521:   gs->local_elms = elms;
522:   gs->companion = companion;

524:   /* compess map as well as keep track of local ops */
525:   for (num_local=i=j=0;i<gs->nel;i++)
526:     {
527:       k=j;
528:       t2 = unique[i] = elms[j];
529:       companion[i] = companion[j];
530: 
531:       while (elms[j]==t2) {j++;}

533:       if ((t2=(j-k))>1)
534:         {
535:           /* number together */
536:           num_to_reduce[num_local] = t2++;
537:           iptr = local_reduce[num_local++] = (int*)perm_malloc(t2*INT_LEN);

539:           /* to use binary searching don't remap until we check intersection */
540:           *iptr++ = i;
541: 
542:           /* note that we're skipping the first one */
543:           while (++k<j)
544:             {*(iptr++) = companion[k];}
545:           *iptr = -1;
546:         }
547:     }

549:   /* sentinel for ngh_buf */
550:   unique[gs->nel]=INT_MAX;

552: #ifdef DEBUG
553:   if (num_local!=gs->num_local)
554:     {error_msg_fatal("compression of maps wrong!!!\n");}
555: #endif

557:   /* for two partition sort hack */
558:   num_to_reduce[num_local] = 0;
559:   local_reduce[num_local] = NULL;
560:   num_to_reduce[++num_local] = 0;
561:   local_reduce[num_local] = NULL;

563:   /* load 'em up */
564:   /* note one extra to hold NON_UNIFORM flag!!! */
565:   vals[2] = vals[1] = vals[0] = nel;
566:   if (gs->nel>0)
567:     {
568:        vals[3] = unique[0];           /* ivec_lb(elms,nel); */
569:        vals[4] = unique[gs->nel-1];   /* ivec_ub(elms,nel); */
570:     }
571:   else
572:     {
573:        vals[3] = INT_MAX;             /* ivec_lb(elms,nel); */
574:        vals[4] = INT_MIN;             /* ivec_ub(elms,nel); */
575:     }
576:   vals[5] = level;
577:   vals[6] = num_gs_ids;

579:   /* GLOBAL: send 'em out */
580:   giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);

582:   /* must be semi-pos def - only pairwise depends on this */
583:   /* LATER - remove this restriction */
584:   if (vals[3]<0)
585:     {error_msg_fatal("gsi_check_args() :: system not semi-pos def ::%d\n",vals[3]);}

587:   if (vals[4]==INT_MAX)
588:     {error_msg_fatal("gsi_check_args() :: system ub too large ::%d!\n",vals[4]);}

590: #ifdef DEBUG
591:   /* check gs template count */
592:   if (vals[6] != num_gs_ids)
593:     {error_msg_fatal("num_gs_ids mismatch!!!");}

595:   /* check all have same level threshold */
596:   if (level != vals[5])
597:     {error_msg_fatal("gsi_check_args() :: level not uniform across nodes!!!\n");}
598: #endif

600:   gs->nel_min = vals[0];
601:   gs->nel_max = vals[1];
602:   gs->nel_sum = vals[2];
603:   gs->gl_min  = vals[3];
604:   gs->gl_max  = vals[4];
605:   gs->negl    = vals[4]-vals[3]+1;

607: #ifdef DEBUG
608:       printf("nel(unique)=%d\n", gs->nel);
609:       printf("nel_max=%d\n",     gs->nel_max);
610:       printf("nel_min=%d\n",     gs->nel_min);
611:       printf("nel_sum=%d\n",     gs->nel_sum);
612:       printf("negl=%d\n",        gs->negl);
613:       printf("gl_max=%d\n",      gs->gl_max);
614:       printf("gl_min=%d\n",      gs->gl_min);
615:       printf("elms ordered=%d\n",gs->ordered);
616:       printf("repeats=%d\n",     gs->repeats);
617:       printf("positive=%d\n",    gs->positive);
618:       printf("level=%d\n",       gs->level);
619: #endif

621:   if (gs->negl<=0)
622:     {error_msg_fatal("gsi_check_args() :: system empty or neg :: %d\n",gs->negl);}
623: 
624:   /* LATER :: add level == -1 -> program selects level */
625:   if (vals[5]<0)
626:     {vals[5]=0;}
627:   else if (vals[5]>num_nodes)
628:     {vals[5]=num_nodes;}
629:   gs->level = vals[5];

631: #ifdef DEBUG  
632:   error_msg_warning("gs_check_args() :: end w/(%d,%d)+level=%d\n",
633:                     my_id,num_nodes,vals[5]);
634: #endif

636:   return(gs);
637: }


640: /******************************************************************************
641: Function: gsi_via_bit_mask()

643: Input : 
644: Output: 
645: Return: 
646: Description: 


649: ******************************************************************************/
650: static
651: void
652: gsi_via_bit_mask(gs_id *gs)
653: {
654:   register int i, nel, *elms;
655:   int t1;
656:   int **reduce;
657:   int *map;

659: #ifdef DEBUG  
660:   error_msg_warning("gsi_via_bit_mask() begin w/%d :: %d\n",my_id,num_nodes);
661: #endif

663:   /* totally local removes ... ct_bits == 0 */
664:   get_ngh_buf(gs);

666:   if (gs->level)
667:     {set_pairwise(gs);}

669:   if (gs->max_left_over)
670:     {set_tree(gs);}

672:   /* intersection local and pairwise/tree? */
673:   gs->num_local_total = gs->num_local;
674:   gs->gop_local_reduce = gs->local_reduce;
675:   gs->num_gop_local_reduce = gs->num_local_reduce;

677:   map = gs->companion;

679:   /* is there any local compression */
680:   if (!gs->num_local) {
681:     gs->local_strength = NONE;
682:     gs->num_local_gop = 0;
683:   } else {
684:       /* ok find intersection */
685:       map = gs->companion;
686:       reduce = gs->local_reduce;
687:       for (i=0, t1=0; i<gs->num_local; i++, reduce++)
688:         {
689:           if ((ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
690:               ||
691:               ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0)
692:             {
693:               /* printf("C%d :: i=%d, **reduce=%d\n",my_id,i,**reduce); */
694:               t1++;
695:               if (gs->num_local_reduce[i]<=0)
696:                 {error_msg_fatal("nobody in list?");}
697:               gs->num_local_reduce[i] *= -1;
698:             }
699:            **reduce=map[**reduce];
700:         }

702:       /* intersection is empty */
703:       if (!t1)
704:         {
705: #ifdef DEBUG          
706:           error_msg_warning("gsi_check_args() :: local gs_gop w/o intersection!");
707: #endif
708:           gs->local_strength = FULL;
709:           gs->num_local_gop = 0;
710:         }
711:       /* intersection not empty */
712:       else
713:         {
714: #ifdef DEBUG          
715:           error_msg_warning("gsi_check_args() :: local gs_gop w/intersection!");
716: #endif
717:           gs->local_strength = PARTIAL;
718:           SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce,
719:                    gs->num_local + 1, SORT_INT_PTR);

721:           gs->num_local_gop = t1;
722:           gs->num_local_total =  gs->num_local;
723:           gs->num_local    -= t1;
724:           gs->gop_local_reduce = gs->local_reduce;
725:           gs->num_gop_local_reduce = gs->num_local_reduce;

727:           for (i=0; i<t1; i++)
728:             {
729:               if (gs->num_gop_local_reduce[i]>=0)
730:                 {error_msg_fatal("they aren't negative?");}
731:               gs->num_gop_local_reduce[i] *= -1;
732:               gs->local_reduce++;
733:               gs->num_local_reduce++;
734:             }
735:           gs->local_reduce++;
736:           gs->num_local_reduce++;
737:         }
738:     }

740:   elms = gs->pw_elm_list;
741:   nel  = gs->len_pw_list;
742:   for (i=0; i<nel; i++)
743:     {elms[i] = map[elms[i]];}

745:   elms = gs->tree_map_in;
746:   nel  = gs->tree_map_sz;
747:   for (i=0; i<nel; i++)
748:     {elms[i] = map[elms[i]];}

750:   /* clean up */
751:   bss_free((void*) gs->local_elms);
752:   bss_free((void*) gs->companion);
753:   bss_free((void*) gs->elms);
754:   bss_free((void*) gs->ngh_buf);
755:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;

757: #ifdef DEBUG  
758:   error_msg_warning("gsi_via_bit_mask() end w/%d :: %d\n",my_id,num_nodes);
759: #endif
760: }



764: /******************************************************************************
765: Function: place_in_tree()

767: Input : 
768: Output: 
769: Return: 
770: Description: 


773: ******************************************************************************/
774: static
775: void
776: place_in_tree(register int elm)
777: {
778:   register int *tp, n;


781:   if (ntree==tree_buf_sz)
782:     {
783:       if (tree_buf_sz)
784:         {
785:           tp = tree_buf;
786:           n = tree_buf_sz;
787:           tree_buf_sz<<=1;
788:           tree_buf = (int*)bss_malloc(tree_buf_sz*INT_LEN);
789:           ivec_copy(tree_buf,tp,n);
790:           bss_free(tp);
791:         }
792:       else
793:         {
794:           tree_buf_sz = TREE_BUF_SZ;
795:           tree_buf = (int*)bss_malloc(tree_buf_sz*INT_LEN);
796:         }
797:     }

799:   tree_buf[ntree++] = elm;
800: }



804: /******************************************************************************
805: Function: get_ngh_buf()

807: Input : 
808: Output: 
809: Return: 
810: Description: 


813: ******************************************************************************/
814: static
815: void
816: get_ngh_buf(gs_id *gs)
817: {
818:   register int i, j, npw=0, ntree_map=0;
819:   int p_mask_size, ngh_buf_size, buf_size;
820:   int *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
821:   int *ngh_buf, *buf1, *buf2;
822:   int offset, per_load, num_loads, or_ct, start, end;
823:   int *ptr1, *ptr2, i_start, negl, nel, *elms;
824:   int oper=GL_B_OR;
825:   int *ptr3, *t_mask, level, ct1, ct2;

827: #ifdef DEBUG  
828:   error_msg_warning("get_ngh_buf() begin w/%d :: %d\n",my_id,num_nodes);
829: #endif

831:   /* to make life easier */
832:   nel   = gs->nel;
833:   elms  = gs->elms;
834:   level = gs->level;
835: 
836:   /* det #bytes needed for processor bit masks and init w/mask cor. to my_id */
837:   p_mask = (int*) bss_malloc(p_mask_size=len_bit_mask(num_nodes));
838:   set_bit_mask(p_mask,p_mask_size,my_id);

840:   /* allocate space for masks and info bufs */
841:   gs->nghs = sh_proc_mask = (int*) bss_malloc(p_mask_size);
842:   gs->pw_nghs = pw_sh_proc_mask = (int*) perm_malloc(p_mask_size);
843:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
844:   t_mask = (int*) bss_malloc(p_mask_size);
845:   gs->ngh_buf = ngh_buf = (int*) bss_malloc(ngh_buf_size);

847:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
848:   /* had thought I could exploit rendezvous threshold */

850:   /* default is one pass */
851:   per_load = negl  = gs->negl;
852:   gs->num_loads = num_loads = 1;
853:   i=p_mask_size*negl;

855:   /* possible overflow on buffer size */
856:   /* overflow hack                    */
857:   if (i<0) {i=INT_MAX;}

859:   buf_size = PetscMin(msg_buf,i);

861:   /* can we do it? */
862:   if (p_mask_size>buf_size)
863:     {error_msg_fatal("get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);}

865:   /* get giop buf space ... make *only* one malloc */
866:   buf1 = (int*) bss_malloc(buf_size<<1);

868:   /* more than one gior exchange needed? */
869:   if (buf_size!=i)
870:     {
871:       per_load = buf_size/p_mask_size;
872:       buf_size = per_load*p_mask_size;
873:       gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
874:     }

876: #ifdef DEBUG  
877:   /* dump some basic info */
878:   error_msg_warning("n_lds=%d,pms=%d,buf_sz=%d\n",num_loads,p_mask_size,buf_size);
879: #endif

881:   /* convert buf sizes from #bytes to #ints - 32 bit only! */
882: #ifdef SAFE  
883:   p_mask_size/=INT_LEN; ngh_buf_size/=INT_LEN; buf_size/=INT_LEN;
884: #else  
885:   p_mask_size>>=2; ngh_buf_size>>=2; buf_size>>=2;
886: #endif
887: 
888:   /* find giop work space */
889:   buf2 = buf1+buf_size;

891:   /* hold #ints needed for processor masks */
892:   gs->mask_sz=p_mask_size;

894:   /* init buffers */
895:   ivec_zero(sh_proc_mask,p_mask_size);
896:   ivec_zero(pw_sh_proc_mask,p_mask_size);
897:   ivec_zero(ngh_buf,ngh_buf_size);

899:   /* HACK reset tree info */
900:   tree_buf=NULL;
901:   tree_buf_sz=ntree=0;

903:   /* queue the tree elements for now */
904:   /* elms_q = new_queue(); */
905: 
906:   /* can also queue tree info for pruned or forest implememtation */
907:   /*  mask_q = new_queue(); */

909:   /* ok do it */
910:   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++)
911:     {
912:       /* identity for bitwise or is 000...000 */
913:       ivec_zero(buf1,buf_size);

915:       /* load msg buffer */
916:       for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++)
917:         {
918:           offset = (offset-start)*p_mask_size;
919:           ivec_copy(buf1+offset,p_mask,p_mask_size);
920:         }

922:       /* GLOBAL: pass buffer */
923:       giop(buf1,buf2,buf_size,&oper);


926:       /* unload buffer into ngh_buf */
927:       ptr2=(elms+i_start);
928:       for(ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++)
929:         {
930:           /* I own it ... may have to pairwise it */
931:           if (j==*ptr2)
932:             {
933:               /* do i share it w/anyone? */
934: #ifdef SAFE
935:               ct1 = ct_bits((char *)ptr3,p_mask_size*INT_LEN);
936: #else
937:               ct1 = ct_bits((char *)ptr3,p_mask_size<<2);
938: #endif
939:               /* guess not */
940:               if (ct1<2)
941:                 {ptr2++; ptr1+=p_mask_size; continue;}

943:               /* i do ... so keep info and turn off my bit */
944:               ivec_copy(ptr1,ptr3,p_mask_size);
945:               ivec_xor(ptr1,p_mask,p_mask_size);
946:               ivec_or(sh_proc_mask,ptr1,p_mask_size);
947: 
948:               /* is it to be done pairwise? */
949:               if (--ct1<=level)
950:                 {
951:                   npw++;
952: 
953:                   /* turn on high bit to indicate pw need to process */
954:                   *ptr2++ |= TOP_BIT;
955:                   ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
956:                   ptr1+=p_mask_size;
957:                   continue;
958:                 }

960:               /* get set for next and note that I have a tree contribution */
961:               /* could save exact elm index for tree here -> save a search */
962:               ptr2++; ptr1+=p_mask_size; ntree_map++;
963:             }
964:           /* i don't but still might be involved in tree */
965:           else
966:             {

968:               /* shared by how many? */
969: #ifdef SAFE
970:               ct1 = ct_bits((char *)ptr3,p_mask_size*INT_LEN);
971: #else
972:               ct1 = ct_bits((char *)ptr3,p_mask_size<<2);
973: #endif

975:               /* none! */
976:               if (ct1<2)
977:                 {continue;}

979:               /* is it going to be done pairwise? but not by me of course!*/
980:               if (--ct1<=level)
981:                 {continue;}
982:             }
983:           /* LATER we're going to have to process it NOW */
984:           /* nope ... tree it */
985:           place_in_tree(j);
986:         }
987:     }

989:   bss_free((void*)t_mask);
990:   bss_free((void*)buf1);

992:   gs->len_pw_list=npw;
993:   gs->num_nghs = ct_bits((char *)sh_proc_mask,p_mask_size*INT_LEN);

995:   /* expand from bit mask list to int list and save ngh list */
996:   gs->nghs = (int*) perm_malloc(gs->num_nghs * INT_LEN);
997:   bm_to_proc((char *)sh_proc_mask,p_mask_size*INT_LEN,gs->nghs);

999:   gs->num_pw_nghs = ct_bits((char *)pw_sh_proc_mask,p_mask_size*INT_LEN);

1001:   oper = GL_MAX;
1002:   ct1 = gs->num_nghs;
1003:   giop(&ct1,&ct2,1,&oper);
1004:   gs->max_nghs = ct1;

1006:   gs->tree_map_sz  = ntree_map;
1007:   gs->max_left_over=ntree;

1009:   bss_free((void*)p_mask);
1010:   bss_free((void*)sh_proc_mask);

1012: #ifdef DEBUG  
1013:   error_msg_warning("get_ngh_buf() end w/%d :: %d\n",my_id,num_nodes);
1014: #endif
1015: }





1021: /******************************************************************************
1022: Function: pairwise_init()

1024: Input : 
1025: Output: 
1026: Return: 
1027: Description: 

1029: if an element is shared by fewer that level# of nodes do pairwise exch 
1030: ******************************************************************************/
1031: static
1032: void
1033: set_pairwise(gs_id *gs)
1034: {
1035:   register int i, j;
1036:   int p_mask_size;
1037:   int *p_mask, *sh_proc_mask, *tmp_proc_mask;
1038:   int *ngh_buf, *buf2;
1039:   int offset;
1040:   int *msg_list, *msg_size, **msg_nodes, nprs;
1041:   int *pairwise_elm_list, len_pair_list=0;
1042:   int *iptr, t1, i_start, nel, *elms;
1043:   int ct;


1046: #ifdef DEBUG  
1047:   error_msg_warning("set_pairwise() begin w/%d :: %d\n",my_id,num_nodes);
1048: #endif

1050:   /* to make life easier */
1051:   nel  = gs->nel;
1052:   elms = gs->elms;
1053:   ngh_buf = gs->ngh_buf;
1054:   sh_proc_mask  = gs->pw_nghs;

1056:   /* need a few temp masks */
1057:   p_mask_size   = len_bit_mask(num_nodes);
1058:   p_mask        = (int*) bss_malloc(p_mask_size);
1059:   tmp_proc_mask = (int*) bss_malloc(p_mask_size);

1061:   /* set mask to my my_id's bit mask */
1062:   set_bit_mask(p_mask,p_mask_size,my_id);

1064: #ifdef SAFE
1065:   p_mask_size /= INT_LEN;
1066: #else
1067:   p_mask_size >>= 2;
1068: #endif
1069: 
1070:   len_pair_list=gs->len_pw_list;
1071:   gs->pw_elm_list=pairwise_elm_list=(int*)perm_malloc((len_pair_list+1)*INT_LEN);

1073:   /* how many processors (nghs) do we have to exchange with? */
1074:   nprs=gs->num_pairs=ct_bits((char *)sh_proc_mask,p_mask_size*INT_LEN);


1077:   /* allocate space for gs_gop() info */
1078:   gs->pair_list = msg_list = (int*)  perm_malloc(INT_LEN*nprs);
1079:   gs->msg_sizes = msg_size  = (int*)  perm_malloc(INT_LEN*nprs);
1080:   gs->node_list = msg_nodes = (int **) perm_malloc(INT_PTR_LEN*(nprs+1));

1082:   /* init msg_size list */
1083:   ivec_zero(msg_size,nprs);

1085:   /* expand from bit mask list to int list */
1086:   bm_to_proc((char *)sh_proc_mask,p_mask_size*INT_LEN,msg_list);
1087: 
1088:   /* keep list of elements being handled pairwise */
1089:   for (i=j=0;i<nel;i++)
1090:     {
1091:       if (elms[i] & TOP_BIT)
1092:         {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;}
1093:     }
1094:   pairwise_elm_list[j] = -1;

1096: #ifdef DEBUG
1097:   if (j!=len_pair_list)
1098:     {error_msg_fatal("oops ... bad paiwise list in set_pairwise!");}
1099: #endif

1101:   gs->msg_ids_out = (MPI_Request *)  perm_malloc(sizeof(MPI_Request)*(nprs+1));
1102:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
1103:   gs->msg_ids_in = (MPI_Request *)  perm_malloc(sizeof(MPI_Request)*(nprs+1));
1104:   gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
1105:   gs->pw_vals = (REAL *) perm_malloc(REAL_LEN*len_pair_list*vec_sz);

1107:   /* find who goes to each processor */
1108:   for (i_start=i=0;i<nprs;i++)
1109:     {
1110:       /* processor i's mask */
1111:       set_bit_mask(p_mask,p_mask_size*INT_LEN,msg_list[i]);

1113:       /* det # going to processor i */
1114:       for (ct=j=0;j<len_pair_list;j++)
1115:         {
1116:           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
1117:           ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
1118:           if (ct_bits((char *)tmp_proc_mask,p_mask_size*INT_LEN))
1119:             {ct++;}
1120:         }
1121:       msg_size[i] = ct;
1122:       i_start = PetscMax(i_start,ct);

1124:       /*space to hold nodes in message to first neighbor */
1125:       msg_nodes[i] = iptr = (int*) perm_malloc(INT_LEN*(ct+1));

1127:       for (j=0;j<len_pair_list;j++)
1128:         {
1129:           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
1130:           ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
1131:           if (ct_bits((char *)tmp_proc_mask,p_mask_size*INT_LEN))
1132:             {*iptr++ = j;}
1133:         }
1134:       *iptr = -1;
1135:     }
1136:   msg_nodes[nprs] = NULL;

1138: #ifdef INFO1
1139:   t1 = GL_MAX;
1140:   giop(&i_start,&offset,1,&t1);
1141:   gs->max_pairs = i_start;
1142: #else
1143:   j=gs->loc_node_pairs=i_start;
1144:   t1 = GL_MAX;
1145:   giop(&i_start,&offset,1,&t1);
1146:   gs->max_node_pairs = i_start;

1148:   i_start=j;
1149:   t1 = GL_MIN;
1150:   giop(&i_start,&offset,1,&t1);
1151:   gs->min_node_pairs = i_start;

1153:   i_start=j;
1154:   t1 = GL_ADD;
1155:   giop(&i_start,&offset,1,&t1);
1156:   gs->avg_node_pairs = i_start/num_nodes + 1;

1158:   i_start=nprs;
1159:   t1 = GL_MAX;
1160:   giop(&i_start,&offset,1,&t1);
1161:   gs->max_pairs = i_start;

1163:   /*gs->max_pairs = -1;*/
1164: #endif

1166:   /* remap pairwise in tail of gsi_via_bit_mask() */
1167:   gs->msg_total = ivec_sum(gs->msg_sizes,nprs);
1168:   gs->out = (REAL *) perm_malloc(REAL_LEN*gs->msg_total*vec_sz);
1169:   gs->in  = (REAL *) perm_malloc(REAL_LEN*gs->msg_total*vec_sz);

1171:   /* reset malloc pool */
1172:   bss_free((void*)p_mask);
1173:   bss_free((void*)tmp_proc_mask);

1175: #ifdef DEBUG  
1176:   error_msg_warning("set_pairwise() end w/%d :: %d\n",my_id,num_nodes);
1177: #endif
1178: }



1182: /******************************************************************************
1183: Function: set_tree()

1185: Input : 
1186: Output: 
1187: Return: 
1188: Description: 

1190: to do pruned tree just save ngh buf copy for each one and decode here!
1191: ******************************************************************************/
1192: static
1193: void
1194: set_tree(gs_id *gs)
1195: {
1196:   register int i, j, n, nel;
1197:   register int *iptr_in, *iptr_out, *tree_elms, *elms;


1200: #ifdef DEBUG
1201:   error_msg_warning("set_tree() :: begin\n");
1202: #endif

1204:   /* local work ptrs */
1205:   elms = gs->elms;
1206:   nel     = gs->nel;

1208:   /* how many via tree */
1209:   gs->tree_nel  = n = ntree;
1210:   gs->tree_elms = tree_elms = iptr_in = tree_buf;
1211:   gs->tree_buf  = (REAL *) bss_malloc(REAL_LEN*n*vec_sz);
1212:   gs->tree_work = (REAL *) bss_malloc(REAL_LEN*n*vec_sz);
1213:   j=gs->tree_map_sz;
1214:   gs->tree_map_in = iptr_in  = (int*) bss_malloc(INT_LEN*(j+1));
1215:   gs->tree_map_out = iptr_out = (int*) bss_malloc(INT_LEN*(j+1));

1217: #ifdef DEBUG  
1218:   error_msg_warning("num on tree=%d,%d",gs->max_left_over,gs->tree_nel);
1219: #endif

1221:   /* search the longer of the two lists */
1222:   /* note ... could save this info in get_ngh_buf and save searches */
1223:   if (n<=nel)
1224:     {
1225:       /* bijective fct w/remap - search elm list */
1226:       for (i=0; i<n; i++)
1227:         {
1228:           if ((j=ivec_binary_search(*tree_elms++,elms,nel))>=0)
1229:             {*iptr_in++ = j; *iptr_out++ = i;}
1230:         }
1231:     }
1232:   else
1233:     {
1234:       for (i=0; i<nel; i++)
1235:         {
1236:           if ((j=ivec_binary_search(*elms++,tree_elms,n))>=0)
1237:             {*iptr_in++ = i; *iptr_out++ = j;}
1238:         }
1239:     }

1241:   /* sentinel */
1242:   *iptr_in = *iptr_out = -1;

1244: #ifdef DEBUG
1245:   error_msg_warning("set_tree() :: end\n");
1246: #endif
1247: }


1250: /******************************************************************************
1251: Function: gsi_via_int_list()

1253: Input : 
1254: Output: 
1255: Return: 
1256: Description: 
1257: ******************************************************************************/
1258: /*static
1259: void
1260: gsi_via_int_list(gs_id *gs)
1261: {

1263:    LATER: for P large the bit masks -> too many passes
1264:    LATER: strategy: do gsum w/1 in position i in negl if owner
1265:    LATER: then sum of entire vector 1 ... negl determines min buf len
1266:    LATER: So choose min from this or mask method
1267: }*/


1270: #if defined(not_used)
1271: static
1272: int
1273: root_sub_tree(int *proc_list, int num)
1274: {
1275:   register int i, j, p_or, p_and;
1276:   register int root, mask;


1279:   /* ceiling(log2(num_nodes)) - 1 */
1280:   j = i_log2_num_nodes;
1281:   if (num_nodes==floor_num_nodes)
1282:     {j--;}

1284:   /* set mask to msb */
1285:   for(mask=1,i=0; i<j; i++)
1286:     {mask<<=1;}

1288:   p_or  = ivec_reduce_or(proc_list,num);
1289:   p_and = ivec_reduce_and(proc_list,num);
1290:   for(root=i=0; i<j; i++,mask>>=1)
1291:     {
1292:       /* (msb-i)'th bits on ==> root in right 1/2 tree */
1293:       if (mask & p_and)
1294:         {root |= mask;}

1296:       /* (msb-i)'th bits differ ==> root found */
1297:       else if (mask & p_or)
1298:         {break;}

1300:       /* (msb-i)'th bits off ==>root in left 1/2 tree */
1301:     }

1303: #ifdef DEBUG
1304:   if ((root<0) || (root>num_nodes))
1305:     {error_msg_fatal("root_sub_tree() :: bad root!");}

1307:   if (!my_id)
1308:     {
1309:       printf("num_nodes=%d, j=%d, root=%d\n",num_nodes,j,root);
1310:       printf("procs: ");
1311:       for(i=0;i<num;i++)
1312:         {printf("%d ",proc_list[i]);}
1313:       printf("\n");
1314:     }
1315: #endif

1317:   return(root);
1318: }
1319: #endif


1322: #if defined(not_used)
1323: static int
1324: in_sub_tree(int *mask, int mask_size, int *work, int nw)
1325: {
1326:   int ct, nb;
1327: 
1328:   /* mask size in bytes */
1329:   nb = mask_size<<2;
1330: 
1331:   /* shared amoungst how many? */
1332:   ct = ct_bits((char *)mask,nb);

1334:   /* enough space? */
1335:   if (nw<ct)
1336:     {error_msg_fatal("in_sub_tree() :: not enough space to expand bit mask!");}

1338:   /* expand */
1339:   bm_to_proc((char *)mask,nb,work);
1340: 
1341:   /* find tree root */
1342:   root_sub_tree(work,ct);

1344:   /* am i in any of the paths? */

1346:   return(TRUE);

1348:   /*
1349:   sh_mask = (int*)bss_malloc(nb);
1350:   bss_free(sh_mask);
1351:   */
1352: }
1353: #endif


1356: /******************************************************************************
1357: Function: gather_scatter

1359: Input : 
1360: Output: 
1361: Return: 
1362: Description: 
1363: ******************************************************************************/
1364: static
1365: void
1366: gs_gop_local_out(register gs_id *gs, register REAL *vals)
1367: {
1368:   register int *num, *map, **reduce;
1369:   register REAL tmp;


1372: #ifdef DEBUG
1373:   error_msg_warning("start gs_gop_xxx()\n");
1374: #endif

1376:   num    = gs->num_gop_local_reduce;
1377:   reduce = gs->gop_local_reduce;
1378:   while ((map = *reduce++))
1379:     {
1380:       /* wall */
1381:       if (*num == 2)
1382:         {
1383:           num ++;
1384:           vals[map[1]] = vals[map[0]];
1385:         }
1386:       /* corner shared by three elements */
1387:       else if (*num == 3)
1388:         {
1389:           num ++;
1390:           vals[map[2]] = vals[map[1]] = vals[map[0]];
1391:         }
1392:       /* corner shared by four elements */
1393:       else if (*num == 4)
1394:         {
1395:           num ++;
1396:           vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
1397:         }
1398:       /* general case ... odd geoms ... 3D*/
1399:       else
1400:         {
1401:           num++;
1402:           tmp = *(vals + *map++);
1403:           while (*map >= 0)
1404:             {*(vals + *map++) = tmp;}
1405:         }
1406:     }
1407: }



1411: /******************************************************************************
1412: Function: gather_scatter

1414: Input : 
1415: Output: 
1416: Return: 
1417: Description: 
1418: ******************************************************************************/
1419: void
1420: gs_gop_binary(gs_ADT gs, REAL *vals, rbfp fct)
1421: {
1422: #ifdef DEBUG  
1423:   if (!gs)  {error_msg_fatal("gs_gop() :: passed NULL gs handle!!!");}
1424:   if (!fct) {error_msg_fatal("gs_gop() :: passed NULL bin fct handle!!!");}
1425:   error_msg_warning("start gs_gop_xxx()\n");
1426: #endif

1428:   /* local only operations!!! */
1429:   if (gs->num_local)
1430:     {gs_gop_local_binary(gs,vals,fct);}
1431: 
1432:   /* if intersection tree/pairwise and local isn't empty */
1433:   if (gs->num_local_gop)
1434:     {
1435:       gs_gop_local_in_binary(gs,vals,fct);
1436: 
1437:       /* pairwise */
1438:       if (gs->num_pairs)
1439:         {gs_gop_pairwise_binary(gs,vals,fct);}
1440: 
1441:       /* tree */
1442:       else if (gs->max_left_over)
1443:         {gs_gop_tree_binary(gs,vals,fct);}
1444: 
1445:       gs_gop_local_out(gs,vals);
1446:     }
1447:   /* if intersection tree/pairwise and local is empty */
1448:   else
1449:     {
1450:       /* pairwise */
1451:       if (gs->num_pairs)
1452:         {gs_gop_pairwise_binary(gs,vals,fct);}
1453: 
1454:       /* tree */
1455:       else if (gs->max_left_over)
1456:         {gs_gop_tree_binary(gs,vals,fct);}
1457:     }
1458: }



1462: /******************************************************************************
1463: Function: gather_scatter

1465: Input : 
1466: Output: 
1467: Return: 
1468: Description: 
1469: ******************************************************************************/
1470: static
1471: void
1472: gs_gop_local_binary(register gs_id *gs, register REAL *vals, register rbfp fct)
1473: {
1474:   register int *num, *map, **reduce;
1475:   REAL tmp;


1478: #ifdef DEBUG
1479:   error_msg_warning("start gs_gop_xxx()\n");
1480: #endif
1481:   num    = gs->num_local_reduce;
1482:   reduce = gs->local_reduce;
1483:   while ((map = *reduce))
1484:     {
1485:       num ++;
1486:       (*fct)(&tmp,NULL,1);
1487:       /* tmp = 0.0; */
1488:       while (*map >= 0)
1489:         {(*fct)(&tmp,(vals + *map),1); map++;}
1490:         /*        {tmp = (*fct)(tmp,*(vals + *map)); map++;} */
1491: 
1492:       map = *reduce++;
1493:       while (*map >= 0)
1494:         {*(vals + *map++) = tmp;}
1495:     }
1496: }



1500: /******************************************************************************
1501: Function: gather_scatter

1503: Input : 
1504: Output: 
1505: Return: 
1506: Description: 
1507: ******************************************************************************/
1508: static
1509: void
1510: gs_gop_local_in_binary(register gs_id *gs, register REAL *vals, register rbfp fct)
1511: {
1512:   register int *num, *map, **reduce;
1513:   register REAL *base;


1516: #ifdef DEBUG
1517:   error_msg_warning("start gs_gop_xxx()\n");
1518: #endif

1520:   num    = gs->num_gop_local_reduce;

1522:   reduce = gs->gop_local_reduce;
1523:   while ((map = *reduce++))
1524:     {
1525:       num++;
1526:       base = vals + *map++;
1527:       while (*map >= 0)
1528:         {(*fct)(base,(vals + *map),1); map++;}
1529:         /*        {*base = (*fct)(*base,*(vals + *map)); map++;} */
1530:     }
1531: }



1535: /******************************************************************************
1536: Function: gather_scatter

1538: VERSION 3 :: 

1540: Input : 
1541: Output: 
1542: Return: 
1543: Description: 
1544: ******************************************************************************/
1545: static
1546: void
1547: gs_gop_pairwise_binary(register gs_id *gs, register REAL *in_vals,
1548:                        register rbfp fct)
1549: {
1550:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
1551:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
1552:   register int *pw, *list, *size, **nodes;
1553:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1554:   MPI_Status status;


1557:   /* strip and load registers */
1558:   msg_list =list         = gs->pair_list;
1559:   msg_size =size         = gs->msg_sizes;
1560:   msg_nodes=nodes        = gs->node_list;
1561:   iptr=pw                = gs->pw_elm_list;
1562:   dptr1=dptr3            = gs->pw_vals;
1563:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1564:   msg_ids_out = ids_out  = gs->msg_ids_out;
1565:   dptr2                  = gs->out;
1566:   in1=in2                = gs->in;

1568:   /* post the receives */
1569:   /*  msg_nodes=nodes; */
1570:   do
1571:     {
1572:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1573:          second one *list and do list++ afterwards */
1574:       MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
1575:                 gs->gs_comm, msg_ids_in++);
1576:       in1 += *size++;
1577:     }
1578:   while (*++msg_nodes);
1579:   msg_nodes=nodes;

1581:   /* load gs values into in out gs buffers */
1582:   while (*iptr >= 0)
1583:     {*dptr3++ = *(in_vals + *iptr++);}

1585:   /* load out buffers and post the sends */
1586:   while ((iptr = *msg_nodes++))
1587:     {
1588:       dptr3 = dptr2;
1589:       while (*iptr >= 0)
1590:         {*dptr2++ = *(dptr1 + *iptr++);}
1591:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1592:       /* is msg_ids_out++ correct? */
1593:       MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *msg_list++,
1594:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
1595:     }

1597:   if (gs->max_left_over)
1598:     {gs_gop_tree_binary(gs,in_vals,fct);}

1600:   /* process the received data */
1601:   msg_nodes=nodes;
1602:   while ((iptr = *nodes++))
1603:     {
1604:       /* Should I check the return value of MPI_Wait() or status? */
1605:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1606:       MPI_Wait(ids_in++, &status);
1607:       while (*iptr >= 0)
1608:         {(*fct)((dptr1 + *iptr),in2,1); iptr++; in2++;}
1609:       /* {*(dptr1 + *iptr) = (*fct)(*(dptr1 + *iptr),*in2); iptr++; in2++;} */
1610:     }

1612:   /* replace vals */
1613:   while (*pw >= 0)
1614:     {*(in_vals + *pw++) = *dptr1++;}

1616:   /* clear isend message handles */
1617:   /* This changed for clarity though it could be the same */
1618:   while (*msg_nodes++)
1619:     /* Should I check the return value of MPI_Wait() or status? */
1620:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1621:     {MPI_Wait(ids_out++, &status);}
1622: }



1626: /******************************************************************************
1627: Function: gather_scatter

1629: Input : 
1630: Output: 
1631: Return: 
1632: Description: 
1633: ******************************************************************************/
1634: static
1635: void
1636: gs_gop_tree_binary(gs_id *gs, REAL *vals, register rbfp fct)
1637: {
1638:   int size;
1639:   int *in, *out;
1640:   REAL *buf, *work;

1642: #ifdef DEBUG
1643:   error_msg_warning("gs_gop_tree_binary() :: start\n");
1644: #endif
1645: 
1646:   in   = gs->tree_map_in;
1647:   out  = gs->tree_map_out;
1648:   buf  = gs->tree_buf;
1649:   work = gs->tree_work;
1650:   size = gs->tree_nel;

1652:   /* load vals vector w/identity */
1653:   (*fct)(buf,NULL,size);
1654: 
1655:   /* load my contribution into val vector */
1656:   while (*in >= 0)
1657:     {(*fct)((buf + *out++),(vals + *in++),-1);}
1658: /*    {*(buf + *out++) = *(vals + *in++);} */

1660:   gfop(buf,work,size,(vbfp)fct,REAL_TYPE,0);

1662:   in   = gs->tree_map_in;
1663:   out  = gs->tree_map_out;
1664:   while (*in >= 0)
1665:     {(*fct)((vals + *in++),(buf + *out++),-1);}
1666:     /*    {*(vals + *in++) = *(buf + *out++);} */


1669: #ifdef DEBUG
1670:   error_msg_warning("gs_gop_tree_binary() :: end\n");
1671: #endif

1673: }




1678: /******************************************************************************
1679: Function: gather_scatter

1681: Input : 
1682: Output: 
1683: Return: 
1684: Description: 
1685: ******************************************************************************/
1686: void
1687: gs_gop(register gs_id *gs, register REAL *vals, register const char *op)
1688: {
1689: #ifdef DEBUG
1690:   error_msg_warning("start gs_gop()\n");
1691:   if (!gs) {error_msg_fatal("gs_gop() :: passed NULL gs handle!!!");}
1692:   if (!op) {error_msg_fatal("gs_gop() :: passed NULL operation!!!");}
1693: #endif

1695:   switch (*op) {
1696:   case '+':
1697:     gs_gop_plus(gs,vals);
1698:     break;
1699:   case '*':
1700:     gs_gop_times(gs,vals);
1701:     break;
1702:   case 'a':
1703:     gs_gop_min_abs(gs,vals);
1704:     break;
1705:   case 'A':
1706:     gs_gop_max_abs(gs,vals);
1707:     break;
1708:   case 'e':
1709:     gs_gop_exists(gs,vals);
1710:     break;
1711:   case 'm':
1712:     gs_gop_min(gs,vals);
1713:     break;
1714:   case 'M':
1715:     gs_gop_max(gs,vals); break;
1716:     /*
1717:     if (*(op+1)=='\0')
1718:       {gs_gop_max(gs,vals); break;}
1719:     else if (*(op+1)=='X')
1720:       {gs_gop_max_abs(gs,vals); break;}
1721:     else if (*(op+1)=='N')
1722:       {gs_gop_min_abs(gs,vals); break;}
1723:     */
1724:   default:
1725:     error_msg_warning("gs_gop() :: %c is not a valid op",op[0]);
1726:     error_msg_warning("gs_gop() :: default :: plus");
1727:     gs_gop_plus(gs,vals);
1728:     break;
1729:   }
1730: #ifdef DEBUG
1731:   error_msg_warning("end gs_gop()\n");
1732: #endif
1733: }


1736: /******************************************************************************
1737: Function: gather_scatter

1739: Input : 
1740: Output: 
1741: Return: 
1742: Description: 
1743: ******************************************************************************/
1744: static void
1745: gs_gop_exists(register gs_id *gs, register REAL *vals)
1746: {
1747: #ifdef DEBUG
1748:   error_msg_warning("start gs_gop_xxx()\n");
1749: #endif

1751:   /* local only operations!!! */
1752:   if (gs->num_local)
1753:     {gs_gop_local_exists(gs,vals);}

1755:   /* if intersection tree/pairwise and local isn't empty */
1756:   if (gs->num_local_gop)
1757:     {
1758:       gs_gop_local_in_exists(gs,vals);

1760:       /* pairwise */
1761:       if (gs->num_pairs)
1762:         {gs_gop_pairwise_exists(gs,vals);}
1763: 
1764:       /* tree */
1765:       else if (gs->max_left_over)
1766:         {gs_gop_tree_exists(gs,vals);}
1767: 
1768:       gs_gop_local_out(gs,vals);
1769:     }
1770:   /* if intersection tree/pairwise and local is empty */
1771:   else
1772:     {
1773:       /* pairwise */
1774:       if (gs->num_pairs)
1775:         {gs_gop_pairwise_exists(gs,vals);}
1776: 
1777:       /* tree */
1778:       else if (gs->max_left_over)
1779:         {gs_gop_tree_exists(gs,vals);}
1780:     }
1781: }



1785: /******************************************************************************
1786: Function: gather_scatter

1788: Input : 
1789: Output: 
1790: Return: 
1791: Description: 
1792: ******************************************************************************/
1793: static
1794: void
1795: gs_gop_local_exists(register gs_id *gs, register REAL *vals)
1796: {
1797:   register int *num, *map, **reduce;
1798:   register REAL tmp;


1801: #ifdef DEBUG
1802:   error_msg_warning("start gs_gop_xxx()\n");
1803: #endif

1805:   num    = gs->num_local_reduce;
1806:   reduce = gs->local_reduce;
1807:   while ((map = *reduce))
1808:     {
1809:       num ++;
1810:       tmp = 0.0;
1811:       while (*map >= 0)
1812:         {tmp = EXISTS(tmp,*(vals + *map)); map++;}
1813: 
1814:       map = *reduce++;
1815:       while (*map >= 0)
1816:         {*(vals + *map++) = tmp;}
1817:     }
1818: }



1822: /******************************************************************************
1823: Function: gather_scatter

1825: Input : 
1826: Output: 
1827: Return: 
1828: Description: 
1829: ******************************************************************************/
1830: static
1831: void
1832: gs_gop_local_in_exists(register gs_id *gs, register REAL *vals)
1833: {
1834:   register int *num, *map, **reduce;
1835:   register REAL *base;


1838: #ifdef DEBUG
1839:   error_msg_warning("start gs_gop_xxx()\n");
1840: #endif

1842:   num    = gs->num_gop_local_reduce;
1843:   reduce = gs->gop_local_reduce;
1844:   while ((map = *reduce++))
1845:     {
1846:       num++;
1847:       base = vals + *map++;
1848:       while (*map >= 0)
1849:         {*base = EXISTS(*base,*(vals + *map)); map++;}
1850:     }
1851: }



1855: /******************************************************************************
1856: Function: gather_scatter

1858: VERSION 3 :: 

1860: Input : 
1861: Output: 
1862: Return: 
1863: Description: 
1864: ******************************************************************************/
1865: static
1866: void
1867: gs_gop_pairwise_exists(register gs_id *gs, register REAL *in_vals)
1868: {
1869:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
1870:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
1871:   register int *pw, *list, *size, **nodes;
1872:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1873:   MPI_Status status;


1876:   /* strip and load registers */
1877:   msg_list =list         = gs->pair_list;
1878:   msg_size =size         = gs->msg_sizes;
1879:   msg_nodes=nodes        = gs->node_list;
1880:   iptr=pw                = gs->pw_elm_list;
1881:   dptr1=dptr3            = gs->pw_vals;
1882:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1883:   msg_ids_out = ids_out  = gs->msg_ids_out;
1884:   dptr2                  = gs->out;
1885:   in1=in2                = gs->in;

1887:   /* post the receives */
1888:   /*  msg_nodes=nodes; */
1889:   do
1890:     {
1891:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1892:          second one *list and do list++ afterwards */
1893:       MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
1894:                 gs->gs_comm, msg_ids_in++);
1895:       in1 += *size++;
1896:     }
1897:   while (*++msg_nodes);
1898:   msg_nodes=nodes;

1900:   /* load gs values into in out gs buffers */
1901:   while (*iptr >= 0)
1902:     {*dptr3++ = *(in_vals + *iptr++);}

1904:   /* load out buffers and post the sends */
1905:   while ((iptr = *msg_nodes++))
1906:     {
1907:       dptr3 = dptr2;
1908:       while (*iptr >= 0)
1909:         {*dptr2++ = *(dptr1 + *iptr++);}
1910:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1911:       /* is msg_ids_out++ correct? */
1912:       MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *msg_list++,
1913:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
1914:     }

1916:   if (gs->max_left_over)
1917:     {gs_gop_tree_exists(gs,in_vals);}

1919:   /* process the received data */
1920:   msg_nodes=nodes;
1921:   while ((iptr = *nodes++))
1922:     {
1923:       /* Should I check the return value of MPI_Wait() or status? */
1924:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1925:       MPI_Wait(ids_in++, &status);
1926:       while (*iptr >= 0)
1927:         {*(dptr1 + *iptr) = EXISTS(*(dptr1 + *iptr),*in2); iptr++; in2++;}
1928:     }

1930:   /* replace vals */
1931:   while (*pw >= 0)
1932:     {*(in_vals + *pw++) = *dptr1++;}

1934:   /* clear isend message handles */
1935:   /* This changed for clarity though it could be the same */
1936:   while (*msg_nodes++)
1937:     /* Should I check the return value of MPI_Wait() or status? */
1938:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1939:     {MPI_Wait(ids_out++, &status);}
1940: }



1944: /******************************************************************************
1945: Function: gather_scatter

1947: Input : 
1948: Output: 
1949: Return: 
1950: Description: 
1951: ******************************************************************************/
1952: static
1953: void
1954: gs_gop_tree_exists(gs_id *gs, REAL *vals)
1955: {
1956:   int size;
1957:   int *in, *out;
1958:   REAL *buf, *work;
1959:   int op[] = {GL_EXISTS,0};


1962: #ifdef DEBUG
1963:   error_msg_warning("start gs_gop_tree_exists()");
1964: #endif
1965: 
1966:   in   = gs->tree_map_in;
1967:   out  = gs->tree_map_out;
1968:   buf  = gs->tree_buf;
1969:   work = gs->tree_work;
1970:   size = gs->tree_nel;

1972: #if defined  BLAS||CBLAS
1973:   *work = 0.0;
1974:   copy(size,work,0,buf,1);
1975: #else
1976:   rvec_zero(buf,size);
1977: #endif

1979:   while (*in >= 0)
1980:     {
1981:       /*
1982:       printf("%d :: out=%d\n",my_id,*out);
1983:       printf("%d :: in=%d\n",my_id,*in);
1984:       */
1985:       *(buf + *out++) = *(vals + *in++);
1986:     }

1988:   grop(buf,work,size,op);

1990:   in   = gs->tree_map_in;
1991:   out  = gs->tree_map_out;

1993:   while (*in >= 0)
1994:     {*(vals + *in++) = *(buf + *out++);}

1996: #ifdef DEBUG
1997:   error_msg_warning("start gs_gop_tree_exists()");
1998: #endif
1999: }



2003: /******************************************************************************
2004: Function: gather_scatter

2006: Input : 
2007: Output: 
2008: Return: 
2009: Description: 
2010: ******************************************************************************/
2011: static void
2012: gs_gop_max_abs(register gs_id *gs, register REAL *vals)
2013: {
2014: #ifdef DEBUG
2015:   error_msg_warning("start gs_gop_xxx()\n");
2016: #endif

2018:   /* local only operations!!! */
2019:   if (gs->num_local)
2020:     {gs_gop_local_max_abs(gs,vals);}

2022:   /* if intersection tree/pairwise and local isn't empty */
2023:   if (gs->num_local_gop)
2024:     {
2025:       gs_gop_local_in_max_abs(gs,vals);

2027:       /* pairwise */
2028:       if (gs->num_pairs)
2029:         {gs_gop_pairwise_max_abs(gs,vals);}
2030: 
2031:       /* tree */
2032:       else if (gs->max_left_over)
2033:         {gs_gop_tree_max_abs(gs,vals);}
2034: 
2035:       gs_gop_local_out(gs,vals);
2036:     }
2037:   /* if intersection tree/pairwise and local is empty */
2038:   else
2039:     {
2040:       /* pairwise */
2041:       if (gs->num_pairs)
2042:         {gs_gop_pairwise_max_abs(gs,vals);}
2043: 
2044:       /* tree */
2045:       else if (gs->max_left_over)
2046:         {gs_gop_tree_max_abs(gs,vals);}
2047:     }
2048: }



2052: /******************************************************************************
2053: Function: gather_scatter

2055: Input : 
2056: Output: 
2057: Return: 
2058: Description: 
2059: ******************************************************************************/
2060: static
2061: void
2062: gs_gop_local_max_abs(register gs_id *gs, register REAL *vals)
2063: {
2064:   register int *num, *map, **reduce;
2065:   register REAL tmp;


2068: #ifdef DEBUG
2069:   error_msg_warning("start gs_gop_xxx()\n");
2070: #endif

2072:   num    = gs->num_local_reduce;
2073:   reduce = gs->local_reduce;
2074:   while ((map = *reduce))
2075:     {
2076:       num ++;
2077:       tmp = 0.0;
2078:       while (*map >= 0)
2079:         {tmp = MAX_FABS(tmp,*(vals + *map)); map++;}
2080: 
2081:       map = *reduce++;
2082:       while (*map >= 0)
2083:         {*(vals + *map++) = tmp;}
2084:     }
2085: }



2089: /******************************************************************************
2090: Function: gather_scatter

2092: Input : 
2093: Output: 
2094: Return: 
2095: Description: 
2096: ******************************************************************************/
2097: static
2098: void
2099: gs_gop_local_in_max_abs(register gs_id *gs, register REAL *vals)
2100: {
2101:   register int *num, *map, **reduce;
2102:   register REAL *base;


2105: #ifdef DEBUG
2106:   error_msg_warning("start gs_gop_xxx()\n");
2107: #endif

2109:   num    = gs->num_gop_local_reduce;
2110:   reduce = gs->gop_local_reduce;
2111:   while ((map = *reduce++))
2112:     {
2113:       num++;
2114:       base = vals + *map++;
2115:       while (*map >= 0)
2116:         {*base = MAX_FABS(*base,*(vals + *map)); map++;}
2117:     }
2118: }



2122: /******************************************************************************
2123: Function: gather_scatter

2125: VERSION 3 :: 

2127: Input : 
2128: Output: 
2129: Return: 
2130: Description: 
2131: ******************************************************************************/
2132: static
2133: void
2134: gs_gop_pairwise_max_abs(register gs_id *gs, register REAL *in_vals)
2135: {
2136:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
2137:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
2138:   register int *pw, *list, *size, **nodes;
2139:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
2140:   MPI_Status status;


2143:   /* strip and load registers */
2144:   msg_list =list         = gs->pair_list;
2145:   msg_size =size         = gs->msg_sizes;
2146:   msg_nodes=nodes        = gs->node_list;
2147:   iptr=pw                = gs->pw_elm_list;
2148:   dptr1=dptr3            = gs->pw_vals;
2149:   msg_ids_in  = ids_in   = gs->msg_ids_in;
2150:   msg_ids_out = ids_out  = gs->msg_ids_out;
2151:   dptr2                  = gs->out;
2152:   in1=in2                = gs->in;

2154:   /* post the receives */
2155:   /*  msg_nodes=nodes; */
2156:   do
2157:     {
2158:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
2159:          second one *list and do list++ afterwards */
2160:       MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
2161:                 gs->gs_comm, msg_ids_in++);
2162:       in1 += *size++;
2163:     }
2164:   while (*++msg_nodes);
2165:   msg_nodes=nodes;

2167:   /* load gs values into in out gs buffers */
2168:   while (*iptr >= 0)
2169:     {*dptr3++ = *(in_vals + *iptr++);}

2171:   /* load out buffers and post the sends */
2172:   while ((iptr = *msg_nodes++))
2173:     {
2174:       dptr3 = dptr2;
2175:       while (*iptr >= 0)
2176:         {*dptr2++ = *(dptr1 + *iptr++);}
2177:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
2178:       /* is msg_ids_out++ correct? */
2179:       MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *msg_list++,
2180:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
2181:     }

2183:   if (gs->max_left_over)
2184:     {gs_gop_tree_max_abs(gs,in_vals);}

2186:   /* process the received data */
2187:   msg_nodes=nodes;
2188:   while ((iptr = *nodes++))
2189:     {
2190:       /* Should I check the return value of MPI_Wait() or status? */
2191:       /* Can this loop be replaced by a call to MPI_Waitall()? */
2192:       MPI_Wait(ids_in++, &status);
2193:       while (*iptr >= 0)
2194:         {*(dptr1 + *iptr) = MAX_FABS(*(dptr1 + *iptr),*in2); iptr++; in2++;}
2195:     }

2197:   /* replace vals */
2198:   while (*pw >= 0)
2199:     {*(in_vals + *pw++) = *dptr1++;}

2201:   /* clear isend message handles */
2202:   /* This changed for clarity though it could be the same */
2203:   while (*msg_nodes++)
2204:     /* Should I check the return value of MPI_Wait() or status? */
2205:     /* Can this loop be replaced by a call to MPI_Waitall()? */
2206:     {MPI_Wait(ids_out++, &status);}
2207: }



2211: /******************************************************************************
2212: Function: gather_scatter

2214: Input : 
2215: Output: 
2216: Return: 
2217: Description: 
2218: ******************************************************************************/
2219: static
2220: void
2221: gs_gop_tree_max_abs(gs_id *gs, REAL *vals)
2222: {
2223:   int size;
2224:   int *in, *out;
2225:   REAL *buf, *work;
2226:   int op[] = {GL_MAX_ABS,0};


2229: #ifdef DEBUG
2230:   error_msg_warning("start gs_gop_tree_max_abs()");
2231: #endif
2232: 
2233:   in   = gs->tree_map_in;
2234:   out  = gs->tree_map_out;
2235:   buf  = gs->tree_buf;
2236:   work = gs->tree_work;
2237:   size = gs->tree_nel;

2239: #if defined BLAS||CBLAS
2240:   *work = 0.0;
2241:   copy(size,work,0,buf,1);
2242: #else
2243:   rvec_zero(buf,size);
2244: #endif

2246:   while (*in >= 0)
2247:     {
2248:       /*
2249:       printf("%d :: out=%d\n",my_id,*out);
2250:       printf("%d :: in=%d\n",my_id,*in);
2251:       */
2252:       *(buf + *out++) = *(vals + *in++);
2253:     }

2255:   grop(buf,work,size,op);

2257:   in   = gs->tree_map_in;
2258:   out  = gs->tree_map_out;

2260:   while (*in >= 0)
2261:     {*(vals + *in++) = *(buf + *out++);}

2263: #ifdef DEBUG
2264:   error_msg_warning("start gs_gop_tree_max_abs()");
2265: #endif
2266: }



2270: /******************************************************************************
2271: Function: gather_scatter

2273: Input : 
2274: Output: 
2275: Return: 
2276: Description: 
2277: ******************************************************************************/
2278: static void
2279: gs_gop_max(register gs_id *gs, register REAL *vals)
2280: {
2281: #ifdef DEBUG
2282:   error_msg_warning("start gs_gop_xxx()\n");
2283: #endif


2286:   /* local only operations!!! */
2287:   if (gs->num_local)
2288:     {gs_gop_local_max(gs,vals);}

2290:   /* if intersection tree/pairwise and local isn't empty */
2291:   if (gs->num_local_gop)
2292:     {
2293:       gs_gop_local_in_max(gs,vals);

2295:       /* pairwise */
2296:       if (gs->num_pairs)
2297:         {gs_gop_pairwise_max(gs,vals);}
2298: 
2299:       /* tree */
2300:       else if (gs->max_left_over)
2301:         {gs_gop_tree_max(gs,vals);}
2302: 
2303:       gs_gop_local_out(gs,vals);
2304:     }
2305:   /* if intersection tree/pairwise and local is empty */
2306:   else
2307:     {
2308:       /* pairwise */
2309:       if (gs->num_pairs)
2310:         {gs_gop_pairwise_max(gs,vals);}
2311: 
2312:       /* tree */
2313:       else if (gs->max_left_over)
2314:         {gs_gop_tree_max(gs,vals);}
2315:     }
2316: }



2320: /******************************************************************************
2321: Function: gather_scatter

2323: Input : 
2324: Output: 
2325: Return: 
2326: Description: 
2327: ******************************************************************************/
2328: static
2329: void
2330: gs_gop_local_max(register gs_id *gs, register REAL *vals)
2331: {
2332:   register int *num, *map, **reduce;
2333:   register REAL tmp;


2336: #ifdef DEBUG
2337:   error_msg_warning("start gs_gop_xxx()\n");
2338: #endif

2340:   num    = gs->num_local_reduce;
2341:   reduce = gs->local_reduce;
2342:   while ((map = *reduce))
2343:     {
2344:       num ++;
2345:       tmp = -REAL_MAX;
2346:       while (*map >= 0)
2347:         {tmp = PetscMax(tmp,*(vals + *map)); map++;}
2348: 
2349:       map = *reduce++;
2350:       while (*map >= 0)
2351:         {*(vals + *map++) = tmp;}
2352:     }
2353: }



2357: /******************************************************************************
2358: Function: gather_scatter

2360: Input : 
2361: Output: 
2362: Return: 
2363: Description: 
2364: ******************************************************************************/
2365: static
2366: void
2367: gs_gop_local_in_max(register gs_id *gs, register REAL *vals)
2368: {
2369:   register int *num, *map, **reduce;
2370:   register REAL *base;


2373: #ifdef DEBUG
2374:   error_msg_warning("start gs_gop_xxx()\n");
2375: #endif

2377:   num    = gs->num_gop_local_reduce;
2378:   reduce = gs->gop_local_reduce;
2379:   while ((map = *reduce++))
2380:     {
2381:       num++;
2382:       base = vals + *map++;
2383:       while (*map >= 0)
2384:         {*base = PetscMax(*base,*(vals + *map)); map++;}
2385:     }
2386: }



2390: /******************************************************************************
2391: Function: gather_scatter

2393: VERSION 3 :: 

2395: Input : 
2396: Output: 
2397: Return: 
2398: Description: 
2399: ******************************************************************************/
2400: static
2401: void
2402: gs_gop_pairwise_max(register gs_id *gs, register REAL *in_vals)
2403: {
2404:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
2405:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
2406:   register int *pw, *list, *size, **nodes;
2407:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
2408:   MPI_Status status;


2411:   /* strip and load registers */
2412:   msg_list =list         = gs->pair_list;
2413:   msg_size =size         = gs->msg_sizes;
2414:   msg_nodes=nodes        = gs->node_list;
2415:   iptr=pw                = gs->pw_elm_list;
2416:   dptr1=dptr3            = gs->pw_vals;
2417:   msg_ids_in  = ids_in   = gs->msg_ids_in;
2418:   msg_ids_out = ids_out  = gs->msg_ids_out;
2419:   dptr2                  = gs->out;
2420:   in1=in2                = gs->in;

2422:   /* post the receives */
2423:   /*  msg_nodes=nodes; */
2424:   do
2425:     {
2426:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
2427:          second one *list and do list++ afterwards */
2428:       MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
2429:                 gs->gs_comm, msg_ids_in++);
2430:       in1 += *size++;
2431:     }
2432:   while (*++msg_nodes);
2433:   msg_nodes=nodes;

2435:   /* load gs values into in out gs buffers */
2436:   while (*iptr >= 0)
2437:     {*dptr3++ = *(in_vals + *iptr++);}

2439:   /* load out buffers and post the sends */
2440:   while ((iptr = *msg_nodes++))
2441:     {
2442:       dptr3 = dptr2;
2443:       while (*iptr >= 0)
2444:         {*dptr2++ = *(dptr1 + *iptr++);}
2445:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
2446:       /* is msg_ids_out++ correct? */
2447:       MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *msg_list++,
2448:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
2449:     }

2451:   if (gs->max_left_over)
2452:     {gs_gop_tree_max(gs,in_vals);}

2454:   /* process the received data */
2455:   msg_nodes=nodes;
2456:   while ((iptr = *nodes++))
2457:     {
2458:       /* Should I check the return value of MPI_Wait() or status? */
2459:       /* Can this loop be replaced by a call to MPI_Waitall()? */
2460:       MPI_Wait(ids_in++, &status);
2461:       while (*iptr >= 0)
2462:         {*(dptr1 + *iptr) = PetscMax(*(dptr1 + *iptr),*in2); iptr++; in2++;}
2463:     }

2465:   /* replace vals */
2466:   while (*pw >= 0)
2467:     {*(in_vals + *pw++) = *dptr1++;}

2469:   /* clear isend message handles */
2470:   /* This changed for clarity though it could be the same */
2471:   while (*msg_nodes++)
2472:     /* Should I check the return value of MPI_Wait() or status? */
2473:     /* Can this loop be replaced by a call to MPI_Waitall()? */
2474:     {MPI_Wait(ids_out++, &status);}
2475: }



2479: /******************************************************************************
2480: Function: gather_scatter

2482: Input : 
2483: Output: 
2484: Return: 
2485: Description: 
2486: ******************************************************************************/
2487: static
2488: void
2489: gs_gop_tree_max(gs_id *gs, REAL *vals)
2490: {
2491:   int size;
2492:   int *in, *out;
2493:   REAL *buf, *work;
2494:   /* int op[] = {GL_MAX,0}; */


2497: #ifdef DEBUG
2498:   error_msg_warning("start gs_gop_tree_max()");
2499: #endif
2500: 
2501:   in   = gs->tree_map_in;
2502:   out  = gs->tree_map_out;
2503:   buf  = gs->tree_buf;
2504:   work = gs->tree_work;
2505:   size = gs->tree_nel;

2507: #if defined BLAS||CBLAS
2508:   *work = -REAL_MAX;
2509:   copy(size,work,0,buf,1);
2510: #else
2511:   rvec_set(buf,-REAL_MAX,size);
2512: #endif

2514:   while (*in >= 0)
2515:     {*(buf + *out++) = *(vals + *in++);}

2517:   in   = gs->tree_map_in;
2518:   out  = gs->tree_map_out;
2519:   MPI_Allreduce(buf,work,size,REAL_TYPE,MPI_MAX,gs->gs_comm);
2520:   while (*in >= 0)
2521:     {*(vals + *in++) = *(work + *out++);}

2523: #ifdef DEBUG
2524:   error_msg_warning("end gs_gop_tree_max()");
2525: #endif
2526: }



2530: /******************************************************************************
2531: Function: gather_scatter

2533: Input : 
2534: Output: 
2535: Return: 
2536: Description: 
2537: ******************************************************************************/
2538: static void
2539: gs_gop_min_abs(register gs_id *gs, register REAL *vals)
2540: {
2541: #ifdef DEBUG
2542:   error_msg_warning("start gs_gop_xxx()\n");
2543: #endif

2545:   /* local only operations!!! */
2546:   if (gs->num_local)
2547:     {gs_gop_local_min_abs(gs,vals);}

2549:   /* if intersection tree/pairwise and local isn't empty */
2550:   if (gs->num_local_gop)
2551:     {
2552:       gs_gop_local_in_min_abs(gs,vals);

2554:       /* pairwise */
2555:       if (gs->num_pairs)
2556:         {gs_gop_pairwise_min_abs(gs,vals);}
2557: 
2558:       /* tree */
2559:       else if (gs->max_left_over)
2560:         {gs_gop_tree_min_abs(gs,vals);}
2561: 
2562:       gs_gop_local_out(gs,vals);
2563:     }
2564:   /* if intersection tree/pairwise and local is empty */
2565:   else
2566:     {
2567:       /* pairwise */
2568:       if (gs->num_pairs)
2569:         {gs_gop_pairwise_min_abs(gs,vals);}
2570: 
2571:       /* tree */
2572:       else if (gs->max_left_over)
2573:         {gs_gop_tree_min_abs(gs,vals);}
2574:     }
2575: }



2579: /******************************************************************************
2580: Function: gather_scatter

2582: Input : 
2583: Output: 
2584: Return: 
2585: Description: 
2586: ******************************************************************************/
2587: static
2588: void
2589: gs_gop_local_min_abs(register gs_id *gs, register REAL *vals)
2590: {
2591:   register int *num, *map, **reduce;
2592:   register REAL tmp;


2595: #ifdef DEBUG
2596:   error_msg_warning("start gs_gop_xxx()\n");
2597: #endif

2599:   num    = gs->num_local_reduce;
2600:   reduce = gs->local_reduce;
2601:   while ((map = *reduce))
2602:     {
2603:       num ++;
2604:       tmp = REAL_MAX;
2605:       while (*map >= 0)
2606:         {tmp = MIN_FABS(tmp,*(vals + *map)); map++;}
2607: 
2608:       map = *reduce++;
2609:       while (*map >= 0)
2610:         {*(vals + *map++) = tmp;}
2611:     }
2612: }



2616: /******************************************************************************
2617: Function: gather_scatter

2619: Input : 
2620: Output: 
2621: Return: 
2622: Description: 
2623: ******************************************************************************/
2624: static
2625: void
2626: gs_gop_local_in_min_abs(register gs_id *gs, register REAL *vals)
2627: {
2628:   register int *num, *map, **reduce;
2629:   register REAL *base;

2631: #ifdef DEBUG
2632:   error_msg_warning("start gs_gop_xxx()\n");
2633: #endif

2635:   num    = gs->num_gop_local_reduce;
2636:   reduce = gs->gop_local_reduce;
2637:   while ((map = *reduce++))
2638:     {
2639:       num++;
2640:       base = vals + *map++;
2641:       while (*map >= 0)
2642:         {*base = MIN_FABS(*base,*(vals + *map)); map++;}
2643:     }
2644: }



2648: /******************************************************************************
2649: Function: gather_scatter

2651: VERSION 3 :: 

2653: Input : 
2654: Output: 
2655: Return: 
2656: Description: 
2657: ******************************************************************************/
2658: static
2659: void
2660: gs_gop_pairwise_min_abs(register gs_id *gs, register REAL *in_vals)
2661: {
2662:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
2663:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
2664:   register int *pw, *list, *size, **nodes;
2665:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
2666:   MPI_Status status;


2669:   /* strip and load registers */
2670:   msg_list =list         = gs->pair_list;
2671:   msg_size =size         = gs->msg_sizes;
2672:   msg_nodes=nodes        = gs->node_list;
2673:   iptr=pw                = gs->pw_elm_list;
2674:   dptr1=dptr3            = gs->pw_vals;
2675:   msg_ids_in  = ids_in   = gs->msg_ids_in;
2676:   msg_ids_out = ids_out  = gs->msg_ids_out;
2677:   dptr2                  = gs->out;
2678:   in1=in2                = gs->in;

2680:   /* post the receives */
2681:   /*  msg_nodes=nodes; */
2682:   do
2683:     {
2684:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
2685:          second one *list and do list++ afterwards */
2686:       MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
2687:                 gs->gs_comm, msg_ids_in++);
2688:       in1 += *size++;
2689:     }
2690:   while (*++msg_nodes);
2691:   msg_nodes=nodes;

2693:   /* load gs values into in out gs buffers */
2694:   while (*iptr >= 0)
2695:     {*dptr3++ = *(in_vals + *iptr++);}

2697:   /* load out buffers and post the sends */
2698:   while ((iptr = *msg_nodes++))
2699:     {
2700:       dptr3 = dptr2;
2701:       while (*iptr >= 0)
2702:         {*dptr2++ = *(dptr1 + *iptr++);}
2703:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
2704:       /* is msg_ids_out++ correct? */
2705:       MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *msg_list++,
2706:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
2707:     }

2709:   if (gs->max_left_over)
2710:     {gs_gop_tree_min_abs(gs,in_vals);}

2712:   /* process the received data */
2713:   msg_nodes=nodes;
2714:   while ((iptr = *nodes++))
2715:     {
2716:       /* Should I check the return value of MPI_Wait() or status? */
2717:       /* Can this loop be replaced by a call to MPI_Waitall()? */
2718:       MPI_Wait(ids_in++, &status);
2719:       while (*iptr >= 0)
2720:         {*(dptr1 + *iptr) = MIN_FABS(*(dptr1 + *iptr),*in2); iptr++; in2++;}
2721:     }

2723:   /* replace vals */
2724:   while (*pw >= 0)
2725:     {*(in_vals + *pw++) = *dptr1++;}

2727:   /* clear isend message handles */
2728:   /* This changed for clarity though it could be the same */
2729:   while (*msg_nodes++)
2730:     /* Should I check the return value of MPI_Wait() or status? */
2731:     /* Can this loop be replaced by a call to MPI_Waitall()? */
2732:     {MPI_Wait(ids_out++, &status);}
2733: }



2737: /******************************************************************************
2738: Function: gather_scatter

2740: Input : 
2741: Output: 
2742: Return: 
2743: Description: 
2744: ******************************************************************************/
2745: static
2746: void
2747: gs_gop_tree_min_abs(gs_id *gs, REAL *vals)
2748: {
2749:   int size;
2750:   int *in, *out;
2751:   REAL *buf, *work;
2752:   int op[] = {GL_MIN_ABS,0};


2755: #ifdef DEBUG
2756:   error_msg_warning("start gs_gop_tree_min_abs()");
2757: #endif
2758: 
2759:   in   = gs->tree_map_in;
2760:   out  = gs->tree_map_out;
2761:   buf  = gs->tree_buf;
2762:   work = gs->tree_work;
2763:   size = gs->tree_nel;

2765: #if defined  BLAS||CBLAS
2766:   *work = REAL_MAX;
2767:   copy(size,work,0,buf,1);
2768: #else
2769:   rvec_set(buf,REAL_MAX,size);
2770: #endif

2772:   while (*in >= 0)
2773:     {*(buf + *out++) = *(vals + *in++);}

2775:   in   = gs->tree_map_in;
2776:   out  = gs->tree_map_out;
2777:   grop(buf,work,size,op);
2778:   while (*in >= 0)
2779:     {*(vals + *in++) = *(buf + *out++);}

2781: #ifdef DEBUG
2782:   error_msg_warning("end gs_gop_tree_min_abs()");
2783: #endif
2784: }



2788: /******************************************************************************
2789: Function: gather_scatter

2791: Input : 
2792: Output: 
2793: Return: 
2794: Description: 
2795: ******************************************************************************/
2796: static void
2797: gs_gop_min(register gs_id *gs, register REAL *vals)
2798: {
2799: #ifdef DEBUG
2800:   error_msg_warning("start gs_gop_xxx()\n");
2801: #endif

2803:   /* local only operations!!! */
2804:   if (gs->num_local)
2805:     {gs_gop_local_min(gs,vals);}

2807:   /* if intersection tree/pairwise and local isn't empty */
2808:   if (gs->num_local_gop)
2809:     {
2810:       gs_gop_local_in_min(gs,vals);

2812:       /* pairwise */
2813:       if (gs->num_pairs)
2814:         {gs_gop_pairwise_min(gs,vals);}
2815: 
2816:       /* tree */
2817:       else if (gs->max_left_over)
2818:         {gs_gop_tree_min(gs,vals);}
2819: 
2820:       gs_gop_local_out(gs,vals);
2821:     }
2822:   /* if intersection tree/pairwise and local is empty */
2823:   else
2824:     {
2825:       /* pairwise */
2826:       if (gs->num_pairs)
2827:         {gs_gop_pairwise_min(gs,vals);}
2828: 
2829:       /* tree */
2830:       else if (gs->max_left_over)
2831:         {gs_gop_tree_min(gs,vals);}
2832:     }
2833: #ifdef DEBUG
2834:   error_msg_warning("end gs_gop_xxx()\n");
2835: #endif
2836: }



2840: /******************************************************************************
2841: Function: gather_scatter

2843: Input : 
2844: Output: 
2845: Return: 
2846: Description: 
2847: ******************************************************************************/
2848: static
2849: void
2850: gs_gop_local_min(register gs_id *gs, register REAL *vals)
2851: {
2852:   register int *num, *map, **reduce;
2853:   register REAL tmp;


2856: #ifdef DEBUG
2857:   error_msg_warning("start gs_gop_xxx()\n");
2858: #endif

2860:   num    = gs->num_local_reduce;
2861:   reduce = gs->local_reduce;
2862:   while ((map = *reduce))
2863:     {
2864:       num ++;
2865:       tmp = REAL_MAX;
2866:       while (*map >= 0)
2867:         {tmp = PetscMin(tmp,*(vals + *map)); map++;}
2868: 
2869:       map = *reduce++;
2870:       while (*map >= 0)
2871:         {*(vals + *map++) = tmp;}
2872:     }
2873: }



2877: /******************************************************************************
2878: Function: gather_scatter

2880: Input : 
2881: Output: 
2882: Return: 
2883: Description: 
2884: ******************************************************************************/
2885: static
2886: void
2887: gs_gop_local_in_min(register gs_id *gs, register REAL *vals)
2888: {
2889:   register int *num, *map, **reduce;
2890:   register REAL *base;


2893: #ifdef DEBUG
2894:   error_msg_warning("start gs_gop_xxx()\n");
2895: #endif

2897:   num    = gs->num_gop_local_reduce;
2898:   reduce = gs->gop_local_reduce;
2899:   while ((map = *reduce++))
2900:     {
2901:       num++;
2902:       base = vals + *map++;
2903:       while (*map >= 0)
2904:         {*base = PetscMin(*base,*(vals + *map)); map++;}
2905:     }
2906: }



2910: /******************************************************************************
2911: Function: gather_scatter

2913: VERSION 3 :: 

2915: Input : 
2916: Output: 
2917: Return: 
2918: Description: 
2919: ******************************************************************************/
2920: static
2921: void
2922: gs_gop_pairwise_min(register gs_id *gs, register REAL *in_vals)
2923: {
2924:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
2925:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
2926:   register int *pw, *list, *size, **nodes;
2927:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
2928:   MPI_Status status;


2931:   /* strip and load registers */
2932:   msg_list =list         = gs->pair_list;
2933:   msg_size =size         = gs->msg_sizes;
2934:   msg_nodes=nodes        = gs->node_list;
2935:   iptr=pw                = gs->pw_elm_list;
2936:   dptr1=dptr3            = gs->pw_vals;
2937:   msg_ids_in  = ids_in   = gs->msg_ids_in;
2938:   msg_ids_out = ids_out  = gs->msg_ids_out;
2939:   dptr2                  = gs->out;
2940:   in1=in2                = gs->in;

2942:   /* post the receives */
2943:   /*  msg_nodes=nodes; */
2944:   do
2945:     {
2946:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
2947:          second one *list and do list++ afterwards */
2948:       MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
2949:                 gs->gs_comm, msg_ids_in++);
2950:       in1 += *size++;
2951:     }
2952:   while (*++msg_nodes);
2953:   msg_nodes=nodes;

2955:   /* load gs values into in out gs buffers */
2956:   while (*iptr >= 0)
2957:     {*dptr3++ = *(in_vals + *iptr++);}

2959:   /* load out buffers and post the sends */
2960:   while ((iptr = *msg_nodes++))
2961:     {
2962:       dptr3 = dptr2;
2963:       while (*iptr >= 0)
2964:         {*dptr2++ = *(dptr1 + *iptr++);}
2965:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
2966:       /* is msg_ids_out++ correct? */
2967:       MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *msg_list++,
2968:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
2969:     }

2971:   /* process the received data */
2972:   if (gs->max_left_over)
2973:     {gs_gop_tree_min(gs,in_vals);}

2975:   msg_nodes=nodes;
2976:   while ((iptr = *nodes++))
2977:     {
2978:       /* Should I check the return value of MPI_Wait() or status? */
2979:       /* Can this loop be replaced by a call to MPI_Waitall()? */
2980:       MPI_Wait(ids_in++, &status);
2981:       while (*iptr >= 0)
2982:         {*(dptr1 + *iptr) = PetscMin(*(dptr1 + *iptr),*in2); iptr++; in2++;}
2983:     }

2985:   /* replace vals */
2986:   while (*pw >= 0)
2987:     {*(in_vals + *pw++) = *dptr1++;}

2989:   /* clear isend message handles */
2990:   /* This changed for clarity though it could be the same */
2991:   while (*msg_nodes++)
2992:     /* Should I check the return value of MPI_Wait() or status? */
2993:     /* Can this loop be replaced by a call to MPI_Waitall()? */
2994:     {MPI_Wait(ids_out++, &status);}
2995: }



2999: /******************************************************************************
3000: Function: gather_scatter

3002: Input : 
3003: Output: 
3004: Return: 
3005: Description: 
3006: ******************************************************************************/
3007: static
3008: void
3009: gs_gop_tree_min(gs_id *gs, REAL *vals)
3010: {
3011:   int size;
3012:   int *in, *out;
3013:   REAL *buf, *work;
3014:   /*int op[] = {GL_MIN,0};*/


3017: #ifdef DEBUG
3018:   error_msg_warning("start gs_gop_tree_min()");
3019: #endif
3020: 
3021:   in   = gs->tree_map_in;
3022:   out  = gs->tree_map_out;
3023:   buf  = gs->tree_buf;
3024:   work = gs->tree_work;
3025:   size = gs->tree_nel;

3027: #if defined  BLAS||CBLAS
3028:   *work = REAL_MAX;
3029:   copy(size,work,0,buf,1);
3030: #else
3031:   rvec_set(buf,REAL_MAX,size);
3032: #endif

3034:   while (*in >= 0)
3035:     {*(buf + *out++) = *(vals + *in++);}

3037:   in   = gs->tree_map_in;
3038:   out  = gs->tree_map_out;
3039:   MPI_Allreduce(buf,work,size,REAL_TYPE,MPI_MIN,gs->gs_comm);
3040:   while (*in >= 0)
3041:     {*(vals + *in++) = *(work + *out++);}

3043: #ifdef DEBUG
3044:   error_msg_warning("end gs_gop_tree_min()");
3045: #endif
3046: }



3050: /******************************************************************************
3051: Function: gather_scatter

3053: Input : 
3054: Output: 
3055: Return: 
3056: Description: 
3057: ******************************************************************************/
3058: static void
3059: gs_gop_times(register gs_id *gs, register REAL *vals)
3060: {
3061: #ifdef DEBUG  
3062:   error_msg_warning("start gs_gop_times()\n");
3063: #endif

3065:   /* local only operations!!! */
3066:   if (gs->num_local)
3067:     {gs_gop_local_times(gs,vals);}

3069:   /* if intersection tree/pairwise and local isn't empty */
3070:   if (gs->num_local_gop)
3071:     {
3072:       gs_gop_local_in_times(gs,vals);

3074:       /* pairwise */
3075:       if (gs->num_pairs)
3076:         {gs_gop_pairwise_times(gs,vals);}
3077: 
3078:       /* tree */
3079:       else if (gs->max_left_over)
3080:         {gs_gop_tree_times(gs,vals);}
3081: 
3082:       gs_gop_local_out(gs,vals);
3083:     }
3084:   /* if intersection tree/pairwise and local is empty */
3085:   else
3086:     {
3087:       /* pairwise */
3088:       if (gs->num_pairs)
3089:         {gs_gop_pairwise_times(gs,vals);}
3090: 
3091:       /* tree */
3092:       else if (gs->max_left_over)
3093:         {gs_gop_tree_times(gs,vals);}
3094:     }
3095: }



3099: /******************************************************************************
3100: Function: gather_scatter

3102: Input : 
3103: Output: 
3104: Return: 
3105: Description: 
3106: ******************************************************************************/
3107: static
3108: void
3109: gs_gop_local_times(register gs_id *gs, register REAL *vals)
3110: {
3111:   register int *num, *map, **reduce;
3112:   register REAL tmp;


3115: #ifdef DEBUG  
3116:   error_msg_warning("start gs_gop_xxx()\n");
3117: #endif

3119:   num    = gs->num_local_reduce;
3120:   reduce = gs->local_reduce;
3121:   while ((map = *reduce))
3122:     {
3123:       /* wall */
3124:       if (*num == 2)
3125:         {
3126:           num ++; reduce++;
3127:           vals[map[1]] = vals[map[0]] *= vals[map[1]];
3128:         }
3129:       /* corner shared by three elements */
3130:       else if (*num == 3)
3131:         {
3132:           num ++; reduce++;
3133:           vals[map[2]]=vals[map[1]]=vals[map[0]]*=(vals[map[1]]*vals[map[2]]);
3134:         }
3135:       /* corner shared by four elements */
3136:       else if (*num == 4)
3137:         {
3138:           num ++; reduce++;
3139:           vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] *=
3140:                                  (vals[map[1]] * vals[map[2]] * vals[map[3]]);
3141:         }
3142:       /* general case ... odd geoms ... 3D*/
3143:       else
3144:         {
3145:           num ++;
3146:            tmp = 1.0;
3147:           while (*map >= 0)
3148:             {tmp *= *(vals + *map++);}

3150:           map = *reduce++;
3151:           while (*map >= 0)
3152:             {*(vals + *map++) = tmp;}
3153:         }
3154:     }
3155: }



3159: /******************************************************************************
3160: Function: gather_scatter

3162: Input : 
3163: Output: 
3164: Return: 
3165: Description: 
3166: ******************************************************************************/
3167: static
3168: void
3169: gs_gop_local_in_times(register gs_id *gs, register REAL *vals)
3170: {
3171:   register int *num, *map, **reduce;
3172:   register REAL *base;


3175: #ifdef DEBUG  
3176:   error_msg_warning("start gs_gop_xxx()\n");
3177: #endif

3179:   num    = gs->num_gop_local_reduce;
3180:   reduce = gs->gop_local_reduce;
3181:   while ((map = *reduce++))
3182:     {
3183:       /* wall */
3184:       if (*num == 2)
3185:         {
3186:           num ++;
3187:           vals[map[0]] *= vals[map[1]];
3188:         }
3189:       /* corner shared by three elements */
3190:       else if (*num == 3)
3191:         {
3192:           num ++;
3193:           vals[map[0]] *= (vals[map[1]] * vals[map[2]]);
3194:         }
3195:       /* corner shared by four elements */
3196:       else if (*num == 4)
3197:         {
3198:           num ++;
3199:           vals[map[0]] *= (vals[map[1]] * vals[map[2]] * vals[map[3]]);
3200:         }
3201:       /* general case ... odd geoms ... 3D*/
3202:       else
3203:         {
3204:           num++;
3205:           base = vals + *map++;
3206:           while (*map >= 0)
3207:             {*base *= *(vals + *map++);}
3208:         }
3209:     }
3210: }



3214: /******************************************************************************
3215: Function: gather_scatter

3217: VERSION 3 :: 

3219: Input : 
3220: Output: 
3221: Return: 
3222: Description: 
3223: ******************************************************************************/
3224: static
3225: void
3226: gs_gop_pairwise_times(register gs_id *gs, register REAL *in_vals)
3227: {
3228:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
3229:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
3230:   register int *pw, *list, *size, **nodes;
3231:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
3232:   MPI_Status status;


3235:   /* strip and load registers */
3236:   msg_list =list         = gs->pair_list;
3237:   msg_size =size         = gs->msg_sizes;
3238:   msg_nodes=nodes        = gs->node_list;
3239:   iptr=pw                = gs->pw_elm_list;
3240:   dptr1=dptr3            = gs->pw_vals;
3241:   msg_ids_in  = ids_in   = gs->msg_ids_in;
3242:   msg_ids_out = ids_out  = gs->msg_ids_out;
3243:   dptr2                  = gs->out;
3244:   in1=in2                = gs->in;

3246:   /* post the receives */
3247:   /*  msg_nodes=nodes; */
3248:   do
3249:     {
3250:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
3251:          second one *list and do list++ afterwards */
3252:       MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
3253:                 gs->gs_comm, msg_ids_in++);
3254:       in1 += *size++;
3255:     }
3256:   while (*++msg_nodes);
3257:   msg_nodes=nodes;

3259:   /* load gs values into in out gs buffers */
3260:   while (*iptr >= 0)
3261:     {*dptr3++ = *(in_vals + *iptr++);}

3263:   /* load out buffers and post the sends */
3264:   while ((iptr = *msg_nodes++))
3265:     {
3266:       dptr3 = dptr2;
3267:       while (*iptr >= 0)
3268:         {*dptr2++ = *(dptr1 + *iptr++);}
3269:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
3270:       /* is msg_ids_out++ correct? */
3271:       MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *msg_list++,
3272:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
3273:     }

3275:   if (gs->max_left_over)
3276:     {gs_gop_tree_times(gs,in_vals);}

3278:   /* process the received data */
3279:   msg_nodes=nodes;
3280:   while ((iptr = *nodes++))
3281:     {
3282:       /* Should I check the return value of MPI_Wait() or status? */
3283:       /* Can this loop be replaced by a call to MPI_Waitall()? */
3284:       MPI_Wait(ids_in++, &status);
3285:       while (*iptr >= 0)
3286:         {*(dptr1 + *iptr++) *= *in2++;}
3287:     }

3289:   /* replace vals */
3290:   while (*pw >= 0)
3291:     {*(in_vals + *pw++) = *dptr1++;}

3293:   /* clear isend message handles */
3294:   /* This changed for clarity though it could be the same */
3295:   while (*msg_nodes++)
3296:     /* Should I check the return value of MPI_Wait() or status? */
3297:     /* Can this loop be replaced by a call to MPI_Waitall()? */
3298:     {MPI_Wait(ids_out++, &status);}
3299: }



3303: /******************************************************************************
3304: Function: gather_scatter

3306: Input : 
3307: Output: 
3308: Return: 
3309: Description: 
3310: ******************************************************************************/
3311: static
3312: void
3313: gs_gop_tree_times(gs_id *gs, REAL *vals)
3314: {
3315:   int size;
3316:   int *in, *out;
3317:   REAL *buf, *work;
3318:   /*int op[] = {GL_MULT,0};*/


3321: #ifdef DEBUG
3322:   error_msg_warning("start gs_gop_tree_times()");
3323: #endif
3324: 
3325:   in   = gs->tree_map_in;
3326:   out  = gs->tree_map_out;
3327:   buf  = gs->tree_buf;
3328:   work = gs->tree_work;
3329:   size = gs->tree_nel;

3331: #if defined  BLAS||CBLAS
3332:   *work = 1.0;
3333:   copy(size,work,0,buf,1);
3334: #else
3335:   rvec_one(buf,size);
3336: #endif

3338:   while (*in >= 0)
3339:     {*(buf + *out++) = *(vals + *in++);}

3341:   in   = gs->tree_map_in;
3342:   out  = gs->tree_map_out;
3343:   MPI_Allreduce(buf,work,size,REAL_TYPE,MPI_PROD,gs->gs_comm);
3344:   while (*in >= 0)
3345:     {*(vals + *in++) = *(work + *out++);}

3347: #ifdef DEBUG
3348:   error_msg_warning("end gs_gop_tree_times()");
3349: #endif
3350: }



3354: /******************************************************************************
3355: Function: gather_scatter


3358: Input : 
3359: Output: 
3360: Return: 
3361: Description: 
3362: ******************************************************************************/
3363: static void
3364: gs_gop_plus(register gs_id *gs, register REAL *vals)
3365: {
3366: #ifdef DEBUG
3367:   error_msg_warning("start gs_gop_plus()\n");
3368: #endif

3370:   /* local only operations!!! */
3371:   if (gs->num_local)
3372:     {gs_gop_local_plus(gs,vals);}

3374:   /* if intersection tree/pairwise and local isn't empty */
3375:   if (gs->num_local_gop)
3376:     {
3377:       gs_gop_local_in_plus(gs,vals);

3379:       /* pairwise will NOT do tree inside ... */
3380:       if (gs->num_pairs)
3381:         {gs_gop_pairwise_plus(gs,vals);}

3383:       /* tree */
3384:       if (gs->max_left_over)
3385:         {gs_gop_tree_plus(gs,vals);}
3386: 
3387:       gs_gop_local_out(gs,vals);
3388:     }
3389:   /* if intersection tree/pairwise and local is empty */
3390:   else
3391:     {
3392:       /* pairwise will NOT do tree inside */
3393:       if (gs->num_pairs)
3394:         {gs_gop_pairwise_plus(gs,vals);}
3395: 
3396:       /* tree */
3397:       if (gs->max_left_over)
3398:         {gs_gop_tree_plus(gs,vals);}
3399:     }

3401: #ifdef DEBUG
3402:   error_msg_warning("end gs_gop_plus()\n");
3403: #endif
3404: }



3408: /******************************************************************************
3409: Function: gather_scatter

3411: Input : 
3412: Output: 
3413: Return: 
3414: Description: 
3415: ******************************************************************************/
3416: static
3417: void
3418: gs_gop_local_plus(register gs_id *gs, register REAL *vals)
3419: {
3420:   register int *num, *map, **reduce;
3421:   register REAL tmp;


3424: #ifdef DEBUG
3425:   error_msg_warning("begin gs_gop_local_plus()\n");
3426: #endif

3428:   num    = gs->num_local_reduce;
3429:   reduce = gs->local_reduce;
3430:   while ((map = *reduce))
3431:     {
3432:       /* wall */
3433:       if (*num == 2)
3434:         {
3435:           num ++; reduce++;
3436:           vals[map[1]] = vals[map[0]] += vals[map[1]];
3437:         }
3438:       /* corner shared by three elements */
3439:       else if (*num == 3)
3440:         {
3441:           num ++; reduce++;
3442:           vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
3443:         }
3444:       /* corner shared by four elements */
3445:       else if (*num == 4)
3446:         {
3447:           num ++; reduce++;
3448:           vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] +=
3449:                                  (vals[map[1]] + vals[map[2]] + vals[map[3]]);
3450:         }
3451:       /* general case ... odd geoms ... 3D*/
3452:       else
3453:         {
3454:           num ++;
3455:            tmp = 0.0;
3456:           while (*map >= 0)
3457:             {tmp += *(vals + *map++);}

3459:           map = *reduce++;
3460:           while (*map >= 0)
3461:             {*(vals + *map++) = tmp;}
3462:         }
3463:     }
3464: #ifdef DEBUG
3465:   error_msg_warning("end gs_gop_local_plus()\n");
3466: #endif
3467: }



3471: /******************************************************************************
3472: Function: gather_scatter

3474: Input : 
3475: Output: 
3476: Return: 
3477: Description: 
3478: ******************************************************************************/
3479: static
3480: void
3481: gs_gop_local_in_plus(register gs_id *gs, register REAL *vals)
3482: {
3483:   register int *num, *map, **reduce;
3484:   register REAL *base;


3487: #ifdef DEBUG
3488:   error_msg_warning("begin gs_gop_local_in_plus()\n");
3489: #endif

3491:   num    = gs->num_gop_local_reduce;
3492:   reduce = gs->gop_local_reduce;
3493:   while ((map = *reduce++))
3494:     {
3495:       /* wall */
3496:       if (*num == 2)
3497:         {
3498:           num ++;
3499:           vals[map[0]] += vals[map[1]];
3500:         }
3501:       /* corner shared by three elements */
3502:       else if (*num == 3)
3503:         {
3504:           num ++;
3505:           vals[map[0]] += (vals[map[1]] + vals[map[2]]);
3506:         }
3507:       /* corner shared by four elements */
3508:       else if (*num == 4)
3509:         {
3510:           num ++;
3511:           vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
3512:         }
3513:       /* general case ... odd geoms ... 3D*/
3514:       else
3515:         {
3516:           num++;
3517:           base = vals + *map++;
3518:           while (*map >= 0)
3519:             {*base += *(vals + *map++);}
3520:         }
3521:     }
3522: #ifdef DEBUG
3523:   error_msg_warning("end gs_gop_local_in_plus()\n");
3524: #endif
3525: }



3529: /******************************************************************************
3530: Function: gather_scatter

3532: VERSION 3 :: 

3534: Input : 
3535: Output: 
3536: Return: 
3537: Description: 
3538: ******************************************************************************/
3539: static
3540: void
3541: gs_gop_pairwise_plus(register gs_id *gs, register REAL *in_vals)
3542: {
3543:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
3544:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
3545:   register int *pw, *list, *size, **nodes;
3546:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
3547:   MPI_Status status;


3550: #ifdef DEBUG
3551:   error_msg_warning("gs_gop_pairwise_plus() start\n");
3552: #endif

3554:   /* strip and load registers */
3555:   msg_list =list         = gs->pair_list;
3556:   msg_size =size         = gs->msg_sizes;
3557:   msg_nodes=nodes        = gs->node_list;
3558:   iptr=pw                = gs->pw_elm_list;
3559:   dptr1=dptr3            = gs->pw_vals;
3560:   msg_ids_in  = ids_in   = gs->msg_ids_in;
3561:   msg_ids_out = ids_out  = gs->msg_ids_out;
3562:   dptr2                  = gs->out;
3563:   in1=in2                = gs->in;

3565:   /* post the receives */
3566:   /*  msg_nodes=nodes; */
3567:   do
3568:     {
3569:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
3570:          second one *list and do list++ afterwards */
3571:       MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
3572:                 gs->gs_comm, msg_ids_in++);
3573:       in1 += *size++;
3574:     }
3575:   while (*++msg_nodes);
3576:   msg_nodes=nodes;

3578:   /* load gs values into in out gs buffers */
3579:   while (*iptr >= 0)
3580:     {*dptr3++ = *(in_vals + *iptr++);}

3582:   /* load out buffers and post the sends */
3583:   while ((iptr = *msg_nodes++))
3584:     {
3585:       dptr3 = dptr2;
3586:       while (*iptr >= 0)
3587:         {*dptr2++ = *(dptr1 + *iptr++);}
3588:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
3589:       /* is msg_ids_out++ correct? */
3590:       MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *msg_list++,
3591:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
3592:     }

3594:   /* do the tree while we're waiting */
3595:   if (gs->max_left_over)
3596:     {gs_gop_tree_plus(gs,in_vals);}

3598:   /* process the received data */
3599:   msg_nodes=nodes;
3600:   while ((iptr = *nodes++))
3601:     {
3602:       /* Should I check the return value of MPI_Wait() or status? */
3603:       /* Can this loop be replaced by a call to MPI_Waitall()? */
3604:       MPI_Wait(ids_in++, &status);
3605:       while (*iptr >= 0)
3606:         {*(dptr1 + *iptr++) += *in2++;}
3607:     }

3609:   /* replace vals */
3610:   while (*pw >= 0)
3611:     {*(in_vals + *pw++) = *dptr1++;}

3613:   /* clear isend message handles */
3614:   /* This changed for clarity though it could be the same */
3615:   while (*msg_nodes++)
3616:     /* Should I check the return value of MPI_Wait() or status? */
3617:     /* Can this loop be replaced by a call to MPI_Waitall()? */
3618:     {MPI_Wait(ids_out++, &status);}

3620: #ifdef DEBUG
3621:   error_msg_warning("gs_gop_pairwise_plus() end\n");
3622: #endif

3624: }



3628: /******************************************************************************
3629: Function: gather_scatter

3631: Input : 
3632: Output: 
3633: Return: 
3634: Description: 
3635: ******************************************************************************/
3636: static
3637: void
3638: gs_gop_tree_plus(gs_id *gs, REAL *vals)
3639: {
3640:   int size;
3641:   int *in, *out;
3642:   REAL *buf, *work;
3643:   /*int op[] = {GL_ADD,0}; */


3646: #ifdef DEBUG
3647:   error_msg_warning("start gs_gop_tree_plus()\n");
3648: #endif
3649: 
3650:   in   = gs->tree_map_in;
3651:   out  = gs->tree_map_out;
3652:   buf  = gs->tree_buf;
3653:   work = gs->tree_work;
3654:   size = gs->tree_nel;

3656: #if defined  BLAS||CBLAS
3657:   *work = 0.0;
3658:   copy(size,work,0,buf,1);
3659: #else
3660:   rvec_zero(buf,size);
3661: #endif

3663:   while (*in >= 0)
3664:     {*(buf + *out++) = *(vals + *in++);}

3666:   in   = gs->tree_map_in;
3667:   out  = gs->tree_map_out;
3668:   MPI_Allreduce(buf,work,size,REAL_TYPE,MPI_SUM,gs->gs_comm);
3669:   while (*in >= 0)
3670:     {*(vals + *in++) = *(work + *out++);}

3672: #ifdef DEBUG
3673:   error_msg_warning("end gs_gop_tree_plus()\n");
3674: #endif
3675: }



3679: /******************************************************************************
3680: Function: level_best_guess()

3682: Input : 
3683: Output: 
3684: Return: 
3685: Description: 
3686: ******************************************************************************/
3687: #if defined(not_used)
3688: static
3689: PetscErrorCode level_best_guess(void)
3690: {
3691:   /* full pairwise for now */
3692:   return(num_nodes);
3693: }
3694: #endif


3697: /******************************************************************************
3698: Function: gs_print_template()

3700: Input : 

3702: Output: 

3704: Return: 

3706: Description:  
3707: ******************************************************************************/
3708: #if defined(not_used)
3709: static
3710: void
3711: gs_print_template(register gs_id* gs, int who)
3712: {
3713:   register int j, k, *iptr, *iptr2;

3715: 
3716:   if ((my_id == who) && (num_gs_ids))
3717:     {
3718:       printf("\n\nP#%d's GS#%d template:\n", my_id, gs->id);
3719:       printf("id=%d\n",          gs->id);
3720:       printf("nel(unique)=%d\n", gs->nel);
3721:       printf("nel_max=%d\n",     gs->nel_max);
3722:       printf("nel_min=%d\n",     gs->nel_min);
3723:       printf("nel_sum=%d\n",     gs->nel_sum);
3724:       printf("negl=%d\n",        gs->negl);
3725:       printf("gl_max=%d\n",      gs->gl_max);
3726:       printf("gl_min=%d\n",      gs->gl_min);
3727:       printf("elms ordered=%d\n",gs->ordered);
3728:       printf("repeats=%d\n",     gs->repeats);
3729:       printf("positive=%d\n",    gs->positive);
3730:       printf("elms=%ld\n",        (PTRINT) gs->elms);
3731:       printf("elms(total)=%ld\n", (PTRINT) gs->local_elms);
3732:       printf("vals=%ld\n",        (PTRINT) gs->vals);
3733:       printf("gl_bss_min=%d\n",  gs->gl_bss_min);
3734:       printf("gl_perm_min=%d\n", gs->gl_perm_min);
3735:       printf("level=%d\n",       gs->level);
3736:       printf("proc_mask_sz=%d\n",gs->mask_sz);
3737:       printf("sh_proc_mask=%ld\n",(PTRINT) gs->nghs);
3738:       printf("ngh_buf_size=%d\n",gs->ngh_buf_sz);
3739:       printf("ngh_buf=%ld\n",     (PTRINT) gs->ngh_buf);
3740:       printf("num_nghs=%d\n",    gs->num_nghs);
3741:       printf("max_nghs=%d\n",    gs->max_nghs);

3743:       /* pairwise exchange information */
3744:       printf("\nPaiwise Info:\n");
3745:       printf("num_pairs=%d\n",   gs->num_pairs);
3746:       printf("max_pairs=%d\n",   gs->max_pairs);
3747:       printf("len_pw_list=%d\n", gs->len_pw_list);
3748:       printf("pair_list=%ld\n",   (PTRINT) gs->pair_list);
3749:       printf("msg_sizes=%ld\n",   (PTRINT) gs->msg_sizes);
3750:       printf("node_list=%ld\n",   (PTRINT) gs->node_list);
3751:       printf("pw_elm_list=%ld\n", (PTRINT) gs->pw_elm_list);

3753:       printf("pw_elm_list: ");
3754:       if ((iptr = gs->pw_elm_list))
3755:         {
3756:           for (j=0;j<gs->len_pw_list;j++)
3757:             {printf("%d ", *iptr); iptr++;}
3758:         }
3759:       printf("\n");

3761:       printf("processor_list: ");
3762:       if ((iptr = gs->pair_list))
3763:         {
3764:           for (j=0;j<gs->num_pairs;j++)
3765:             {printf("%d ", *iptr); iptr++;}
3766:         }
3767:       printf("\n");

3769:       printf("loc_node_pairs=%d\n",   gs->loc_node_pairs);
3770:       printf("max_node_pairs=%d\n",   gs->max_node_pairs);
3771:       printf("min_node_pairs=%d\n",   gs->min_node_pairs);
3772:       printf("avg_node_pairs=%d\n",   gs->avg_node_pairs);

3774:       printf("size_list: ");
3775:       if ((iptr = gs->msg_sizes))
3776:         {
3777:           for (j=0;j<gs->num_pairs;j++)
3778:             {printf("%d ", *iptr); iptr++;}
3779:         }
3780:       printf("\n");
3781:       if ((iptr = gs->pair_list))
3782:         {
3783:           for (j=0;j<gs->num_pairs;j++)
3784:             {
3785:               printf("node_list %d: ", *iptr);
3786:               if ((iptr2 = (gs->node_list)[j]))
3787:                 {
3788:                   for (k=0;k<(gs->msg_sizes)[j];k++)
3789:                     {printf("%d ", *iptr2); iptr2++;}
3790:                 }
3791:               iptr++;
3792:               printf("\n");
3793:             }
3794:         }
3795:       printf("\n");
3796: 
3797:       printf("elm_list(U): ");
3798:       if ((iptr = gs->elms))
3799:         {
3800:           for (j=0;j<gs->nel;j++)
3801:             {printf("%d ", *iptr); iptr++;}
3802:         }
3803:       printf("\n");
3804:       printf("\n");
3805: 
3806:       printf("elm_list(T): ");
3807:       if ((iptr = gs->local_elms))
3808:         {
3809:           for (j=0;j<gs->nel_total;j++)
3810:             {printf("%d ", *iptr); iptr++;}
3811:         }
3812:       printf("\n");
3813:       printf("\n");
3814: 
3815:       printf("map_list(T): ");
3816:       if ((iptr = gs->companion))
3817:         {
3818:           for (j=0;j<gs->nel;j++)
3819:             {printf("%d ", *iptr); iptr++;}
3820:         }
3821:       printf("\n");
3822:       printf("\n");
3823: 

3825:       /* local exchange information */
3826:       printf("\nLocal Info:\n");
3827:       printf("local_strength=%d\n",   gs->local_strength);
3828:       printf("num_local_total=%d\n",  gs->num_local_total);
3829:       printf("num_local=%d\n",        gs->num_local);
3830:       printf("num_local_gop=%d\n",    gs->num_local_gop);
3831:       printf("num_local_reduce=%ld\n", (PTRINT) gs->num_local_reduce);
3832:       printf("local_reduce=%ld\n",     (PTRINT) gs->local_reduce);
3833:       printf("num_gop_local_reduce=%ld\n", (PTRINT) gs->num_gop_local_reduce);
3834:       printf("gop_local_reduce=%ld\n",     (PTRINT) gs->gop_local_reduce);
3835:       printf("\n");

3837:       for (j=0;j<gs->num_local;j++)
3838:         {
3839:           printf("local reduce_list %d: ", j);
3840:           if ((iptr2 = (gs->local_reduce)[j]))
3841:             {
3842:               if ((gs->num_local_reduce)[j] <= 0)
3843:                 {printf("oops");}
3844: 
3845:               for (k=0;k<(gs->num_local_reduce)[j];k++)
3846:                 {printf("%d ", *iptr2); iptr2++;}
3847:             }
3848:           printf("\n");
3849:         }
3850: 
3851:       printf("\n");
3852:       printf("\n");
3853: 
3854:       for (j=0;j<gs->num_local_gop;j++)
3855:         {
3856:           printf("gop reduce_list %d: ", j);
3857:           iptr2 = (gs->gop_local_reduce)[j];
3858: 
3859:           if ((gs->num_gop_local_reduce)[j] <= 0)
3860:             {printf("oops");}
3861: 

3863:           for (k=0;k<(gs->num_gop_local_reduce)[j];k++)
3864:             {printf("%d ", *iptr2); iptr2++;}
3865:           printf("\n");
3866:         }
3867:       printf("\n");
3868:       printf("\n");

3870:       /* crystal router information */
3871:       printf("\n\n");
3872:       printf("Tree Info:\n");
3873:       printf("max_left_over=%d\n",   gs->max_left_over);
3874:       printf("num_in_list=%ld\n",    (PTRINT) gs->in_num);
3875:       printf("in_list=%ld\n",        (PTRINT) gs->in_list);
3876:       printf("num_out_list=%ld\n",   (PTRINT) gs->out_num);
3877:       printf("out_list=%ld\n",       (PTRINT) gs->out_list);

3879:       printf("\n\n");
3880:     }
3881:   fflush(stdout);
3882: }
3883: #endif



3887: /******************************************************************************
3888: Function: gs_free()

3890: Input : 

3892: Output: 

3894: Return: 

3896: Description:  
3897:   if (gs->sss) {perm_free((void*) gs->sss);}
3898: ******************************************************************************/
3899: void
3900: gs_free(register gs_id *gs)
3901: {
3902:   register int i;


3905: #ifdef DEBUG  
3906:   error_msg_warning("start gs_gop_xxx()\n");
3907:   if (!gs) {error_msg_warning("NULL ptr passed to gs_free()"); return;}
3908: #endif

3910:   if (gs->nghs) {perm_free((void*) gs->nghs);}
3911:   if (gs->pw_nghs) {perm_free((void*) gs->pw_nghs);}

3913:   /* tree */
3914:   if (gs->max_left_over)
3915:     {
3916:       if (gs->tree_elms) {bss_free((void*) gs->tree_elms);}
3917:       if (gs->tree_buf) {bss_free((void*) gs->tree_buf);}
3918:       if (gs->tree_work) {bss_free((void*) gs->tree_work);}
3919:       if (gs->tree_map_in) {bss_free((void*) gs->tree_map_in);}
3920:       if (gs->tree_map_out) {bss_free((void*) gs->tree_map_out);}
3921:     }

3923:   /* pairwise info */
3924:   if (gs->num_pairs)
3925:     {
3926:       /* should be NULL already */
3927:       if (gs->ngh_buf) {bss_free((void*) gs->ngh_buf);}
3928:       if (gs->elms) {bss_free((void*) gs->elms);}
3929:       if (gs->local_elms) {bss_free((void*) gs->local_elms);}
3930:       if (gs->companion) {bss_free((void*) gs->companion);}
3931: 
3932:       /* only set if pairwise */
3933:       if (gs->vals) {perm_free((void*) gs->vals);}
3934:       if (gs->in) {perm_free((void*) gs->in);}
3935:       if (gs->out) {perm_free((void*) gs->out);}
3936:       if (gs->msg_ids_in) {perm_free((void*) gs->msg_ids_in);}
3937:       if (gs->msg_ids_out) {perm_free((void*) gs->msg_ids_out);}
3938:       if (gs->pw_vals) {perm_free((void*) gs->pw_vals);}
3939:       if (gs->pw_elm_list) {perm_free((void*) gs->pw_elm_list);}
3940:       if (gs->node_list)
3941:         {
3942:           for (i=0;i<gs->num_pairs;i++)
3943:             {if (gs->node_list[i]) {perm_free((void*) gs->node_list[i]);}}
3944:           perm_free((void*) gs->node_list);
3945:         }
3946:       if (gs->msg_sizes) {perm_free((void*) gs->msg_sizes);}
3947:       if (gs->pair_list) {perm_free((void*) gs->pair_list);}
3948:     }

3950:   /* local info */
3951:   if (gs->num_local_total>=0)
3952:     {
3953:       for (i=0;i<gs->num_local_total+1;i++)
3954:         /*      for (i=0;i<gs->num_local_total;i++) */
3955:         {
3956:           if (gs->num_gop_local_reduce[i])
3957:             {perm_free((void*) gs->gop_local_reduce[i]);}
3958:         }
3959:     }

3961:   /* if intersection tree/pairwise and local isn't empty */
3962:   if (gs->gop_local_reduce) {perm_free((void*) gs->gop_local_reduce);}
3963:   if (gs->num_gop_local_reduce) {perm_free((void*) gs->num_gop_local_reduce);}

3965:   perm_free((void*) gs);
3966: }






3973: /******************************************************************************
3974: Function: gather_scatter

3976: Input : 
3977: Output: 
3978: Return: 
3979: Description: 
3980: ******************************************************************************/
3981: void
3982: gs_gop_vec(register gs_id *gs, register REAL *vals, register const char *op, register int step)
3983: {
3984: #ifdef DEBUG
3985:   error_msg_warning("gs_gop_vec() start");
3986:   if (!gs) {error_msg_fatal("gs_gop_vec() :: passed NULL gs handle!!!");}
3987:   if (!op) {error_msg_fatal("gs_gop_vec() :: passed NULL operation!!!");}

3989:   /* check top make sure that segments being requested aren't larger */
3990:   /* then what I reserved earlier ... fix is to allow user to reset  */
3991:   if (step>gs->vec_sz)
3992:     {error_msg_fatal("gs_gop_vec() :: %d > %d!\n",step,gs->vec_sz);}
3993: #endif

3995:   switch (*op) {
3996:   case '+':
3997:     gs_gop_vec_plus(gs,vals,step);
3998:     break;
3999: #ifdef NOT_YET    
4000:   case '*':
4001:     gs_gop_times(gs,vals);
4002:     break;
4003:   case 'a':
4004:     gs_gop_min_abs(gs,vals);
4005:     break;
4006:   case 'A':
4007:     gs_gop_max_abs(gs,vals);
4008:     break;
4009:   case 'e':
4010:     gs_gop_exists(gs,vals);
4011:     break;
4012:   case 'm':
4013:     gs_gop_min(gs,vals);
4014:     break;
4015:   case 'M':
4016:     gs_gop_max(gs,vals); break;
4017:     /*
4018:     if (*(op+1)=='\0')
4019:       {gs_gop_max(gs,vals); break;}
4020:     else if (*(op+1)=='X')
4021:       {gs_gop_max_abs(gs,vals); break;}
4022:     else if (*(op+1)=='N')
4023:       {gs_gop_min_abs(gs,vals); break;}
4024:     */
4025: #endif
4026:   default:
4027:     error_msg_warning("gs_gop_vec() :: %c is not a valid op",op[0]);
4028:     error_msg_warning("gs_gop_vec() :: default :: plus");
4029:     gs_gop_vec_plus(gs,vals,step);
4030:     break;
4031:   }
4032: #ifdef DEBUG
4033:   error_msg_warning("gs_gop_vec() end");
4034: #endif
4035: }



4039: /******************************************************************************
4040: Function: gather_scatter

4042: Input : 
4043: Output: 
4044: Return: 
4045: Description: 
4046: ******************************************************************************/
4047: static void
4048: gs_gop_vec_plus(register gs_id *gs, register REAL *vals, register int step)
4049: {
4050: #ifdef DEBUG
4051:   error_msg_warning("gs_gop_vec_plus() start");
4052: #endif

4054:   if (!gs) {error_msg_fatal("gs_gop_vec() passed NULL gs handle!!!");}

4056:   /* local only operations!!! */
4057:   if (gs->num_local)
4058:     {gs_gop_vec_local_plus(gs,vals,step);}

4060:   /* if intersection tree/pairwise and local isn't empty */
4061:   if (gs->num_local_gop)
4062:     {
4063:       gs_gop_vec_local_in_plus(gs,vals,step);

4065:       /* pairwise */
4066:       if (gs->num_pairs)
4067:         {gs_gop_vec_pairwise_plus(gs,vals,step);}

4069:       /* tree */
4070:       else if (gs->max_left_over)
4071:         {gs_gop_vec_tree_plus(gs,vals,step);}

4073:       gs_gop_vec_local_out(gs,vals,step);
4074:     }
4075:   /* if intersection tree/pairwise and local is empty */
4076:   else
4077:     {
4078:       /* pairwise */
4079:       if (gs->num_pairs)
4080:         {gs_gop_vec_pairwise_plus(gs,vals,step);}

4082:       /* tree */
4083:       else if (gs->max_left_over)
4084:         {gs_gop_vec_tree_plus(gs,vals,step);}
4085:     }
4086: #ifdef DEBUG
4087:   error_msg_warning("gs_gop_vec_plus() end");
4088: #endif
4089: }



4093: /******************************************************************************
4094: Function: gather_scatter

4096: Input : 
4097: Output: 
4098: Return: 
4099: Description: 
4100: ******************************************************************************/
4101: static
4102: void
4103: gs_gop_vec_local_plus(register gs_id *gs, register REAL *vals, 
4104:                       register int step)
4105: {
4106:   register int *num, *map, **reduce;
4107:   register REAL *base;


4110: #ifdef DEBUG
4111:   error_msg_warning("gs_gop_vec_local_plus() start");
4112: #endif

4114:   num    = gs->num_local_reduce;
4115:   reduce = gs->local_reduce;
4116:   while ((map = *reduce))
4117:     {
4118:       base = vals + map[0] * step;

4120:       /* wall */
4121:       if (*num == 2)
4122:         {
4123:           num++; reduce++;
4124:           rvec_add (base,vals+map[1]*step,step);
4125:           rvec_copy(vals+map[1]*step,base,step);
4126:         }
4127:       /* corner shared by three elements */
4128:       else if (*num == 3)
4129:         {
4130:           num++; reduce++;
4131:           rvec_add (base,vals+map[1]*step,step);
4132:           rvec_add (base,vals+map[2]*step,step);
4133:           rvec_copy(vals+map[2]*step,base,step);
4134:           rvec_copy(vals+map[1]*step,base,step);
4135:         }
4136:       /* corner shared by four elements */
4137:       else if (*num == 4)
4138:         {
4139:           num++; reduce++;
4140:           rvec_add (base,vals+map[1]*step,step);
4141:           rvec_add (base,vals+map[2]*step,step);
4142:           rvec_add (base,vals+map[3]*step,step);
4143:           rvec_copy(vals+map[3]*step,base,step);
4144:           rvec_copy(vals+map[2]*step,base,step);
4145:           rvec_copy(vals+map[1]*step,base,step);
4146:         }
4147:       /* general case ... odd geoms ... 3D */
4148:       else
4149:         {
4150:           num++;
4151:           while (*++map >= 0)
4152:             {rvec_add (base,vals+*map*step,step);}
4153: 
4154:           map = *reduce;
4155:           while (*++map >= 0)
4156:             {rvec_copy(vals+*map*step,base,step);}
4157: 
4158:           reduce++;
4159:         }
4160:     }
4161: #ifdef DEBUG
4162:   error_msg_warning("gs_gop_vec_local_plus() end");
4163: #endif
4164: }



4168: /******************************************************************************
4169: Function: gather_scatter

4171: Input : 
4172: Output: 
4173: Return: 
4174: Description: 
4175: ******************************************************************************/
4176: static
4177: void
4178: gs_gop_vec_local_in_plus(register gs_id *gs, register REAL *vals, 
4179:                          register int step)
4180: {
4181:   register int  *num, *map, **reduce;
4182:   register REAL *base;


4185: #ifdef DEBUG
4186:   error_msg_warning("gs_gop_vec_locel_in_plus() start");
4187: #endif

4189:   num    = gs->num_gop_local_reduce;
4190:   reduce = gs->gop_local_reduce;
4191:   while ((map = *reduce++))
4192:     {
4193:       base = vals + map[0] * step;

4195:       /* wall */
4196:       if (*num == 2)
4197:         {
4198:           num ++;
4199:           rvec_add(base,vals+map[1]*step,step);
4200:         }
4201:       /* corner shared by three elements */
4202:       else if (*num == 3)
4203:         {
4204:           num ++;
4205:           rvec_add(base,vals+map[1]*step,step);
4206:           rvec_add(base,vals+map[2]*step,step);
4207:         }
4208:       /* corner shared by four elements */
4209:       else if (*num == 4)
4210:         {
4211:           num ++;
4212:           rvec_add(base,vals+map[1]*step,step);
4213:           rvec_add(base,vals+map[2]*step,step);
4214:           rvec_add(base,vals+map[3]*step,step);
4215:         }
4216:       /* general case ... odd geoms ... 3D*/
4217:       else
4218:         {
4219:           num++;
4220:           while (*++map >= 0)
4221:             {rvec_add(base,vals+*map*step,step);}
4222:         }
4223:     }
4224: #ifdef DEBUG
4225:   error_msg_warning("gs_gop_vec_local_in_plus() end");
4226: #endif
4227: }


4230: /******************************************************************************
4231: Function: gather_scatter

4233: Input : 
4234: Output: 
4235: Return: 
4236: Description: 
4237: ******************************************************************************/
4238: static
4239: void
4240: gs_gop_vec_local_out(register gs_id *gs, register REAL *vals, 
4241:                      register int step)
4242: {
4243:   register int *num, *map, **reduce;
4244:   register REAL *base;


4247: #ifdef DEBUG
4248:   error_msg_warning("gs_gop_vec_local_out() start");
4249: #endif

4251:   num    = gs->num_gop_local_reduce;
4252:   reduce = gs->gop_local_reduce;
4253:   while ((map = *reduce++))
4254:     {
4255:       base = vals + map[0] * step;

4257:       /* wall */
4258:       if (*num == 2)
4259:         {
4260:           num ++;
4261:           rvec_copy(vals+map[1]*step,base,step);
4262:         }
4263:       /* corner shared by three elements */
4264:       else if (*num == 3)
4265:         {
4266:           num ++;
4267:           rvec_copy(vals+map[1]*step,base,step);
4268:           rvec_copy(vals+map[2]*step,base,step);
4269:         }
4270:       /* corner shared by four elements */
4271:       else if (*num == 4)
4272:         {
4273:           num ++;
4274:           rvec_copy(vals+map[1]*step,base,step);
4275:           rvec_copy(vals+map[2]*step,base,step);
4276:           rvec_copy(vals+map[3]*step,base,step);
4277:         }
4278:       /* general case ... odd geoms ... 3D*/
4279:       else
4280:         {
4281:           num++;
4282:           while (*++map >= 0)
4283:             {rvec_copy(vals+*map*step,base,step);}
4284:         }
4285:     }
4286: #ifdef DEBUG
4287:   error_msg_warning("gs_gop_vec_local_out() end");
4288: #endif
4289: }



4293: /******************************************************************************
4294: Function: gather_scatter

4296: VERSION 3 :: 

4298: Input : 
4299: Output: 
4300: Return: 
4301: Description: 
4302: ******************************************************************************/
4303: static
4304: void
4305: gs_gop_vec_pairwise_plus(register gs_id *gs, register REAL *in_vals,
4306:                          register int step)
4307: {
4308:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
4309:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
4310:   register int *pw, *list, *size, **nodes;
4311:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
4312:   MPI_Status status;


4315: #ifdef DEBUG
4316:   error_msg_warning("gs_gop_vec_pairwise_plus() start");
4317: #endif

4319:   /* strip and load registers */
4320:   msg_list =list         = gs->pair_list;
4321:   msg_size =size         = gs->msg_sizes;
4322:   msg_nodes=nodes        = gs->node_list;
4323:   iptr=pw                = gs->pw_elm_list;
4324:   dptr1=dptr3            = gs->pw_vals;
4325:   msg_ids_in  = ids_in   = gs->msg_ids_in;
4326:   msg_ids_out = ids_out  = gs->msg_ids_out;
4327:   dptr2                  = gs->out;
4328:   in1=in2                = gs->in;

4330:   /* post the receives */
4331:   /*  msg_nodes=nodes; */
4332:   do
4333:     {
4334:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
4335:          second one *list and do list++ afterwards */
4336:       MPI_Irecv(in1, *size *step, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
4337:                 gs->gs_comm, msg_ids_in++);
4338:       in1 += *size++ *step;
4339:     }
4340:   while (*++msg_nodes);
4341:   msg_nodes=nodes;

4343:   /* load gs values into in out gs buffers */
4344:   while (*iptr >= 0)
4345:     {
4346:       rvec_copy(dptr3,in_vals + *iptr*step,step);
4347:       dptr3+=step;
4348:       iptr++;
4349:     }

4351:   /* load out buffers and post the sends */
4352:   while ((iptr = *msg_nodes++))
4353:     {
4354:       dptr3 = dptr2;
4355:       while (*iptr >= 0)
4356:         {
4357:           rvec_copy(dptr2,dptr1 + *iptr*step,step);
4358:           dptr2+=step;
4359:           iptr++;
4360:         }
4361:       MPI_Isend(dptr3, *msg_size++ *step, REAL_TYPE, *msg_list++,
4362:                 MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
4363:     }

4365:   /* tree */
4366:   if (gs->max_left_over)
4367:     {gs_gop_vec_tree_plus(gs,in_vals,step);}

4369:   /* process the received data */
4370:   msg_nodes=nodes;
4371:   while ((iptr = *nodes++))
4372:     {
4373:       /* Should I check the return value of MPI_Wait() or status? */
4374:       /* Can this loop be replaced by a call to MPI_Waitall()? */
4375:       MPI_Wait(ids_in++, &status);
4376:       while (*iptr >= 0)
4377:         {
4378: #if defined BLAS||CBLAS
4379:           axpy(step,1.0,in2,1,dptr1 + *iptr*step,1);
4380: #else
4381:           rvec_add(dptr1 + *iptr*step,in2,step);
4382: #endif
4383:           in2+=step;
4384:           iptr++;
4385:         }
4386:     }

4388:   /* replace vals */
4389:   while (*pw >= 0)
4390:     {
4391:       rvec_copy(in_vals + *pw*step,dptr1,step);
4392:       dptr1+=step;
4393:       pw++;
4394:     }

4396:   /* clear isend message handles */
4397:   /* This changed for clarity though it could be the same */
4398:   while (*msg_nodes++)
4399:     /* Should I check the return value of MPI_Wait() or status? */
4400:     /* Can this loop be replaced by a call to MPI_Waitall()? */
4401:     {MPI_Wait(ids_out++, &status);}

4403: #ifdef DEBUG
4404:   error_msg_warning("gs_gop_vec_pairwise_plus() end");
4405: #endif

4407: }



4411: /******************************************************************************
4412: Function: gather_scatter

4414: Input : 
4415: Output: 
4416: Return: 
4417: Description: 
4418: ******************************************************************************/
4419: static
4420: void
4421: gs_gop_vec_tree_plus(register gs_id *gs, register REAL *vals, register int step)
4422: {
4423:   register int size, *in, *out;
4424:   register REAL *buf, *work;
4425:   int op[] = {GL_ADD,0};

4427: #ifdef DEBUG
4428:   error_msg_warning("start gs_gop_vec_tree_plus()");
4429: #endif

4431:   /* copy over to local variables */
4432:   in   = gs->tree_map_in;
4433:   out  = gs->tree_map_out;
4434:   buf  = gs->tree_buf;
4435:   work = gs->tree_work;
4436:   size = gs->tree_nel*step;

4438:   /* zero out collection buffer */
4439: #if defined  BLAS||CBLAS
4440:   *work = 0.0;
4441:   copy(size,work,0,buf,1);
4442: #else
4443:   rvec_zero(buf,size);
4444: #endif


4447:   /* copy over my contributions */
4448:   while (*in >= 0)
4449:     {
4450: #if defined  BLAS||CBLAS
4451:       copy(step,vals + *in++*step,1,buf + *out++*step,1);
4452: #else
4453:       rvec_copy(buf + *out++*step,vals + *in++*step,step);
4454: #endif
4455:     }

4457:   /* perform fan in/out on full buffer */
4458:   /* must change grop to handle the blas */
4459:   grop(buf,work,size,op);

4461:   /* reset */
4462:   in   = gs->tree_map_in;
4463:   out  = gs->tree_map_out;

4465:   /* get the portion of the results I need */
4466:   while (*in >= 0)
4467:     {
4468: #if defined  BLAS||CBLAS
4469:       copy(step,buf + *out++*step,1,vals + *in++*step,1);
4470: #else
4471:       rvec_copy(vals + *in++*step,buf + *out++*step,step);
4472: #endif
4473:     }

4475: #ifdef DEBUG
4476:   error_msg_warning("start gs_gop_vec_tree_plus()");
4477: #endif
4478: }



4482: /******************************************************************************
4483: Function: gather_scatter

4485: Input : 
4486: Output: 
4487: Return: 
4488: Description: 
4489: ******************************************************************************/
4490: void
4491: gs_gop_hc(register gs_id *gs, register REAL *vals, register const char *op, register int dim)
4492: {
4493: #ifdef DEBUG
4494:   error_msg_warning("gs_gop_hc() start\n");
4495:   if (!gs) {error_msg_fatal("gs_gop_vec() :: passed NULL gs handle!!!\n");}
4496:   if (!op) {error_msg_fatal("gs_gop_vec() :: passed NULL operation!!!\n");}
4497: #endif

4499:   switch (*op) {
4500:   case '+':
4501:     gs_gop_plus_hc(gs,vals,dim);
4502:     break;
4503: #ifdef NOT_YET    
4504:   case '*':
4505:     gs_gop_times(gs,vals);
4506:     break;
4507:   case 'a':
4508:     gs_gop_min_abs(gs,vals);
4509:     break;
4510:   case 'A':
4511:     gs_gop_max_abs(gs,vals);
4512:     break;
4513:   case 'e':
4514:     gs_gop_exists(gs,vals);
4515:     break;
4516:   case 'm':
4517:     gs_gop_min(gs,vals);
4518:     break;
4519:   case 'M':
4520:     gs_gop_max(gs,vals); break;
4521:     /*
4522:     if (*(op+1)=='\0')
4523:       {gs_gop_max(gs,vals); break;}
4524:     else if (*(op+1)=='X')
4525:       {gs_gop_max_abs(gs,vals); break;}
4526:     else if (*(op+1)=='N')
4527:       {gs_gop_min_abs(gs,vals); break;}
4528:     */
4529: #endif
4530:   default:
4531:     error_msg_warning("gs_gop_hc() :: %c is not a valid op",op[0]);
4532:     error_msg_warning("gs_gop_hc() :: default :: plus\n");
4533:     gs_gop_plus_hc(gs,vals,dim);
4534:     break;
4535:   }
4536: #ifdef DEBUG
4537:   error_msg_warning("gs_gop_hc() end\n");
4538: #endif
4539: }



4543: /******************************************************************************
4544: Function: gather_scatter

4546: Input : 
4547: Output: 
4548: Return: 
4549: Description: 
4550: ******************************************************************************/
4551: static void
4552: gs_gop_plus_hc(register gs_id *gs, register REAL *vals, int dim)
4553: {
4554: #ifdef DEBUG
4555:   error_msg_warning("start gs_gop_hc()\n");
4556:   if (!gs) {error_msg_fatal("gs_gop_hc() passed NULL gs handle!!!\n");}
4557: #endif

4559:   /* if there's nothing to do return */
4560:   if (dim<=0)
4561:     {return;}

4563:   /* can't do more dimensions then exist */
4564:   dim = PetscMin(dim,i_log2_num_nodes);

4566:   /* local only operations!!! */
4567:   if (gs->num_local)
4568:     {gs_gop_local_plus(gs,vals);}

4570:   /* if intersection tree/pairwise and local isn't empty */
4571:   if (gs->num_local_gop)
4572:     {
4573:       gs_gop_local_in_plus(gs,vals);

4575:       /* pairwise will do tree inside ... */
4576:       if (gs->num_pairs)
4577:         {gs_gop_pairwise_plus_hc(gs,vals,dim);}

4579:       /* tree only */
4580:       else if (gs->max_left_over)
4581:         {gs_gop_tree_plus_hc(gs,vals,dim);}
4582: 
4583:       gs_gop_local_out(gs,vals);
4584:     }
4585:   /* if intersection tree/pairwise and local is empty */
4586:   else
4587:     {
4588:       /* pairwise will do tree inside */
4589:       if (gs->num_pairs)
4590:         {gs_gop_pairwise_plus_hc(gs,vals,dim);}
4591: 
4592:       /* tree */
4593:       else if (gs->max_left_over)
4594:         {gs_gop_tree_plus_hc(gs,vals,dim);}
4595:     }

4597: #ifdef DEBUG
4598:   error_msg_warning("end gs_gop_hc()\n");
4599: #endif
4600: }


4603: /******************************************************************************
4604: VERSION 3 :: 

4606: Input : 
4607: Output: 
4608: Return: 
4609: Description: 
4610: ******************************************************************************/
4611: static
4612: void
4613: gs_gop_pairwise_plus_hc(register gs_id *gs, register REAL *in_vals, int dim)
4614: {
4615:   register REAL *dptr1, *dptr2, *dptr3, *in1, *in2;
4616:   register int *iptr, *msg_list, *msg_size, **msg_nodes;
4617:   register int *pw, *list, *size, **nodes;
4618:   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
4619:   MPI_Status status;
4620:   int i, mask=1;

4622:   for (i=1; i<dim; i++)
4623:     {mask<<=1; mask++;}


4626: #ifdef DEBUG
4627:   error_msg_warning("gs_gop_pairwise_hc() start\n");
4628: #endif

4630:   /* strip and load registers */
4631:   msg_list =list         = gs->pair_list;
4632:   msg_size =size         = gs->msg_sizes;
4633:   msg_nodes=nodes        = gs->node_list;
4634:   iptr=pw                = gs->pw_elm_list;
4635:   dptr1=dptr3            = gs->pw_vals;
4636:   msg_ids_in  = ids_in   = gs->msg_ids_in;
4637:   msg_ids_out = ids_out  = gs->msg_ids_out;
4638:   dptr2                  = gs->out;
4639:   in1=in2                = gs->in;

4641:   /* post the receives */
4642:   /*  msg_nodes=nodes; */
4643:   do
4644:     {
4645:       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
4646:          second one *list and do list++ afterwards */
4647:       if ((my_id|mask)==(*list|mask))
4648:         {
4649:           MPI_Irecv(in1, *size, REAL_TYPE, MPI_ANY_SOURCE, MSGTAG1 + *list++,
4650:                     gs->gs_comm, msg_ids_in++);
4651:           in1 += *size++;
4652:         }
4653:       else
4654:         {list++; size++;}
4655:     }
4656:   while (*++msg_nodes);

4658:   /* load gs values into in out gs buffers */
4659:   while (*iptr >= 0)
4660:     {*dptr3++ = *(in_vals + *iptr++);}

4662:   /* load out buffers and post the sends */
4663:   msg_nodes=nodes;
4664:   list = msg_list;
4665:   while ((iptr = *msg_nodes++))
4666:     {
4667:       if ((my_id|mask)==(*list|mask))
4668:         {
4669:           dptr3 = dptr2;
4670:           while (*iptr >= 0)
4671:             {*dptr2++ = *(dptr1 + *iptr++);}
4672:           /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
4673:           /* is msg_ids_out++ correct? */
4674:           MPI_Isend(dptr3, *msg_size++, REAL_TYPE, *list++,
4675:                     MSGTAG1+my_id, gs->gs_comm, msg_ids_out++);
4676:         }
4677:       else
4678:         {list++; msg_size++;}
4679:     }

4681:   /* do the tree while we're waiting */
4682:   if (gs->max_left_over)
4683:     {gs_gop_tree_plus_hc(gs,in_vals,dim);}

4685:   /* process the received data */
4686:   msg_nodes=nodes;
4687:   list = msg_list;
4688:   while ((iptr = *nodes++))
4689:     {
4690:       if ((my_id|mask)==(*list|mask))
4691:         {
4692:           /* Should I check the return value of MPI_Wait() or status? */
4693:           /* Can this loop be replaced by a call to MPI_Waitall()? */
4694:           MPI_Wait(ids_in++, &status);
4695:           while (*iptr >= 0)
4696:             {*(dptr1 + *iptr++) += *in2++;}
4697:         }
4698:       list++;
4699:     }

4701:   /* replace vals */
4702:   while (*pw >= 0)
4703:     {*(in_vals + *pw++) = *dptr1++;}

4705:   /* clear isend message handles */
4706:   /* This changed for clarity though it could be the same */
4707:   while (*msg_nodes++)
4708:     {
4709:       if ((my_id|mask)==(*msg_list|mask))
4710:         {
4711:           /* Should I check the return value of MPI_Wait() or status? */
4712:           /* Can this loop be replaced by a call to MPI_Waitall()? */
4713:           MPI_Wait(ids_out++, &status);
4714:         }
4715:       msg_list++;
4716:     }

4718: #ifdef DEBUG
4719:   error_msg_warning("gs_gop_pairwise_hc() end\n");
4720: #endif

4722: }



4726: /******************************************************************************
4727: Function: gather_scatter

4729: Input : 
4730: Output: 
4731: Return: 
4732: Description: 
4733: ******************************************************************************/
4734: static
4735: void
4736: gs_gop_tree_plus_hc(gs_id *gs, REAL *vals, int dim)
4737: {
4738:   int size;
4739:   int *in, *out;
4740:   REAL *buf, *work;
4741:   int op[] = {GL_ADD,0};

4743: #ifdef DEBUG
4744:   error_msg_warning("start gs_gop_tree_plus_hc()\n");
4745: #endif
4746: 
4747:   in   = gs->tree_map_in;
4748:   out  = gs->tree_map_out;
4749:   buf  = gs->tree_buf;
4750:   work = gs->tree_work;
4751:   size = gs->tree_nel;

4753: #if defined  BLAS||CBLAS
4754:   *work = 0.0;
4755:   copy(size,work,0,buf,1);
4756: #else
4757:   rvec_zero(buf,size);
4758: #endif

4760:   while (*in >= 0)
4761:     {*(buf + *out++) = *(vals + *in++);}

4763:   in   = gs->tree_map_in;
4764:   out  = gs->tree_map_out;

4766:   grop_hc(buf,work,size,op,dim);

4768:   while (*in >= 0)
4769:     {*(vals + *in++) = *(buf + *out++);}

4771: #ifdef DEBUG
4772:   error_msg_warning("end gs_gop_tree_plus_hc()\n");
4773: #endif
4774: }