00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef COMPONENTS_TRIG_CPU_ALLREDUCE_RECDBL_TRIGGERED_H
00014 #define COMPONENTS_TRIG_CPU_ALLREDUCE_RECDBL_TRIGGERED_H
00015
00016 #include "sst/elements/portals4_sm/trig_cpu/application.h"
00017 #include "sst/elements/portals4_sm/trig_cpu/trig_cpu.h"
00018 #include "sst/elements/portals4_sm/trig_cpu/portals.h"
00019
00020 class allreduce_recdbl_triggered : public application {
00021 public:
00022 allreduce_recdbl_triggered(trig_cpu *cpu) : application(cpu), init(false), algo_count(0)
00023 {
00024 ptl = cpu->getPortalsHandle();
00025
00026 in_buf = 1;
00027 out_buf = 0;
00028 zero_buf = 0;
00029 }
00030
00031 bool
00032 operator()(Event *ev)
00033 {
00034 ptl_md_t md;
00035 ptl_me_t me;
00036
00037 crBegin();
00038
00039 if (!init) {
00040 my_levels = -1;
00041 for (adj = 0x1; adj <= num_nodes ; adj <<= 1) { my_levels++; } adj = adj >> 1;
00042 if (adj != num_nodes) {
00043 printf("recursive_doubling requires power of 2 nodes (%d)\n",
00044 num_nodes);
00045 exit(1);
00046 }
00047
00048 my_level_steps.resize(my_levels);
00049 my_level_ct_hs.resize(my_levels);
00050 my_level_me_hs.resize(my_levels);
00051 my_level_md_hs.resize(my_levels);
00052
00053 for (i = 0 ; i < my_levels ; ++i) {
00054 my_level_steps[i] = 0;
00055 ptl->PtlCTAlloc(PTL_CT_OPERATION, my_level_ct_hs[i]);
00056
00057 me.start = &my_level_steps[i];
00058 me.length = 8;
00059 me.match_bits = i;
00060 me.ignore_bits = 0;
00061 me.ct_handle = my_level_ct_hs[i];
00062 ptl->PtlMEAppend(0, me, PTL_PRIORITY_LIST, NULL,
00063 my_level_me_hs[i]);
00064
00065 md.start = &my_level_steps[i];
00066 me.length = 8;
00067 md.eq_handle = PTL_EQ_NONE;
00068 md.ct_handle = PTL_CT_NONE;
00069 ptl->PtlMDBind(md, &my_level_md_hs[i]);
00070 }
00071
00072 md.start = &zero_buf;
00073 md.length = 8;
00074 md.eq_handle = PTL_EQ_NONE;
00075 md.ct_handle = PTL_CT_NONE;
00076 ptl->PtlMDBind(md, &zero_md_h);
00077
00078 init = true;
00079 crReturn();
00080 start_noise_section();
00081 }
00082
00083 crReturn();
00084
00085 start_time = cpu->getCurrentSimTimeNano();
00086 cpu->addBusyTime("200ns");
00087 crReturn();
00088
00089 out_buf = in_buf;
00090
00091
00092
00093
00094 ptl->PtlCTAlloc(PTL_CT_OPERATION, user_ct_h);
00095 me.start = &out_buf;
00096 me.length = 8;
00097 me.ignore_bits = ~0x0;
00098 me.ct_handle = user_ct_h;
00099 ptl->PtlMEAppend(1, me, PTL_PRIORITY_LIST, NULL, user_me_h);
00100
00101 md.start = &out_buf;
00102 md.length = 8;
00103 md.eq_handle = PTL_EQ_NONE;
00104 md.ct_handle = PTL_CT_NONE;
00105 ptl->PtlMDBind(md, &user_md_h);
00106
00107
00108
00109
00110
00111 ptl->PtlAtomic(user_md_h, 0, 8, 0, my_id, 0, 0, 0, NULL, 0, PTL_SUM, PTL_LONG);
00112 crReturn();
00113 ptl->PtlAtomic(user_md_h, 0, 8, 0, my_id ^ 0x1, 0, 0, 0, NULL, 0, PTL_SUM, PTL_LONG);
00114 crReturn();
00115
00116 ptl->PtlEnableCoalesce();
00117 crReturn();
00118
00119 for (i = 1 ; i < my_levels ; ++i) {
00120 next_level = 0x1 << i;
00121 remote = my_id ^ next_level;
00122 ptl->PtlTriggeredAtomic(my_level_md_hs[i - 1], 0, 8, 0, my_id, 0,
00123 i, 0, NULL, 0, PTL_SUM, PTL_LONG,
00124 my_level_ct_hs[i - 1], algo_count * 3 + 2);
00125 crReturn();
00126 ptl->PtlTriggeredAtomic(my_level_md_hs[i - 1], 0, 8, 0, remote, 0,
00127 i, 0, NULL, 0, PTL_SUM, PTL_LONG,
00128 my_level_ct_hs[i - 1], algo_count * 3 + 2);
00129 crReturn();
00130 ptl->PtlTriggeredAtomic(zero_md_h, 0, 8, 0, my_id, 0,
00131 i - 1, 0, NULL, 0, PTL_LAND, PTL_LONG,
00132 my_level_ct_hs[i - 1], algo_count * 3 + 2);
00133 crReturn();
00134 }
00135
00136
00137 ptl->PtlTriggeredPut(my_level_md_hs[my_levels - 1], 0, 8, 0, my_id, 1,
00138 0, 0, NULL, 0, my_level_ct_hs[my_levels - 1], algo_count * 3 + 2);
00139 crReturn();
00140 ptl->PtlTriggeredAtomic(zero_md_h, 0, 8, 0, my_id, 0,
00141 my_levels - 1, 0, NULL, 0, PTL_LAND, PTL_LONG,
00142 my_level_ct_hs[my_levels - 1], algo_count * 3 + 2);
00143 crReturn();
00144
00145 ptl->PtlDisableCoalesce();
00146 crReturn();
00147
00148 while (!ptl->PtlCTWait(user_ct_h, 1)) { crReturn(); }
00149 while (!ptl->PtlCTWait(my_level_ct_hs[my_levels - 1], algo_count * 3 + 3)) { crReturn(); }
00150
00151 ptl->PtlMEUnlink(user_me_h);
00152 crReturn();
00153 ptl->PtlCTFree(user_ct_h);
00154 crReturn();
00155 algo_count++;
00156
00157 trig_cpu::addTimeToStats(cpu->getCurrentSimTimeNano()-start_time);
00158
00159 if (out_buf != (uint64_t) num_nodes) {
00160 printf("%05d: got %lu, expected %lu\n",
00161 my_id, (unsigned long) out_buf, (unsigned long) num_nodes);
00162 }
00163
00164 crFinish();
00165 return true;
00166 }
00167
00168 private:
00169 allreduce_recdbl_triggered();
00170 allreduce_recdbl_triggered(const application& a);
00171 void operator=(allreduce_recdbl_triggered const&);
00172
00173 portals *ptl;
00174 SimTime_t start_time;
00175 int i;
00176 int my_levels;
00177 bool init;
00178
00179 std::vector<uint64_t> my_level_steps;
00180 std::vector<ptl_handle_ct_t> my_level_ct_hs;
00181 std::vector<ptl_handle_me_t> my_level_me_hs;
00182 std::vector<ptl_handle_md_t> my_level_md_hs;
00183
00184 ptl_handle_ct_t user_ct_h;
00185 ptl_handle_me_t user_me_h;
00186 ptl_handle_md_t user_md_h;
00187
00188 ptl_handle_md_t zero_md_h;
00189
00190 int adj;
00191 int next_level;
00192 int remote;
00193
00194 uint64_t in_buf, out_buf, tmp_buf, zero_buf;
00195
00196 uint64_t algo_count;
00197 };
00198
00199 #endif // COMPONENTS_TRIG_CPU_ALLREDUCE_RECDBL_TRIGGERED_H