Actual source code: ex1k.kokkos.cxx

  1: static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";

  3: /*
  4:   This is a simple test to measure the latency of MPI communication.
  5:   The test is run with two processes.  The first process sends a message
  6:   to the second process, and after having received the message, the second
  7:   process sends a message back to the first process.  This exchange is repeated
  8:   a number of times.  The latency is defined as half the round-trip time.

 10:   It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

 12:   Usage: mpirun -n 2 ./ex1k -mtype <type>
 13:   Other arguments have default values matching those used in osu_latency.

 15:   Examples:

 17:   On Summit at OLCF:
 18:     jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex1k  -mtype cuda

 20:   On Crusher at OLCF:
 21:     srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex1k -mtype hip
 22: */
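/*
  For reference, a minimal sketch of the raw MPI ping-pong that osu_latency performs and that the
  PetscSF Bcast/Reduce pair below mimics. This is illustrative only and not part of the benchmark;
  `iters`, `n`, `buf` and `rank` are assumed names:

    double t0 = MPI_Wtime();
    for (int k = 0; k < iters; k++) {
      if (rank == 0) {
        MPI_Send(buf, n, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
        MPI_Recv(buf, n, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      } else {
        MPI_Recv(buf, n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        MPI_Send(buf, n, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
      }
    }
    double latency_us = (MPI_Wtime() - t0) * 1e6 / (2.0 * iters); // half of the averaged round-trip time
*/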

 24: #include <petscsf.h>
 25: #include <petscdevice.h>
 26: #if defined(PETSC_HAVE_UNISTD_H)
 27:   #include <unistd.h>
 28: #endif

 30: #if defined(PETSC_HAVE_CUDA)
 31:   #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
 32: #elif defined(PETSC_HAVE_HIP)
 33:   #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
 34: #elif defined(PETSC_HAVE_KOKKOS)
 35:   #include <Kokkos_Core.hpp>
 36:   #define SyncDevice() Kokkos::fence()
 37: #else
 38:   #define SyncDevice()
 39: #endif

 41: /* Same values as OSU microbenchmarks */
 42: #define LAT_LOOP_SMALL     10000
 43: #define LAT_SKIP_SMALL     100
 44: #define LAT_LOOP_LARGE     1000
 45: #define LAT_SKIP_LARGE     10
 46: #define LARGE_MESSAGE_SIZE 8192

 48: static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype, size_t size, void **ptr)
 49: {
 50:   PetscFunctionBegin;
 51:   if (PetscMemTypeHost(mtype)) {
 52: #if defined(PETSC_HAVE_GETPAGESIZE)
 53:     PetscCall(posix_memalign(ptr, getpagesize(), size));
 54: #else
 55:     PetscCall(PetscMalloc(size, ptr));
 56: #endif
 57:   }
 58: #if defined(PETSC_HAVE_CUDA)
 59:   else if (PetscMemTypeCUDA(mtype))
 60:     PetscCallCUDA(cudaMalloc(ptr, size));
 61: #elif defined(PETSC_HAVE_HIP)
 62:   else if (PetscMemTypeHIP(mtype))
 63:     PetscCallHIP(hipMalloc(ptr, size));
 64: #elif defined(PETSC_HAVE_SYCL)
 65:   else if (PetscMemTypeSYCL(mtype))
 66:     PetscCallCXX(*ptr = Kokkos::kokkos_malloc(size));
 67: #endif
 68:   PetscFunctionReturn(PETSC_SUCCESS);
 69: }

 71: static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype, void *ptr)
 72: {
 73:   PetscFunctionBegin;
 74:   if (PetscMemTypeHost(mtype)) {
 75:     free(ptr);
 76:   }
 77: #if defined(PETSC_HAVE_CUDA)
 78:   else if (PetscMemTypeCUDA(mtype))
 79:     PetscCallCUDA(cudaFree(ptr));
 80: #elif defined(PETSC_HAVE_HIP)
 81:   else if (PetscMemTypeHIP(mtype))
 82:     PetscCallHIP(hipFree(ptr));
 83: #elif defined(PETSC_HAVE_SYCL)
 84:   else if (PetscMemTypeSYCL(mtype))
 85:     PetscCallCXX(Kokkos::kokkos_free(ptr));
 86: #endif
 87:   PetscFunctionReturn(PETSC_SUCCESS);
 88: }

 90: /* Free memory and set ptr to NULL on success */
 91: #define PetscFreeWithMemType(t, p) ((p) && (PetscFreeWithMemType_Private((t), (p)) || ((p) = NULL, 0)))
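/* Note: the macro above is an expression so it can be wrapped in PetscCall(): it evaluates to 0 when p
   is already NULL; otherwise it calls PetscFreeWithMemType_Private(), and only if that succeeds does the
   short-circuited ((p) = NULL, 0) clear the pointer and yield 0 (a failed free leaves p set and yields nonzero). */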

 93: static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype, void *dst, const void *src, size_t n)
 94: {
 95:   PetscFunctionBegin;
 96:   if (PetscMemTypeHost(mtype)) PetscCall(PetscMemcpy(dst, src, n));
 97: #if defined(PETSC_HAVE_CUDA)
 98:   else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice));
 99: #elif defined(PETSC_HAVE_HIP)
100:   else if (PetscMemTypeHIP(mtype)) PetscCallHIP(hipMemcpy(dst, src, n, hipMemcpyHostToDevice));
101: #elif defined(PETSC_HAVE_SYCL)
102:   else if (PetscMemTypeSYCL(mtype)) {
103:     Kokkos::View<char *>                          dstView((char *)dst, n);
104:     Kokkos::View<const char *, Kokkos::HostSpace> srcView((const char *)src, n);
105:     PetscCallCXX(Kokkos::deep_copy(dstView, srcView));
106:   }
107: #endif
108:   PetscFunctionReturn(PETSC_SUCCESS);
109: }

111: int main(int argc, char **argv)
112: {
113:   PetscSF        sf[64];
114:   PetscLogDouble t_start = 0, t_end = 0, time[64];
115:   PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
116:   PetscInt       maxn = 512 * 1024; /* max 4MB messages */
117:   PetscSFNode   *iremote;
118:   PetscMPIInt    rank, size;
119:   PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
120:   size_t         msgsize;
121:   PetscMemType   mtype       = PETSC_MEMTYPE_HOST;
122:   char           mstring[16] = {0};
123:   PetscBool      isCuda, isHip, isHost, isKokkos, set;
124:   PetscInt       skipSmall = -1, loopSmall = -1;
125:   MPI_Op         op = MPI_REPLACE;

127:   PetscFunctionBeginUser;
128:   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
129: #if defined(PETSC_HAVE_CUDA)
130:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
131: #elif defined(PETSC_HAVE_HIP)
132:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
133: #endif
134:   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
135:   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
136:   PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");

138:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
139:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
140:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));

142:   PetscCall(PetscMalloc1(maxn, &iremote));
143:   PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
144:   if (set) {
145:     PetscCall(PetscStrcasecmp(mstring, "cuda", &isCuda));
146:     PetscCall(PetscStrcasecmp(mstring, "hip", &isHip));
147:     PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
148:     PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));

150:     if (isHost) mtype = PETSC_MEMTYPE_HOST;
151:     else if (isCuda) mtype = PETSC_MEMTYPE_CUDA;
152:     else if (isHip) mtype = PETSC_MEMTYPE_HIP;
153:     else if (isKokkos) {
154:       mtype = PETSC_MEMTYPE_KOKKOS;
155:       PetscCall(PetscKokkosInitializeCheck());
156:     } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
157:   }

159:   PetscCall(PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&rootdata));
160:   PetscCall(PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&leafdata));

162:   PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
163:   for (i = 0; i < maxn; i++) {
164:     pbuf[i] = 123.0;
165:     ebuf[i] = 456.0;
166:   }

168:   for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
169:     PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
170:     PetscCall(PetscSFSetFromOptions(sf[i]));
171:     if (rank == 0) {
172:       nroots  = n;
173:       nleaves = 0;
174:     } else {
175:       nroots  = 0;
176:       nleaves = n;
177:       for (j = 0; j < nleaves; j++) {
178:         iremote[j].rank  = 0;
179:         iremote[j].index = j;
180:       }
181:     }
182:     PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
183:     PetscCall(PetscSFSetUp(sf[i]));
184:   }
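    /* With the graph above, rank 1's n leaves all reference rank 0's n roots, so a PetscSFBcast moves the
       message from rank 0 to rank 1 and a PetscSFReduce (with op = MPI_REPLACE) moves it back from rank 1
       to rank 0; one Bcast+Reduce pair in the timed loop below is therefore one round trip. */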

186:   if (loopSmall > 0) {
187:     nskip = skipSmall;
188:     niter = loopSmall;
189:   } else {
190:     nskip = LAT_SKIP_SMALL;
191:     niter = LAT_LOOP_SMALL;
192:   }

194:   for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
195:     msgsize = sizeof(PetscScalar) * n;
196:     PetscCall(PetscMemcpyFromHostWithMemType(mtype, rootdata, pbuf, msgsize));
197:     PetscCall(PetscMemcpyFromHostWithMemType(mtype, leafdata, ebuf, msgsize));

199:     if (msgsize > LARGE_MESSAGE_SIZE) {
200:       nskip = LAT_SKIP_LARGE;
201:       niter = LAT_LOOP_LARGE;
202:     }
203:     PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));

205:     for (i = 0; i < niter + nskip; i++) {
206:       if (i == nskip) {
207:         SyncDevice();
208:         PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
209:         t_start = MPI_Wtime();
210:       }
211:       PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
212:       PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
213:       PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
214:       PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
215:     }
216:     SyncDevice();
217:     PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
218:     t_end   = MPI_Wtime();
219:     time[j] = (t_end - t_start) * 1e6 / (niter * 2);
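      /* The measured interval covers niter Bcast+Reduce round trips, so multiplying by 1e6 and dividing by
         (niter * 2) reports the one-way latency in microseconds, i.e., half of the averaged round-trip time. */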
220:   }

222:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
223:   for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
224:     PetscCall(PetscSFDestroy(&sf[j]));
225:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
226:   }

228:   PetscCall(PetscFree2(pbuf, ebuf));
229:   PetscCall(PetscFreeWithMemType(mtype, rootdata));
230:   PetscCall(PetscFreeWithMemType(mtype, leafdata));
231:   PetscCall(PetscFree(iremote));
232:   PetscCall(PetscFinalize());
233:   return 0;
234: }

236: /**TEST
237:   testset:
238:     # use small numbers to make the test cheap
239:     args: -maxn 4 -skipSmall 1 -loopSmall 1
240:     filter: grep "DOES_NOT_EXIST"
241:     output_file: output/empty.out
242:     nsize: 2

244:     test:
245:       args: -mtype host

247:     test:
248:       requires: cuda
249:       args: -mtype cuda

251:     test:
252:       requires: hip
253:       args: -mtype hip

255:     test:
256:       requires: kokkos
257:       args: -mtype kokkos
258: TEST**/