Actual source code: ex1k.kokkos.cxx
static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";

/*
  This is a simple test to measure the latency of MPI communication.
  The test is run with two processes. The first process sends a message
  to the second process, and upon receiving it, the second process sends
  a message back to the first. This round trip is repeated a number of
  times, and the latency is reported as half the average round-trip time.

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpirun -n 2 ./ex1k -mtype <type>
  Other arguments have default values matching those used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex1k -mtype cuda

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex1k -mtype hip
*/
#include <petscsf.h>
#include <petscdevice.h>
#if defined(PETSC_HAVE_UNISTD_H)
  #include <unistd.h>
#endif
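
/* SyncDevice() blocks until all outstanding device work (kernels, async copies) has
   completed, so that host-side timestamps do not race with in-flight GPU operations */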
#if defined(PETSC_HAVE_CUDA)
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#elif defined(PETSC_HAVE_KOKKOS)
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#else
  #define SyncDevice()
#endif

/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL     10000
#define LAT_SKIP_SMALL     100
#define LAT_LOOP_LARGE     1000
#define LAT_SKIP_LARGE     10
#define LARGE_MESSAGE_SIZE 8192
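
/* Allocate size bytes of the requested memory type: for host memory, page-aligned memory
   from posix_memalign() when available; for device memory, through the matching runtime
   (CUDA, HIP, or Kokkos for SYCL) */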
static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype, size_t size, void **ptr)
{
  PetscFunctionBegin;
  if (PetscMemTypeHost(mtype)) {
#if defined(PETSC_HAVE_GETPAGESIZE)
    PetscCheck(!posix_memalign(ptr, getpagesize(), size), PETSC_COMM_SELF, PETSC_ERR_MEM, "Failed to posix_memalign()"); /* posix_memalign() returns an errno-style int, not a PetscErrorCode, so PetscCall() does not apply */
#else
    PetscCall(PetscMalloc(size, ptr));
#endif
  }
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMalloc(ptr, size));
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype)) PetscCallHIP(hipMalloc(ptr, size));
#elif defined(PETSC_HAVE_SYCL)
  else if (PetscMemTypeSYCL(mtype)) PetscCallCXX(*ptr = Kokkos::kokkos_malloc(size));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
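
/* Release a buffer obtained from PetscMallocWithMemType(), dispatching on the memory type */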
static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype, void *ptr)
{
  PetscFunctionBegin;
  if (PetscMemTypeHost(mtype)) {
#if defined(PETSC_HAVE_GETPAGESIZE)
    free(ptr); /* pairs with the posix_memalign() allocation above */
#else
    PetscCall(PetscFree(ptr)); /* pairs with PetscMalloc(); plain free() would not match it */
#endif
  }
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaFree(ptr));
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype)) PetscCallHIP(hipFree(ptr));
#elif defined(PETSC_HAVE_SYCL)
  else if (PetscMemTypeSYCL(mtype)) PetscCallCXX(Kokkos::kokkos_free(ptr));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free memory and set ptr to NULL on success */
#define PetscFreeWithMemType(t, p) ((p) && (PetscFreeWithMemType_Private((t), (p)) || ((p) = NULL, 0)))
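
/* Copy n bytes from host memory into a destination buffer of the given memory type */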
static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype, void *dst, const void *src, size_t n)
{
  PetscFunctionBegin;
  if (PetscMemTypeHost(mtype)) PetscCall(PetscMemcpy(dst, src, n));
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice));
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype)) PetscCallHIP(hipMemcpy(dst, src, n, hipMemcpyHostToDevice));
#elif defined(PETSC_HAVE_SYCL)
  else if (PetscMemTypeSYCL(mtype)) {
    Kokkos::View<char *> dstView((char *)dst, n);
    Kokkos::View<const char *, Kokkos::HostSpace> srcView((const char *)src, n);
    PetscCallCXX(Kokkos::deep_copy(dstView, srcView));
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

int main(int argc, char **argv)
{
  PetscSF        sf[64];
  PetscLogDouble t_start = 0, t_end = 0, time[64];
  PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
  PetscInt       maxn = 512 * 1024; /* max message size: 4MB (with 8-byte PetscScalar) */
  PetscSFNode   *iremote;
  PetscMPIInt    rank, size;
  PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
  size_t         msgsize;
  PetscMemType   mtype = PETSC_MEMTYPE_HOST;
  char           mstring[16] = {0};
  PetscBool      isCuda, isHip, isHost, isKokkos, set;
  PetscInt       skipSmall = -1, loopSmall = -1;
  MPI_Op         op = MPI_REPLACE;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, NULL, help));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
#elif defined(PETSC_HAVE_HIP)
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
#endif
  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
  PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");

  PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));

  PetscCall(PetscMalloc1(maxn, &iremote));
  PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
  if (set) {
    PetscCall(PetscStrcasecmp(mstring, "cuda", &isCuda));
    PetscCall(PetscStrcasecmp(mstring, "hip", &isHip));
    PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
    PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));

    if (isHost) mtype = PETSC_MEMTYPE_HOST;
    else if (isCuda) mtype = PETSC_MEMTYPE_CUDA;
    else if (isHip) mtype = PETSC_MEMTYPE_HIP;
    else if (isKokkos) {
      mtype = PETSC_MEMTYPE_KOKKOS;
      PetscCall(PetscKokkosInitializeCheck());
    } else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
  }

  PetscCall(PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&rootdata));
  PetscCall(PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&leafdata));

  PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf)); /* host staging buffers used to fill rootdata and leafdata */
  for (i = 0; i < maxn; i++) {
    pbuf[i] = 123.0;
    ebuf[i] = 456.0;
  }
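
  /* Build one star forest (SF) per message size n = 1, 2, 4, ..., maxn. Rank 1 owns n
     leaves, with leaf j referencing root j on rank 0, so a Bcast on sf[i] moves n scalars
     from rank 0's rootdata to rank 1's leafdata, and a Reduce moves them back. */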
  for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
    PetscCall(PetscSFSetFromOptions(sf[i]));
    if (rank == 0) {
      nroots  = n;
      nleaves = 0;
    } else {
      nroots  = 0;
      nleaves = n;
      for (j = 0; j < nleaves; j++) {
        iremote[j].rank  = 0;
        iremote[j].index = j;
      }
    }
    PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
    PetscCall(PetscSFSetUp(sf[i]));
  }

  if (loopSmall > 0) {
    nskip = skipSmall;
    niter = loopSmall;
  } else {
    nskip = LAT_SKIP_SMALL;
    niter = LAT_LOOP_SMALL;
  }
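
  /* Ping-pong over doubling message sizes, starting with the small-message iteration
     counts; once a message exceeds LARGE_MESSAGE_SIZE bytes, switch to the smaller
     large-message counts since each round trip takes longer */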
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    msgsize = sizeof(PetscScalar) * n;
    PetscCall(PetscMemcpyFromHostWithMemType(mtype, rootdata, pbuf, msgsize));
    PetscCall(PetscMemcpyFromHostWithMemType(mtype, leafdata, ebuf, msgsize));

    if (msgsize > LARGE_MESSAGE_SIZE) {
      nskip = LAT_SKIP_LARGE;
      niter = LAT_LOOP_LARGE;
    }
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
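
    /* Each iteration is one full round trip: Bcast sends rootdata from rank 0 into rank 1's
       leafdata, then Reduce (with MPI_REPLACE) sends it back. The first nskip iterations are
       untimed warm-up; the clock starts at i == nskip. */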
    for (i = 0; i < niter + nskip; i++) {
      if (i == nskip) {
        SyncDevice();
        PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
        t_start = MPI_Wtime();
      }
      PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
      PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
      PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
      PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
    }
    SyncDevice();
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    t_end   = MPI_Wtime();
    time[j] = (t_end - t_start) * 1e6 / (niter * 2); /* seconds -> microseconds, halved per iteration: one-way latency */
  }

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    PetscCall(PetscSFDestroy(&sf[j]));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
  }

  PetscCall(PetscFree2(pbuf, ebuf));
  PetscCall(PetscFreeWithMemType(mtype, rootdata));
  PetscCall(PetscFreeWithMemType(mtype, leafdata));
  PetscCall(PetscFree(iremote));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST
  testset:
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype host

    test:
      requires: cuda
      args: -mtype cuda

    test:
      requires: hip
      args: -mtype hip

    test:
      requires: kokkos
      args: -mtype kokkos
TEST*/