Actual source code: cupminterface.hpp

  1: #ifndef PETSCCUPMINTERFACE_HPP
  2: #define PETSCCUPMINTERFACE_HPP

  4: #if defined(__cplusplus)
  5: #include <petsc/private/cpputil.hpp>
  6: #include <petsc/private/petscadvancedmacros.h>
  7: #include <petscdevice_cupm.h>

  9:   #include <array>

 11: namespace Petsc
 12: {

 14: namespace device
 15: {

 17: namespace cupm
 18: {

 20: // enum describing available cupm devices, this is used as the template parameter to any
 21: // class subclassing the Interface or using it as a member variable
 22: enum class DeviceType : int {
 23:   CUDA,
 24:   HIP
 25: };

 27: static constexpr std::array<const char *const, 5> DeviceTypes = {"cuda", "hip", "Petsc::Device::CUPM::DeviceType", "Petsc::Device::CUPM::DeviceType::", nullptr};

 29: namespace impl
 30: {

 32:   // A backend agnostic PetscCallCUPM() function, this will only work inside the member
 33:   // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
 34:   // functions can also be wrapped inline:
 35:   //
 36:   // foo<int,char,bool>();
 37:   #define PetscCallCUPM(...) \
 38:     do { \
 39:       const cupmError_t cerr_p_ = __VA_ARGS__; \
 41:     } while (0)

 43:   #define PetscCallCUPMAbort(comm_, ...) \
 44:     do { \
 45:       const cupmError_t cerr_p_ = __VA_ARGS__; \
 47:     } while (0)

 49:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT() - declaration to alias a cuda/hip integral constant
 50:   // value
 51:   //
 52:   // input params:
 53:   // our_prefix   - the prefix of the alias
 54:   // our_suffix   - the suffix of the alias
 55:   // their_prefix - the prefix of the variable being aliased
 56:   // their_suffix - the suffix of the variable being aliased
 57:   //
 58:   // example usage:
 59:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm,Success,cuda,AllGood); ->
 60:   // static const auto cupmSuccess = cudaAllGood;
 61:   //
 62:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm,Success,hip,AllRight); ->
 63:   // static const auto cupmSuccess = hipAllRight;
 64:   #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(our_prefix, our_suffix, their_prefix, their_suffix) static const auto PetscConcat(our_prefix, our_suffix) = PetscConcat(their_prefix, their_suffix)

 66:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON() - declaration to alias a cuda/hip integral constant
 67:   // value
 68:   //
 69:   // input params:
 70:   // our_suffix   - the suffix of the alias
 71:   // their_suffix - the suffix of the variable being aliased
 72:   //
 73:   // notes:
 74:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix
 75:   //
 76:   // example usage:
 77:   // #define PETSC_CUPM_PREFIX_L cuda
 78:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(Success,AllGood); ->
 79:   // static const auto cupmSuccess = cudaAllGood;
 80:   //
 81:   // #define PETSC_CUPM_PREFIX_L hip
 82:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(Success,AllRight); ->
 83:   // static const auto cupmSuccess = hipAllRight;
 84:   #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(our_suffix, their_suffix) PETSC_CUPM_ALIAS_INTEGRAL_VALUE_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix)

 86:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE() - declaration to alias a cuda/hip integral constant value
 87:   //
 88:   // input param:
 89:   // suffix - the common suffix shared between cuda, hip, and cupm
 90:   //
 91:   // notes:
 92:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix
 93:   //
 94:   // example usage:
 95:   // #define PETSC_CUPM_PREFIX_L cuda
 96:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success); -> static const auto cupmSuccess = cudaSuccess;
 97:   //
 98:   // #define PETSC_CUPM_PREFIX_L hip
 99:   // PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success); -> static const auto cupmSuccess = hipSuccess;
100:   #define PETSC_CUPM_ALIAS_INTEGRAL_VALUE(suffix) PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(suffix, suffix)

102:   // PETSC_CUPM_ALIAS_FUNCTION_EXACT() - declaration to alias a cuda/hip function
103:   //
104:   // input params:
105:   // our_prefix   - the prefix of the alias
106:   // our_suffix   - the suffix of the alias
107:   // their_prefix - the prefix of the function being aliased
108:   // their_suffix - the suffix of the function being aliased
109:   //
110:   // notes:
111:   // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
112:   //
113:   // example usage:
114:   // PETSC_CUPM_ALIAS_FUNCTION_EXACT(cupm,Malloc,cuda,Malloc) ->
115:   // template <typename... T>
116:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
117:   // {
118:   //   return cudaMalloc(std::forward<T>(args)...);
119:   // }
120:   #define PETSC_CUPM_ALIAS_FUNCTION_EXACT(our_prefix, our_suffix, their_prefix, their_suffix) PETSC_ALIAS_FUNCTION(static PetscConcat(our_prefix, our_suffix), PetscConcat(their_prefix, their_suffix))

122:   // PETSC_CUPM_ALIAS_FUNCTION_COMMON() - declaration to alias a cuda/hip function
123:   //
124:   // input params:
125:   // our_suffix   - the suffix of the alias
126:   // their_suffix - the common suffix of the cuda/hip function being aliased
127:   //
128:   // notes:
129:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
130:   // aliased. see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
131:   //
132:   // example usage:
133:   // #define PETSC_CUPM_PREFIX_L cuda
134:   // PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocFancy,Malloc) ->
135:   // template <typename... T>
136:   // static constexpr auto cupmMallocFancy(T&&... args) *noexcept and trailing return type deduction*
137:   // {
138:   //   return cudaMalloc(std::forward<T>(args)...);
139:   // }
140:   //
141:   // #define PETSC_CUPM_PREFIX_L hip
142:   // PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocFancy,Malloc) ->
143:   // template <typename... T>
144:   // static constexpr auto cupmMallocFancy(T&&... args) *noexcept and trailing return type deduction*
145:   // {
146:   //   return hipMalloc(std::forward<T>(args)...);
147:   // }
148:   #define PETSC_CUPM_ALIAS_FUNCTION_COMMON(our_suffix, their_suffix) PETSC_CUPM_ALIAS_FUNCTION_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix)

150:   // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
151:   //
152:   // input param:
153:   // suffix - the common suffix for hip, cuda and the alias
154:   //
155:   // notes:
156:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
157:   // aliased. see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
158:   //
159:   // example usage:
160:   // #define PETSC_CUPM_PREFIX_L cuda
161:   // PETSC_CUPM_ALIAS_FUNCTION(Malloc) ->
162:   // template <typename... T>
163:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
164:   // {
165:   //   return cudaMalloc(std::forward<T>(args)...);
166:   // }
167:   //
168:   // #define PETSC_CUPM_PREFIX_L hip
169:   // PETSC_CUPM_ALIAS_FUNCTION(Malloc) ->
170:   // template <typename... T>
171:   // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
172:   // {
173:   //   return hipMalloc(std::forward<T>(args)...);
174:   // }
175:   #define PETSC_CUPM_ALIAS_FUNCTION(suffix) PETSC_CUPM_ALIAS_FUNCTION_COMMON(suffix, suffix)

177:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT() - declaration to alias a cuda/hip function but
178:   // discard the last N arguments
179:   //
180:   // input params:
181:   // our_prefix   - the prefix of the alias
182:   // our_suffix   - the suffix of the alias
183:   // their_prefix - the prefix of the function being aliased
184:   // their_suffix - the suffix of the function being aliased
185:   // N            - integer constant [0,INT_MAX) dictating how many arguments to chop off the end
186:   //
187:   // notes:
188:   // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
189:   //
190:   // example use:
191:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(cupm,MallocAsync,cuda,Malloc,1) ->
192:   // template <typename... T, typename Tend>
193:   // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
194:   // return type deduction*
195:   // {
196:   //   (void)argend;
197:   //   return cudaMalloc(std::forward<T>(args)...);
198:   // }
199:   #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(our_prefix, our_suffix, their_prefix, their_suffix, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static PetscConcat(our_prefix, our_suffix), PetscConcat(their_prefix, their_suffix), N)

201:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON() - declaration to alias a cuda/hip function but
202:   // discard the last N arguments
203:   //
204:   // input params:
205:   // our_suffix   - the suffix of the alias
206:   // their_suffix - the suffix of the function being aliased
207:   // N            - integer constant [0,INT_MAX) dictating how many arguments to chop off the end
208:   //
209:   // notes:
210:   // requires PETSC_CUPM_PREFIX_L to be defined to the specific prefix of the function being
211:   // aliased. see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the
212:   // expansion
213:   //
214:   // example use:
215:   // #define PETSC_CUPM_PREFIX_L cuda
216:   // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync,Malloc,1) ->
217:   // template <typename... T, typename Tend>
218:   // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
219:   // return type deduction*
220:   // {
221:   //   (void)argend;
222:   //   return cudaMalloc(std::forward<T>(args)...);
223:   // }
224:   #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(our_suffix, their_suffix, N) PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_EXACT(cupm, our_suffix, PETSC_CUPM_PREFIX_L, their_suffix, N)

226: // Base class that holds functions and variables that don't require CUDA or HIP to be present
227: // on the system
228: template <DeviceType T>
229: struct InterfaceBase {
230:   static const DeviceType type = T;

232:   PETSC_CXX_COMPAT_DECL(constexpr const char *cupmName())
233:   {
234:     static_assert(util::integral_value(DeviceType::CUDA) == 0, "");
235:     static_assert(util::integral_value(DeviceType::HIP) == 1, "");
236:     return std::get<util::integral_value(T)>(DeviceTypes);
237:   }

239:   PETSC_CXX_COMPAT_DECL(constexpr auto PETSC_DEVICE_CUPM())
240:   PETSC_DECLTYPE_AUTO_RETURNS(T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP)

242:   PETSC_CXX_COMPAT_DECL(constexpr auto PETSC_MEMTYPE_CUPM())
243:   PETSC_DECLTYPE_AUTO_RETURNS(T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP)
244: };

246: // declare the base class static member variables
247: template <DeviceType T>
248: const DeviceType InterfaceBase<T>::type;

250:   #define PETSC_CUPM_BASE_CLASS_HEADER(base_name, DEVICE_TYPE) \
251:     using base_name = ::Petsc::device::cupm::impl::InterfaceBase<DEVICE_TYPE>; \
252:     using base_name::type; \
253:     using base_name::cupmName; \
254:     using base_name::PETSC_DEVICE_CUPM; \
255:     using base_name::PETSC_MEMTYPE_CUPM

257: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
258: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
259: // import them into classes.
260: template <DeviceType>
261: struct InterfaceImpl;

263:   #if PetscDefined(HAVE_CUDA)
264:     #define PETSC_CUPM_PREFIX_L cuda
265:     #define PETSC_CUPM_PREFIX_U CUDA
266: template <>
267: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
268:   PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::CUDA);

270:   // typedefs
271:   using cupmError_t             = cudaError_t;
272:   using cupmEvent_t             = cudaEvent_t;
273:   using cupmStream_t            = cudaStream_t;
274:   using cupmDeviceProp_t        = cudaDeviceProp;
275:   using cupmMemcpyKind_t        = cudaMemcpyKind;
276:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
277:   using cupmPointerAttributes_t = struct cudaPointerAttributes;
278:   using cupmMemoryType_t        = enum cudaMemoryType;
279:   using cupmDim3                = dim3;
280:   using cupmHostFn_t            = cudaHostFn_t;
281:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
282:   using cupmMemPool_t   = cudaMemPool_t;
283:   using cupmMemPoolAttr = cudaMemPoolAttr;
284:     #else
285:   using cupmMemPool_t   = void *;
286:   using cupmMemPoolAttr = unsigned int;
287:     #endif

289:   // values
290:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success);
291:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNotReady);
292:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorDeviceAlreadyInUse);
293:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorSetOnActiveProcess);
294:     #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
295:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorStubLibrary);
296:     #else
297:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorStubLibrary, ErrorInsufficientDriver);
298:     #endif
299:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNoDevice);
300:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamDefault);
301:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamNonBlocking);
302:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(DeviceMapHost);
303:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToDevice);
304:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToHost);
305:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToDevice);
306:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToHost);
307:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDefault);
308:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeHost);
309:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeDevice);
310:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeManaged);
311:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(EventDisableTiming);
312:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(HostAllocDefault);
313:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(HostAllocWriteCombined);
314:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
315:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemPoolAttrReleaseThreshold);
316:     #else
317:   static const cupmMemPoolAttr cupmMemPoolAttrReleaseThreshold = 0;
318:     #endif

320:   // error functions
321:   PETSC_CUPM_ALIAS_FUNCTION(GetErrorName)
322:   PETSC_CUPM_ALIAS_FUNCTION(GetErrorString)
323:   PETSC_CUPM_ALIAS_FUNCTION(GetLastError)

325:   // device management
326:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceCount)
327:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceProperties)
328:   PETSC_CUPM_ALIAS_FUNCTION(GetDevice)
329:   PETSC_CUPM_ALIAS_FUNCTION(SetDevice)
330:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceFlags)
331:   PETSC_CUPM_ALIAS_FUNCTION(SetDeviceFlags)
332:   PETSC_CUPM_ALIAS_FUNCTION(PointerGetAttributes)
333:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
334:   PETSC_CUPM_ALIAS_FUNCTION(DeviceGetMemPool)
335:   PETSC_CUPM_ALIAS_FUNCTION(MemPoolSetAttribute)
336:     #else
337:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int))
338:   {
339:     *pool = nullptr;
340:     return cupmSuccess;
341:   }

343:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *)) { return cupmSuccess; }
344:     #endif
345:   // CUDA has no cudaInit() to match hipInit()
346:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmInit(unsigned int)) { return cudaFree(nullptr); }

348:   // stream management
349:   PETSC_CUPM_ALIAS_FUNCTION(EventCreate)
350:   PETSC_CUPM_ALIAS_FUNCTION(EventCreateWithFlags)
351:   PETSC_CUPM_ALIAS_FUNCTION(EventDestroy)
352:   PETSC_CUPM_ALIAS_FUNCTION(EventRecord)
353:   PETSC_CUPM_ALIAS_FUNCTION(EventSynchronize)
354:   PETSC_CUPM_ALIAS_FUNCTION(EventElapsedTime)
355:   PETSC_CUPM_ALIAS_FUNCTION(EventQuery)
356:   PETSC_CUPM_ALIAS_FUNCTION(StreamCreate)
357:   PETSC_CUPM_ALIAS_FUNCTION(StreamCreateWithFlags)
358:   PETSC_CUPM_ALIAS_FUNCTION(StreamGetFlags)
359:   PETSC_CUPM_ALIAS_FUNCTION(StreamDestroy)
360:   PETSC_CUPM_ALIAS_FUNCTION(StreamWaitEvent)
361:   PETSC_CUPM_ALIAS_FUNCTION(StreamQuery)
362:   PETSC_CUPM_ALIAS_FUNCTION(StreamSynchronize)
363:   PETSC_CUPM_ALIAS_FUNCTION(DeviceSynchronize)
364:   PETSC_CUPM_ALIAS_FUNCTION(GetSymbolAddress)

366:   // memory management
367:   PETSC_CUPM_ALIAS_FUNCTION(Free)
368:   PETSC_CUPM_ALIAS_FUNCTION(Malloc)
369:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
370:   PETSC_CUPM_ALIAS_FUNCTION(FreeAsync)
371:   PETSC_CUPM_ALIAS_FUNCTION(MallocAsync)
372:     #else
373:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(FreeAsync, Free, 1)
374:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync, Malloc, 1)
375:     #endif
376:   PETSC_CUPM_ALIAS_FUNCTION(Memcpy)
377:   PETSC_CUPM_ALIAS_FUNCTION(MemcpyAsync)
378:   PETSC_CUPM_ALIAS_FUNCTION(MallocHost)
379:   PETSC_CUPM_ALIAS_FUNCTION(FreeHost)
380:   PETSC_CUPM_ALIAS_FUNCTION(Memset)
381:     #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
382:   PETSC_CUPM_ALIAS_FUNCTION(MemsetAsync)
383:     #else
384:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MemsetAsync, Memset, 1)
385:     #endif

387:   // launch control
388:   PETSC_CUPM_ALIAS_FUNCTION(LaunchHostFunc)
389:   template <typename FunctionT, typename... KernelArgsT>
390:   PETSC_CXX_COMPAT_DECL(cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs))
391:   {
392:     void *args[] = {(void *)&kernelArgs...};
393:     return cudaLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
394:   }
395: };
396:     #undef PETSC_CUPM_PREFIX_L
397:     #undef PETSC_CUPM_PREFIX_U
398:   #endif // PetscDefined(HAVE_CUDA)

400:   #if PetscDefined(HAVE_HIP)
401:     #define PETSC_CUPM_PREFIX_L hip
402:     #define PETSC_CUPM_PREFIX_U HIP
403: template <>
404: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
405:   PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::HIP);

407:   // typedefs
408:   using cupmError_t             = hipError_t;
409:   using cupmEvent_t             = hipEvent_t;
410:   using cupmStream_t            = hipStream_t;
411:   using cupmDeviceProp_t        = hipDeviceProp_t;
412:   using cupmMemcpyKind_t        = hipMemcpyKind;
413:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
414:   using cupmPointerAttributes_t = hipPointerAttribute_t;
415:   using cupmMemoryType_t        = enum hipMemoryType;
416:   using cupmDim3                = dim3;
417:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
418:   using cupmHostFn_t    = hipHostFn_t;
419:   using cupmMemPool_t   = hipMemPool_t;
420:   using cupmMemPoolAttr = hipMemPoolAttr;
421:     #else
422:   using cupmHostFn_t                                           = void (*)(void *);
423:   using cupmMemPool_t                                          = void *;
424:   using cupmMemPoolAttr                                        = unsigned int;
425:     #endif

427:   // values
428:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(Success);
429:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNotReady);
430:   // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
431:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorDeviceAlreadyInUse, ErrorContextAlreadyInUse);
432:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorSetOnActiveProcess);
433:   // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
434:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(ErrorStubLibrary, ErrorInsufficientDriver);
435:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(ErrorNoDevice);
436:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamDefault);
437:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(StreamNonBlocking);
438:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(DeviceMapHost);
439:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToDevice);
440:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToHost);
441:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDeviceToDevice);
442:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyHostToHost);
443:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemcpyDefault);
444:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeHost);
445:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemoryTypeDevice);
446:   // see
447:   // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
448:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(MemoryTypeManaged, MemoryTypeUnified);
449:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(EventDisableTiming);
450:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(HostAllocDefault, HostMallocDefault);
451:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE_COMMON(HostAllocWriteCombined, HostMallocWriteCombined);
452:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
453:   PETSC_CUPM_ALIAS_INTEGRAL_VALUE(MemPoolAttrReleaseThreshold);
454:     #else
455:   static const cupmMemPoolAttr cupmMemPoolAttrReleaseThreshold = 0;
456:     #endif

458:   // error functions
459:   PETSC_CUPM_ALIAS_FUNCTION(GetErrorName)
460:   PETSC_CUPM_ALIAS_FUNCTION(GetErrorString)
461:   PETSC_CUPM_ALIAS_FUNCTION(GetLastError)

463:   // device management
464:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceCount)
465:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceProperties)
466:   PETSC_CUPM_ALIAS_FUNCTION(GetDevice)
467:   PETSC_CUPM_ALIAS_FUNCTION(SetDevice)
468:   PETSC_CUPM_ALIAS_FUNCTION(GetDeviceFlags)
469:   PETSC_CUPM_ALIAS_FUNCTION(SetDeviceFlags)
470:   PETSC_CUPM_ALIAS_FUNCTION(PointerGetAttributes)
471:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
472:   PETSC_CUPM_ALIAS_FUNCTION(DeviceGetMemPool)
473:   PETSC_CUPM_ALIAS_FUNCTION(MemPoolSetAttribute)
474:     #else
475:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int))
476:   {
477:     *pool = nullptr;
478:     return cupmSuccess;
479:   }

481:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *)) { return cupmSuccess; }
482:     #endif
483:   PETSC_CUPM_ALIAS_FUNCTION(Init)

485:   // stream management
486:   PETSC_CUPM_ALIAS_FUNCTION(EventCreate)
487:   PETSC_CUPM_ALIAS_FUNCTION(EventCreateWithFlags)
488:   PETSC_CUPM_ALIAS_FUNCTION(EventDestroy)
489:   PETSC_CUPM_ALIAS_FUNCTION(EventRecord)
490:   PETSC_CUPM_ALIAS_FUNCTION(EventSynchronize)
491:   PETSC_CUPM_ALIAS_FUNCTION(EventElapsedTime)
492:   PETSC_CUPM_ALIAS_FUNCTION(EventQuery)
493:   PETSC_CUPM_ALIAS_FUNCTION(StreamCreate)
494:   PETSC_CUPM_ALIAS_FUNCTION(StreamCreateWithFlags)
495:   PETSC_CUPM_ALIAS_FUNCTION(StreamGetFlags)
496:   PETSC_CUPM_ALIAS_FUNCTION(StreamDestroy)
497:   PETSC_CUPM_ALIAS_FUNCTION(StreamWaitEvent)
498:   PETSC_CUPM_ALIAS_FUNCTION(StreamQuery)
499:   PETSC_CUPM_ALIAS_FUNCTION(StreamSynchronize)
500:   PETSC_CUPM_ALIAS_FUNCTION(DeviceSynchronize)
501:   PETSC_CUPM_ALIAS_FUNCTION(GetSymbolAddress)

503:   // memory management
504:   PETSC_CUPM_ALIAS_FUNCTION(Free)
505:   PETSC_CUPM_ALIAS_FUNCTION(Malloc)
506:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
507:   PETSC_CUPM_ALIAS_FUNCTION(MallocAsync);
508:   PETSC_CUPM_ALIAS_FUNCTION(FreeAsync);
509:     #else
510:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(MallocAsync, Malloc, 1)
511:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(FreeAsync, Free, 1)
512:     #endif
513:   PETSC_CUPM_ALIAS_FUNCTION(Memcpy)
514:   PETSC_CUPM_ALIAS_FUNCTION(MemcpyAsync)
515:   // hipMallocHost is deprecated
516:   PETSC_CUPM_ALIAS_FUNCTION_COMMON(MallocHost, HostMalloc)
517:   // hipFreeHost is deprecated
518:   PETSC_CUPM_ALIAS_FUNCTION_COMMON(FreeHost, HostFree)
519:   PETSC_CUPM_ALIAS_FUNCTION(Memset)
520:   PETSC_CUPM_ALIAS_FUNCTION(MemsetAsync)

522:       // launch control
523:       // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
524:       // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
525:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
526:   PETSC_CUPM_ALIAS_FUNCTION(LaunchHostFunc);
527:     #else
528:   PETSC_CXX_COMPAT_DECL(hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx))
529:   {
530:     // the only correct way to spoof this function is to do it synchronously...
531:     auto herr = hipStreamSynchronize(stream);
532:     if (PetscUnlikely(herr != hipSuccess)) return herr;
533:     fn(ctx);
534:     return herr;
535:   }
536:     #endif

538:   template <typename FunctionT, typename... KernelArgsT>
539:   PETSC_CXX_COMPAT_DECL(hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs))
540:   {
541:     void *args[] = {(void *)&kernelArgs...};
542:     return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
543:   }
544: };
545:     #undef PETSC_CUPM_PREFIX_L
546:     #undef PETSC_CUPM_PREFIX_U
547:   #endif // PetscDefined(HAVE_HIP)

549:   // shorthand for bringing all of the typedefs from the base Interface class into your own,
550:   // it's annoying that c++ doesn't have a way to do this automatically
551:   #define PETSC_CUPM_IMPL_CLASS_HEADER(base_name, T) \
552:     PETSC_CUPM_BASE_CLASS_HEADER(PetscConcat(base_, base_name), T); \
553:     using base_name = ::Petsc::device::cupm::impl::InterfaceImpl<T>; \
554:     /* types */ \
555:     using typename base_name::cupmComplex_t; \
556:     using typename base_name::cupmError_t; \
557:     using typename base_name::cupmEvent_t; \
558:     using typename base_name::cupmStream_t; \
559:     using typename base_name::cupmDeviceProp_t; \
560:     using typename base_name::cupmMemcpyKind_t; \
561:     using typename base_name::cupmPointerAttributes_t; \
562:     using typename base_name::cupmMemoryType_t; \
563:     using typename base_name::cupmDim3; \
564:     using typename base_name::cupmMemPool_t; \
565:     using typename base_name::cupmMemPoolAttr; \
566:     /* variables */ \
567:     using base_name::cupmSuccess; \
568:     using base_name::cupmErrorNotReady; \
569:     using base_name::cupmErrorDeviceAlreadyInUse; \
570:     using base_name::cupmErrorSetOnActiveProcess; \
571:     using base_name::cupmErrorStubLibrary; \
572:     using base_name::cupmErrorNoDevice; \
573:     using base_name::cupmStreamDefault; \
574:     using base_name::cupmStreamNonBlocking; \
575:     using base_name::cupmDeviceMapHost; \
576:     using base_name::cupmMemcpyHostToDevice; \
577:     using base_name::cupmMemcpyDeviceToHost; \
578:     using base_name::cupmMemcpyDeviceToDevice; \
579:     using base_name::cupmMemcpyHostToHost; \
580:     using base_name::cupmMemcpyDefault; \
581:     using base_name::cupmMemoryTypeHost; \
582:     using base_name::cupmMemoryTypeDevice; \
583:     using base_name::cupmMemoryTypeManaged; \
584:     using base_name::cupmEventDisableTiming; \
585:     using base_name::cupmHostAllocDefault; \
586:     using base_name::cupmHostAllocWriteCombined; \
587:     using base_name::cupmMemPoolAttrReleaseThreshold; \
588:     /* functions */ \
589:     using base_name::cupmGetErrorName; \
590:     using base_name::cupmGetErrorString; \
591:     using base_name::cupmGetLastError; \
592:     using base_name::cupmGetDeviceCount; \
593:     using base_name::cupmGetDeviceProperties; \
594:     using base_name::cupmGetDevice; \
595:     using base_name::cupmSetDevice; \
596:     using base_name::cupmGetDeviceFlags; \
597:     using base_name::cupmSetDeviceFlags; \
598:     using base_name::cupmPointerGetAttributes; \
599:     using base_name::cupmDeviceGetMemPool; \
600:     using base_name::cupmMemPoolSetAttribute; \
601:     using base_name::cupmInit; \
602:     using base_name::cupmEventCreate; \
603:     using base_name::cupmEventCreateWithFlags; \
604:     using base_name::cupmEventDestroy; \
605:     using base_name::cupmEventRecord; \
606:     using base_name::cupmEventSynchronize; \
607:     using base_name::cupmEventElapsedTime; \
608:     using base_name::cupmEventQuery; \
609:     using base_name::cupmStreamCreate; \
610:     using base_name::cupmStreamCreateWithFlags; \
611:     using base_name::cupmStreamGetFlags; \
612:     using base_name::cupmStreamDestroy; \
613:     using base_name::cupmStreamWaitEvent; \
614:     using base_name::cupmStreamQuery; \
615:     using base_name::cupmStreamSynchronize; \
616:     using base_name::cupmDeviceSynchronize; \
617:     using base_name::cupmGetSymbolAddress; \
618:     using base_name::cupmMalloc; \
619:     using base_name::cupmMallocAsync; \
620:     using base_name::cupmMemcpy; \
621:     using base_name::cupmMemcpyAsync; \
622:     using base_name::cupmMallocHost; \
623:     using base_name::cupmMemset; \
624:     using base_name::cupmMemsetAsync; \
625:     using base_name::cupmLaunchHostFunc

627: template <DeviceType>
628: struct Interface;

630: // The actual interface class
631: template <DeviceType T>
632: struct Interface : InterfaceImpl<T> {
633:   PETSC_CUPM_IMPL_CLASS_HEADER(interface_type, T);

635:   using cupmReal_t   = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
636:   using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;

638:   // REVIEW ME: this needs to be cleaned up, it is unreadable
639:   PETSC_CXX_COMPAT_DECL(constexpr auto makeCupmScalar(PetscScalar s))
640:   PETSC_DECLTYPE_AUTO_RETURNS(PetscIfPetscDefined(USE_COMPLEX, (cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)}), static_cast<cupmReal_t>(s)));

642:   PETSC_CXX_COMPAT_DECL(constexpr auto cupmScalarCast(const PetscScalar *s))
643:   PETSC_DECLTYPE_AUTO_RETURNS(reinterpret_cast<const cupmScalar_t *>(s));

645:   PETSC_CXX_COMPAT_DECL(constexpr auto cupmScalarCast(PetscScalar *s))
646:   PETSC_DECLTYPE_AUTO_RETURNS(reinterpret_cast<cupmScalar_t *>(s));

648:   PETSC_CXX_COMPAT_DECL(constexpr auto cupmRealCast(PetscReal *s))
649:   PETSC_DECLTYPE_AUTO_RETURNS(reinterpret_cast<cupmReal_t *>(s));

651:   PETSC_CXX_COMPAT_DECL(constexpr auto cupmRealCast(const PetscReal *s))
652:   PETSC_DECLTYPE_AUTO_RETURNS(reinterpret_cast<const cupmReal_t *>(s));

654:   #if !defined(PETSC_PKG_CUDA_VERSION_GE)
655:     #define PETSC_PKG_CUDA_VERSION_GE(...) 0
656:     #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
657:   #endif
658:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr))
659:   {
660:     cupmPointerAttributes_t attr;
661:     cupmError_t             cerr;

664:     if (registered) {
666:       *registered = PETSC_FALSE;
667:     }
668:     if (managed) {
670:       *managed = PETSC_FALSE;
671:     }
672:     // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
673:     // a host pointer returns cudaErrorInvalidValue
674:     cerr = cupmPointerGetAttributes(&attr, data);
675:     cerr = cupmGetLastError();
676:       // HIP seems to always have used memoryType though
677:   #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || defined(__HIP_PLATFORM_HCC__)
678:     const auto mtype = attr.memoryType;
679:     if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
680:   #else
681:     if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) cerr;
682:     const auto mtype = attr.type;
683:     if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
684:   #endif // CUDART_VERSION && CUDART_VERSION < 10000 || __HIP_PLATFORM_HCC__
685:     if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
686:     if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
687:     return 0;
688:   }
689:   #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
690:     #undef PETSC_PKG_CUDA_VERSION_GE
691:   #endif

693:   PETSC_CXX_COMPAT_DECL(PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode))
694:   {
695:     switch (mode) {
696:     case PETSC_DEVICE_COPY_HTOH:
697:       return cupmMemcpyHostToHost;
698:     case PETSC_DEVICE_COPY_HTOD:
699:       return cupmMemcpyHostToDevice;
700:     case PETSC_DEVICE_COPY_DTOD:
701:       return cupmMemcpyDeviceToDevice;
702:     case PETSC_DEVICE_COPY_DTOH:
703:       return cupmMemcpyDeviceToHost;
704:     case PETSC_DEVICE_COPY_AUTO:
705:       return cupmMemcpyDefault;
706:     }
707:     PetscUnreachable();
708:     return cupmMemcpyDefault;
709:   }

  // The wrappers below change what the arguments mean (e.g. n counts objects of type M, not
  // bytes), so they carry the Petsc prefix instead of reusing the cupm names
712:   template <typename M>
713:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr))
714:   {
715:     static_assert(!std::is_void<M>::value, "");

718:     if (PetscLikely(n)) {
719:       cupmMallocAsync(reinterpret_cast<void **>(ptr), n * sizeof(M), stream);
720:     } else {
721:       *ptr = nullptr;
722:     }
723:     return 0;
724:   }

726:   template <typename M>
727:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n))
728:   {
729:     PetscCUPMMallocAsync(ptr, n);
730:     return 0;
731:   }

733:   template <typename M>
734:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault))
735:   {
736:     static_assert(!std::is_void<M>::value, "");

739:     *ptr = nullptr;
740:     cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags);
741:     return 0;
742:   }

744:   template <typename D, typename S = D>
745:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false))
746:   {
747:     static_assert(sizeof(D) == sizeof(S), "");
748:     static_assert(!std::is_void<D>::value && !std::is_void<S>::value, "");
749:     const auto size = n * sizeof(D);

751:     if (PetscUnlikely(!n)) return 0;
755:     // do early return after nullptr check since we need to check that they arent both nullptrs
756:     if (PetscUnlikely(dest == src)) return 0;
757:     if (kind == cupmMemcpyHostToHost) {
758:       if (cupmStreamQuery(stream) == cupmSuccess) {
759:         PetscMemcpy(dest, src, size);
760:         return 0;
761:       }
762:       cupmGetLastError();
763:     }
764:     if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
765:       cupmMemcpyAsync(dest, src, size, kind, stream);
766:     } else {
767:       cupmMemcpy(dest, src, size, kind);
768:     }

770:     // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
771:     // (yet) or don't know the direction
772:     if (kind == cupmMemcpyDeviceToHost) {
773:       PetscLogGpuToCpu(size);
774:     } else if (kind == cupmMemcpyHostToDevice) {
775:       PetscLogCpuToGpu(size);
776:     }
777:     return 0;
778:   }

780:   template <typename D, typename S = D>
781:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMemcpy(D *dest, const S *src, std::size_t n, cupmMemcpyKind_t kind))
782:   {
783:     PetscCUPMMemcpyAsync(dest, src, n, kind);
784:     return 0;
785:   }

787:   template <typename M>
788:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false))
789:   {
790:     static_assert(!std::is_void<M>::value, "");

792:     if (PetscLikely(n)) {
793:       const auto bytes = n * sizeof(M);

796:       if (stream || use_async) {
797:         cupmMemsetAsync(ptr, value, bytes, stream);
798:       } else {
799:         cupmMemset(ptr, value, bytes);
800:       }
801:     }
802:     return 0;
803:   }

805:   template <typename M>
806:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n))
807:   {
808:     PetscCUPMMemsetAsync(ptr, value, n);
809:     return 0;
810:   }

  // The wrappers below can be wrapped transparently, so there is no need to give them a Petsc
  // prefix
813:   template <typename M>
814:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmFreeAsync(M &&ptr, cupmStream_t stream = nullptr))
815:   {
816:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");

818:     if (ptr) {
819:       auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);

821:       ptr = nullptr;
822:       if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
823:     }
824:     return cupmSuccess;
825:   }

827:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr)) { return interface_type::cupmFreeAsync(ptr, stream); }

829:   template <typename M>
830:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmFree(M &&ptr))
831:   {
832:     return cupmFreeAsync(std::forward<M>(ptr));
833:   }

835:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmFree(std::nullptr_t ptr)) { return cupmFreeAsync(ptr); }

837:   template <typename M>
838:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmFreeHost(M &&ptr))
839:   {
840:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
841:     const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
842:     ptr             = nullptr;
843:     return cerr;
844:   }

846:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmFreeHost(std::nullptr_t ptr)) { return interface_type::cupmFreeHost(ptr); }

848:   // specific wrapper for device launch function, as the real function is a C routine and
849:   // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
850:   // boils down to the fact that ultimately we pass a
851:   //
852:   // void *args[] = {(void*)&kernel_args...};
853:   //
854:   // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
855:   // happen to the kernel arguments so we must do it ourselves here. This function does this in
856:   // 3 stages:
857:   // 1. Enumerate the kernel arguments (cupmLaunchKernel)
858:   // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
859:   //    expected by func() using the enumeration above (deduceKernelCall)
860:   // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
861:   //    it. (interface_type::cupmLaunchKernel)
862:   template <typename F, typename... Args>
863:   PETSC_CXX_COMPAT_DECL(cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs))
864:   {
865:     return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
866:   }

868:   template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
869:   PETSC_CXX_COMPAT_DECL(PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs))
870:   {
871:     static_assert(block_size > 0, "");
872:     static_assert(warp_size > 0, "");
873:     // want block_size to be a multiple of the warp_size
874:     static_assert(block_size % warp_size == 0, "");
875:     const auto nthread = std::min(n, block_size);
876:     const auto nblock  = (n + block_size - 1) / block_size;

878:     // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
879:     // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
880:     PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
881:     cupmLaunchKernel(std::forward<F>(func), nblock, nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...);
882:     return 0;
883:   }

885: private:
886:   template <typename S, typename D, typename = void>
887:   struct is_static_castable : std::false_type { };

889:   template <typename S, typename D>
890:   struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };

892:   template <typename D, typename S>
893:   static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
894:   {
895:     return static_cast<D>(std::forward<S>(src));
896:   }

898:   template <typename D, typename S>
899:   static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
900:   {
901:     return const_cast<D>(std::forward<S>(src));
902:   }

904:   template <typename F, typename... Args, std::size_t... Idx>
905:   PETSC_CXX_COMPAT_DECL(cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs))
906:   {
907:     // clang-format off
908:     return interface_type::template cupmLaunchKernel(
909:       std::forward<F>(func),
910:       std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
911:       // can't static_cast() here since the function argument type may be cv-qualified, in
912:       // which case we would need to const_cast(). But you can only const_cast()
913:       // indirect types (pointers, references) and I don't want to add a
914:       // static_cast_that_becomes_a_const_cast() SFINAE monster to this template mess. C-style
915:       // casts luckily work here since it tries the following and uses the first one that
916:       // succeeds:
917:       // 1. const_cast()
918:       // 2. static_cast()
919:       // 3. static_cast() then const_cast()
920:       // 4. reinterpret_cast()...
921:       // hopefully we never get to reinterpret_cast() land
922:       //(typename util::func_traits<F>::template arg<Idx>::type)(kernelArgs)...
923:       cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
924:     );
925:     // clang-format on
926:   }
927: };

929:   #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(base_name, T) \
930:     PETSC_CUPM_IMPL_CLASS_HEADER(PetscConcat(base_name, _impl), T); \
931:     using base_name = ::Petsc::device::cupm::impl::Interface<T>; \
932:     using typename base_name::cupmReal_t; \
933:     using typename base_name::cupmScalar_t; \
934:     using base_name::makeCupmScalar; \
935:     using base_name::cupmScalarCast; \
936:     using base_name::cupmRealCast; \
937:     using base_name::PetscCUPMGetMemType; \
938:     using base_name::PetscCUPMMemset; \
939:     using base_name::PetscCUPMMemsetAsync; \
940:     using base_name::PetscCUPMMalloc; \
941:     using base_name::PetscCUPMMallocAsync; \
942:     using base_name::PetscCUPMMallocHost; \
943:     using base_name::PetscCUPMMemcpy; \
944:     using base_name::PetscCUPMMemcpyAsync; \
945:     using base_name::cupmFree; \
946:     using base_name::cupmFreeAsync; \
947:     using base_name::cupmFreeHost; \
948:     using base_name::cupmLaunchKernel; \
949:     using base_name::PetscCUPMLaunchKernel1D; \
950:     using base_name::PetscDeviceCopyModeToCUPMMemcpyKind

952: } // namespace impl

954: } // namespace cupm

956: } // namespace device

958: } // namespace Petsc

960: #endif /* __cplusplus */

962: #endif /* PETSCCUPMINTERFACE_HPP */