Actual source code: cupminterface.hpp
1: #ifndef PETSCCUPMINTERFACE_HPP
2: #define PETSCCUPMINTERFACE_HPP
4: #if defined(__cplusplus)
5: #include <petsc/private/cpputil.hpp>
6: #include <petsc/private/petscadvancedmacros.h>
7: #include <petscdevice_cupm.h>
9: #include <array>
11: namespace Petsc
12: {
14: namespace device
15: {
17: namespace cupm
18: {
// Identifies which vendor runtime a CUPM class is bound to. It is used as the template
// parameter of any class that subclasses the Interface or holds it as a member variable. The
// enumerator values are written out because cupmName() relies on them being exactly 0 and 1.
enum class DeviceType : int {
  CUDA = 0,
  HIP  = 1
};
27: // clang-format off
// String table for DeviceType. The first two entries are indexed by the underlying value of
// the enumerators (see cupmName()).
// NOTE(review): the trailing three entries look like the usual PETSc enum-name-array layout
// (type name, value prefix, null terminator) -- confirm against the consumers of this table.
static constexpr std::array<const char *const, 5> DeviceTypes = {{
  "cuda",                              // DeviceType::CUDA
  "hip",                               // DeviceType::HIP
  "Petsc::Device::CUPM::DeviceType",   // presumably the enum type name
  "Petsc::Device::CUPM::DeviceType::", // presumably the enumerator prefix
  nullptr                              // terminator
}};
35: // clang-format on
37: namespace impl
38: {
// PetscCallCUPM() - backend agnostic error check of a cuda/hip runtime call
//
// input params:
// ... - an expression yielding a cupmError_t
//
// notes:
// The expansion names cupmError_t, cupmSuccess, cupmName(), cupmGetErrorName() and
// cupmGetErrorString() unqualified, so this will only work inside the member functions of a
// class inheriting from CUPM::Interface. Thanks to __VA_ARGS__, templated functions (whose
// template argument lists contain commas) can also be wrapped inline:
//
// PetscCallCUPM(foo<int,char,bool>());
#define PetscCallCUPM(...) \
  do { \
    const cupmError_t cerr_p_ = __VA_ARGS__; \
    PetscCheck(cerr_p_ == cupmSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "%s error %d (%s) : %s", \
               cupmName(), static_cast<PetscErrorCode>(cerr_p_), \
               cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \
  } while (0)
// PetscCallCUPMAbort() - same check as PetscCallCUPM() but reports a failure through
// PetscCheckAbort() on the given communicator
//
// input params:
// comm_ - the communicator handed to PetscCheckAbort()
// ...   - an expression yielding a cupmError_t
#define PetscCallCUPMAbort(comm_, ...) \
  do { \
    const cupmError_t cerr_abort_p_ = __VA_ARGS__; \
    PetscCheckAbort(cerr_abort_p_ == cupmSuccess, comm_, PETSC_ERR_GPU, "%s error %d (%s) : %s", \
                    cupmName(), static_cast<PetscErrorCode>(cerr_abort_p_), \
                    cupmGetErrorName(cerr_abort_p_), cupmGetErrorString(cerr_abort_p_)); \
  } while (0)
// PETSC_CUPM_ALIAS_FUNCTION() - declare a static member function that aliases a cuda/hip
// function
//
// input params:
// alias_name  - the name of the alias
// vendor_name - the name of the function being aliased
//
// notes:
// This simply prepends "static" to the alias name and defers to PETSC_ALIAS_FUNCTION(), see
// that macro for the exact nature of the expansion.
//
// example usage:
// PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) ->
// template <typename... T>
// static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
// {
//   return cudaMalloc(std::forward<T>(args)...);
// }
//
// PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) ->
// template <typename... T>
// static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
// {
//   return hipMalloc(std::forward<T>(args)...);
// }
#define PETSC_CUPM_ALIAS_FUNCTION(alias_name, vendor_name) PETSC_ALIAS_FUNCTION(static alias_name, vendor_name)
// PETSC_CUPM_ALIAS_FUNCTION_GOBBLE() - declare a static member function that aliases a
// cuda/hip function but discards the last N arguments
//
// input params:
// alias_name  - the name of the alias
// vendor_name - the name of the function being aliased
// N           - integer constant [0, INT_MAX) dictating how many arguments to chop off the end
//
// notes:
// This simply prepends "static" to the alias name and defers to
// PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(), see that macro for the exact nature of the
// expansion.
//
// example usage:
// PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1) ->
// template <typename... T, typename Tend>
// static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
// return type deduction*
// {
//   (void)argend;
//   return cudaMalloc(std::forward<T>(args)...);
// }
#define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(alias_name, vendor_name, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static alias_name, vendor_name, N)
104: // Base class that holds functions and variables that don't require CUDA or HIP to be present
105: // on the system
106: template <DeviceType T>
107: struct InterfaceBase {
108: static const DeviceType type = T;
110: PETSC_NODISCARD static constexpr const char *cupmName() noexcept
111: {
112: static_assert(util::to_underlying(DeviceType::CUDA) == 0, "");
113: static_assert(util::to_underlying(DeviceType::HIP) == 1, "");
114: return std::get<util::to_underlying(T)>(DeviceTypes);
115: }
117: PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }
119: PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
120: };
122: // declare the base class static member variables
123: template <DeviceType T>
124: const DeviceType InterfaceBase<T>::type;
// PETSC_CUPM_BASE_CLASS_HEADER() - import the members of InterfaceBase into a class
//
// input params:
// base_name   - the name of the alias declared for InterfaceBase<DEVICE_TYPE>
// DEVICE_TYPE - the DeviceType to instantiate InterfaceBase with
//
// notes:
// The expansion deliberately lacks a trailing semicolon, the user supplies it.
#define PETSC_CUPM_BASE_CLASS_HEADER(base_name, DEVICE_TYPE) \
  using base_name = ::Petsc::device::cupm::impl::InterfaceBase<DEVICE_TYPE>; \
  using base_name::PETSC_MEMTYPE_CUPM; \
  using base_name::PETSC_DEVICE_CUPM; \
  using base_name::cupmName; \
  using base_name::type
133: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
134: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
135: // import them into classes.
136: template <DeviceType>
137: struct InterfaceImpl;
139: #if PetscDefined(HAVE_CUDA)
140: template <>
141: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
142: PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::CUDA);
144: // typedefs
145: using cupmError_t = cudaError_t;
146: using cupmEvent_t = cudaEvent_t;
147: using cupmStream_t = cudaStream_t;
148: using cupmDeviceProp_t = cudaDeviceProp;
149: using cupmMemcpyKind_t = cudaMemcpyKind;
150: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
151: using cupmPointerAttributes_t = cudaPointerAttributes;
152: using cupmMemoryType_t = enum cudaMemoryType;
153: using cupmDim3 = dim3;
154: using cupmHostFn_t = cudaHostFn_t;
155: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
156: using cupmMemPool_t = cudaMemPool_t;
157: using cupmMemPoolAttr = cudaMemPoolAttr;
158: #else
159: using cupmMemPool_t = void *;
160: using cupmMemPoolAttr = unsigned int;
161: #endif
163: // values
164: static const auto cupmSuccess = cudaSuccess;
165: static const auto cupmErrorNotReady = cudaErrorNotReady;
166: static const auto cupmErrorDeviceAlreadyInUse = cudaErrorDeviceAlreadyInUse;
167: static const auto cupmErrorSetOnActiveProcess = cudaErrorSetOnActiveProcess;
168: static const auto cupmErrorStubLibrary =
169: #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
170: cudaErrorStubLibrary;
171: #else
172: cudaErrorInsufficientDriver;
173: #endif
175: static const auto cupmErrorNoDevice = cudaErrorNoDevice;
176: static const auto cupmStreamDefault = cudaStreamDefault;
177: static const auto cupmStreamNonBlocking = cudaStreamNonBlocking;
178: static const auto cupmDeviceMapHost = cudaDeviceMapHost;
179: static const auto cupmMemcpyHostToDevice = cudaMemcpyHostToDevice;
180: static const auto cupmMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
181: static const auto cupmMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
182: static const auto cupmMemcpyHostToHost = cudaMemcpyHostToHost;
183: static const auto cupmMemcpyDefault = cudaMemcpyDefault;
184: static const auto cupmMemoryTypeHost = cudaMemoryTypeHost;
185: static const auto cupmMemoryTypeDevice = cudaMemoryTypeDevice;
186: static const auto cupmMemoryTypeManaged = cudaMemoryTypeManaged;
187: static const auto cupmEventDisableTiming = cudaEventDisableTiming;
188: static const auto cupmHostAllocDefault = cudaHostAllocDefault;
189: static const auto cupmHostAllocWriteCombined = cudaHostAllocWriteCombined;
190: static const auto cupmMemPoolAttrReleaseThreshold =
191: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
192: cudaMemPoolAttrReleaseThreshold;
193: #else
194: cupmMemPoolAttr{0};
195: #endif
197: // error functions
198: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, cudaGetErrorName)
199: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, cudaGetErrorString)
200: PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, cudaGetLastError)
202: // device management
203: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, cudaGetDeviceCount)
204: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, cudaGetDeviceProperties)
205: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, cudaGetDevice)
206: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, cudaSetDevice)
207: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, cudaGetDeviceFlags)
208: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, cudaSetDeviceFlags)
209: PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, cudaPointerGetAttributes)
210: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
211: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, cudaDeviceGetMemPool)
212: PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, cudaMemPoolSetAttribute)
213: #else
214: PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
215: {
216: *pool = nullptr;
217: return cupmSuccess;
218: }
220: PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
221: #endif
222: // CUDA has no cudaInit() to match hipInit()
223: PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }
225: // stream management
226: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, cudaEventCreate)
227: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, cudaEventCreateWithFlags)
228: PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, cudaEventDestroy)
229: PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, cudaEventRecord)
230: PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, cudaEventSynchronize)
231: PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, cudaEventElapsedTime)
232: PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, cudaEventQuery)
233: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, cudaStreamCreate)
234: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, cudaStreamCreateWithFlags)
235: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, cudaStreamGetFlags)
236: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, cudaStreamDestroy)
237: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, cudaStreamWaitEvent)
238: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, cudaStreamQuery)
239: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, cudaStreamSynchronize)
240: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, cudaDeviceSynchronize)
241: PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, cudaGetSymbolAddress)
243: // memory management
244: PETSC_CUPM_ALIAS_FUNCTION(cupmFree, cudaFree)
245: PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc)
246: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
247: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, cudaFreeAsync)
248: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, cudaMallocAsync)
249: #else
250: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, cudaFree, 1)
251: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1)
252: #endif
253: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, cudaMemcpy)
254: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, cudaMemcpyAsync)
255: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, cudaMallocHost)
256: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, cudaFreeHost)
257: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, cudaMemset)
258: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
259: PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, cudaMemsetAsync)
260: #else
261: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemsetAsync, cudaMemset, 1)
262: #endif
264: // launch control
265: PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, cudaLaunchHostFunc)
266: template <typename FunctionT, typename... KernelArgsT>
267: PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
268: {
269: static_assert(!std::is_pointer<FunctionT>::value, "kernel function must not be passed by pointer");
271: void *args[] = {(void *)&kernelArgs...};
272: return cudaLaunchKernel<util::remove_reference_t<FunctionT>>(std::addressof(func), std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
273: }
274: };
275: #endif // PetscDefined(HAVE_CUDA)
277: #if PetscDefined(HAVE_HIP)
278: template <>
279: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
280: PETSC_CUPM_BASE_CLASS_HEADER(base_type, DeviceType::HIP);
282: // typedefs
283: using cupmError_t = hipError_t;
284: using cupmEvent_t = hipEvent_t;
285: using cupmStream_t = hipStream_t;
286: using cupmDeviceProp_t = hipDeviceProp_t;
287: using cupmMemcpyKind_t = hipMemcpyKind;
288: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
289: using cupmPointerAttributes_t = hipPointerAttribute_t;
290: using cupmMemoryType_t = enum hipMemoryType;
291: using cupmDim3 = dim3;
292: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
293: using cupmHostFn_t = hipHostFn_t;
294: using cupmMemPool_t = hipMemPool_t;
295: using cupmMemPoolAttr = hipMemPoolAttr;
296: #else
297: using cupmHostFn_t = void (*)(void *);
298: using cupmMemPool_t = void *;
299: using cupmMemPoolAttr = unsigned int;
300: #endif
302: // values
303: static const auto cupmSuccess = hipSuccess;
304: static const auto cupmErrorNotReady = hipErrorNotReady;
305: // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
306: static const auto cupmErrorDeviceAlreadyInUse = hipErrorContextAlreadyInUse;
307: static const auto cupmErrorSetOnActiveProcess = hipErrorSetOnActiveProcess;
308: // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
309: static const auto cupmErrorStubLibrary = hipErrorInsufficientDriver;
310: static const auto cupmErrorNoDevice = hipErrorNoDevice;
311: static const auto cupmStreamDefault = hipStreamDefault;
312: static const auto cupmStreamNonBlocking = hipStreamNonBlocking;
313: static const auto cupmDeviceMapHost = hipDeviceMapHost;
314: static const auto cupmMemcpyHostToDevice = hipMemcpyHostToDevice;
315: static const auto cupmMemcpyDeviceToHost = hipMemcpyDeviceToHost;
316: static const auto cupmMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
317: static const auto cupmMemcpyHostToHost = hipMemcpyHostToHost;
318: static const auto cupmMemcpyDefault = hipMemcpyDefault;
319: static const auto cupmMemoryTypeHost = hipMemoryTypeHost;
320: static const auto cupmMemoryTypeDevice = hipMemoryTypeDevice;
321: // see
322: // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
323: static const auto cupmMemoryTypeManaged = hipMemoryTypeUnified;
324: static const auto cupmEventDisableTiming = hipEventDisableTiming;
325: static const auto cupmHostAllocDefault = hipHostMallocDefault;
326: static const auto cupmHostAllocWriteCombined = hipHostMallocWriteCombined;
327: static const auto cupmMemPoolAttrReleaseThreshold =
328: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
329: hipMemPoolAttrReleaseThreshold;
330: #else
331: cupmMemPoolAttr{0};
332: #endif
334: // error functions
335: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, hipGetErrorName)
336: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, hipGetErrorString)
337: PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, hipGetLastError)
339: // device management
340: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, hipGetDeviceCount)
341: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, hipGetDeviceProperties)
342: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, hipGetDevice)
343: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, hipSetDevice)
344: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, hipGetDeviceFlags)
345: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, hipSetDeviceFlags)
346: PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, hipPointerGetAttributes)
347: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
348: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, hipDeviceGetMemPool)
349: PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, hipMemPoolSetAttribute)
350: #else
351: PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
352: {
353: *pool = nullptr;
354: return cupmSuccess;
355: }
357: PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
358: #endif
359: PETSC_CUPM_ALIAS_FUNCTION(cupmInit, hipInit)
361: // stream management
362: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, hipEventCreate)
363: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, hipEventCreateWithFlags)
364: PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, hipEventDestroy)
365: PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, hipEventRecord)
366: PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, hipEventSynchronize)
367: PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, hipEventElapsedTime)
368: PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, hipEventQuery)
369: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, hipStreamCreate)
370: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, hipStreamCreateWithFlags)
371: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, hipStreamGetFlags)
372: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, hipStreamDestroy)
373: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, hipStreamWaitEvent)
374: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, hipStreamQuery)
375: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, hipStreamSynchronize)
376: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, hipDeviceSynchronize)
377: PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, hipGetSymbolAddress)
379: // memory management
380: PETSC_CUPM_ALIAS_FUNCTION(cupmFree, hipFree)
381: PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc)
382: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
383: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, hipMallocAsync)
384: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, hipFreeAsync)
385: #else
386: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, hipMalloc, 1)
387: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, hipFree, 1)
388: #endif
389: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, hipMemcpy)
390: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, hipMemcpyAsync)
391: // hipMallocHost is deprecated
392: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, hipHostMalloc)
393: // hipFreeHost is deprecated
394: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, hipHostFree)
395: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, hipMemset)
396: PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, hipMemsetAsync)
398: // launch control
399: // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
400: // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
401: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
402: PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, hipLaunchHostFunc)
403: #else
404: PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
405: {
406: // the only correct way to spoof this function is to do it synchronously...
407: auto herr = hipStreamSynchronize(stream);
408: if (PetscUnlikely(herr != hipSuccess)) return herr;
409: fn(ctx);
410: return herr;
411: }
412: #endif
414: template <typename FunctionT, typename... KernelArgsT>
415: PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
416: {
417: void *args[] = {(void *)&kernelArgs...};
418: return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
419: }
420: };
421: #endif // PetscDefined(HAVE_HIP)
// PETSC_CUPM_IMPL_CLASS_HEADER() - shorthand for bringing all of the typedefs, values and
// functions from the InterfaceImpl class into your own, it's annoying that c++ doesn't have
// a way to do this automatically
//
// input params:
// base_name - the name of the alias declared for InterfaceImpl<T>; the alias for
//             InterfaceBase<T> is named PetscConcat(base_, base_name)
// T         - the DeviceType to instantiate the interface with
//
// notes:
// cupmFree, cupmFreeAsync, cupmFreeHost and cupmLaunchKernel are not imported. (The
// Interface struct further down declares members with those names itself.)
//
// The expansion deliberately lacks a trailing semicolon, the user supplies it.
#define PETSC_CUPM_IMPL_CLASS_HEADER(base_name, T) \
  PETSC_CUPM_BASE_CLASS_HEADER(PetscConcat(base_, base_name), T); \
  using base_name = ::Petsc::device::cupm::impl::InterfaceImpl<T>; \
  /* types */ \
  using cupmComplex_t           = typename base_name::cupmComplex_t; \
  using cupmError_t             = typename base_name::cupmError_t; \
  using cupmEvent_t             = typename base_name::cupmEvent_t; \
  using cupmStream_t            = typename base_name::cupmStream_t; \
  using cupmDeviceProp_t        = typename base_name::cupmDeviceProp_t; \
  using cupmMemcpyKind_t        = typename base_name::cupmMemcpyKind_t; \
  using cupmPointerAttributes_t = typename base_name::cupmPointerAttributes_t; \
  using cupmMemoryType_t        = typename base_name::cupmMemoryType_t; \
  using cupmDim3                = typename base_name::cupmDim3; \
  using cupmHostFn_t            = typename base_name::cupmHostFn_t; \
  using cupmMemPool_t           = typename base_name::cupmMemPool_t; \
  using cupmMemPoolAttr         = typename base_name::cupmMemPoolAttr; \
  /* variables */ \
  using base_name::cupmSuccess; \
  using base_name::cupmErrorNotReady; \
  using base_name::cupmErrorDeviceAlreadyInUse; \
  using base_name::cupmErrorSetOnActiveProcess; \
  using base_name::cupmErrorStubLibrary; \
  using base_name::cupmErrorNoDevice; \
  using base_name::cupmStreamDefault; \
  using base_name::cupmStreamNonBlocking; \
  using base_name::cupmDeviceMapHost; \
  using base_name::cupmMemcpyHostToDevice; \
  using base_name::cupmMemcpyDeviceToHost; \
  using base_name::cupmMemcpyDeviceToDevice; \
  using base_name::cupmMemcpyHostToHost; \
  using base_name::cupmMemcpyDefault; \
  using base_name::cupmMemoryTypeHost; \
  using base_name::cupmMemoryTypeDevice; \
  using base_name::cupmMemoryTypeManaged; \
  using base_name::cupmEventDisableTiming; \
  using base_name::cupmHostAllocDefault; \
  using base_name::cupmHostAllocWriteCombined; \
  using base_name::cupmMemPoolAttrReleaseThreshold; \
  /* functions */ \
  using base_name::cupmGetErrorName; \
  using base_name::cupmGetErrorString; \
  using base_name::cupmGetLastError; \
  using base_name::cupmGetDeviceCount; \
  using base_name::cupmGetDeviceProperties; \
  using base_name::cupmGetDevice; \
  using base_name::cupmSetDevice; \
  using base_name::cupmGetDeviceFlags; \
  using base_name::cupmSetDeviceFlags; \
  using base_name::cupmPointerGetAttributes; \
  using base_name::cupmDeviceGetMemPool; \
  using base_name::cupmMemPoolSetAttribute; \
  using base_name::cupmInit; \
  using base_name::cupmEventCreate; \
  using base_name::cupmEventCreateWithFlags; \
  using base_name::cupmEventDestroy; \
  using base_name::cupmEventRecord; \
  using base_name::cupmEventSynchronize; \
  using base_name::cupmEventElapsedTime; \
  using base_name::cupmEventQuery; \
  using base_name::cupmStreamCreate; \
  using base_name::cupmStreamCreateWithFlags; \
  using base_name::cupmStreamGetFlags; \
  using base_name::cupmStreamDestroy; \
  using base_name::cupmStreamWaitEvent; \
  using base_name::cupmStreamQuery; \
  using base_name::cupmStreamSynchronize; \
  using base_name::cupmDeviceSynchronize; \
  using base_name::cupmGetSymbolAddress; \
  using base_name::cupmMalloc; \
  using base_name::cupmMallocAsync; \
  using base_name::cupmMemcpy; \
  using base_name::cupmMemcpyAsync; \
  using base_name::cupmMallocHost; \
  using base_name::cupmMemset; \
  using base_name::cupmMemsetAsync; \
  using base_name::cupmLaunchHostFunc
501: template <DeviceType>
502: struct Interface;
504: // The actual interface class
505: template <DeviceType T>
506: struct Interface : InterfaceImpl<T> {
507: PETSC_CUPM_IMPL_CLASS_HEADER(interface_type, T);
509: using cupmReal_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
510: using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;
512: PETSC_NODISCARD static constexpr cupmScalar_t cupmScalarCast(PetscScalar s) noexcept
513: {
514: #if PetscDefined(USE_COMPLEX)
515: return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
516: #else
517: return static_cast<cupmScalar_t>(s);
518: #endif
519: }
521: PETSC_NODISCARD static constexpr const cupmScalar_t *cupmScalarPtrCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }
523: PETSC_NODISCARD static constexpr cupmScalar_t *cupmScalarPtrCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }
525: PETSC_NODISCARD static constexpr const cupmReal_t *cupmRealPtrCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }
527: PETSC_NODISCARD static constexpr cupmReal_t *cupmRealPtrCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }
529: #if !defined(PETSC_PKG_CUDA_VERSION_GE)
530: #define PETSC_PKG_CUDA_VERSION_GE(...) 0
531: #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
532: #endif
533: static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
534: {
535: cupmPointerAttributes_t attr;
536: cupmError_t cerr;
538: PetscFunctionBegin;
540: if (registered) {
542: *registered = PETSC_FALSE;
543: }
544: if (managed) {
546: *managed = PETSC_FALSE;
547: }
548: // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
549: // a host pointer returns cudaErrorInvalidValue
550: cerr = cupmPointerGetAttributes(&attr, data);
551: cerr = cupmGetLastError();
552: // HIP seems to always have used memoryType though
553: #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || defined(__HIP_PLATFORM_HCC__)
554: const auto mtype = attr.memoryType;
555: if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
556: #else
557: if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) PetscCallCUPM(cerr);
558: const auto mtype = attr.type;
559: if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
560: #endif // CUDART_VERSION && CUDART_VERSION < 10000 || __HIP_PLATFORM_HCC__
561: if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
562: if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
563: PetscFunctionReturn(PETSC_SUCCESS);
564: }
565: #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
566: #undef PETSC_PKG_CUDA_VERSION_GE
567: #endif
569: PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
570: {
571: switch (mode) {
572: case PETSC_DEVICE_COPY_HTOH:
573: return cupmMemcpyHostToHost;
574: case PETSC_DEVICE_COPY_HTOD:
575: return cupmMemcpyHostToDevice;
576: case PETSC_DEVICE_COPY_DTOD:
577: return cupmMemcpyDeviceToDevice;
578: case PETSC_DEVICE_COPY_DTOH:
579: return cupmMemcpyDeviceToHost;
580: case PETSC_DEVICE_COPY_AUTO:
581: return cupmMemcpyDefault;
582: }
583: PetscUnreachable();
584: return cupmMemcpyDefault;
585: }
587: // these change what the arguments mean, so need to namespace these
588: template <typename M>
589: static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
590: {
591: static_assert(!std::is_void<M>::value, "");
593: PetscFunctionBegin;
595: *ptr = nullptr;
596: if (n) {
597: const auto bytes = n * sizeof(M);
598: // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/
599: //
600: // TLD;DR: cudaMallocAsync() does not work with NVIDIA GPUDirect which OPENMPI uses to
601: // underpin its cuda-aware MPI implementation, so we cannot just async allocate
602: // blindly...
603: if (stream) {
604: PetscCallCUPM(cupmMallocAsync(reinterpret_cast<void **>(ptr), bytes, stream));
605: } else {
606: PetscCallCUPM(cupmMalloc(reinterpret_cast<void **>(ptr), bytes));
607: }
608: }
609: PetscFunctionReturn(PETSC_SUCCESS);
610: }
612: template <typename M>
613: static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
614: {
615: PetscFunctionBegin;
616: PetscCall(PetscCUPMMallocAsync(ptr, n));
617: PetscFunctionReturn(PETSC_SUCCESS);
618: }
620: template <typename M>
621: static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
622: {
623: static_assert(!std::is_void<M>::value, "");
625: PetscFunctionBegin;
627: *ptr = nullptr;
628: if (n) PetscCallCUPM(cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags));
629: PetscFunctionReturn(PETSC_SUCCESS);
630: }
632: template <typename D>
633: static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
634: {
635: static_assert(!std::is_void<D>::value, "");
636: const auto size = n * sizeof(D);
638: PetscFunctionBegin;
639: if (PetscUnlikely(!n)) PetscFunctionReturn(PETSC_SUCCESS);
641: PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
642: PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
643: // do early return after nullptr check since we need to check that they are not both nullptrs
644: if (PetscUnlikely(dest == src)) PetscFunctionReturn(PETSC_SUCCESS);
645: if (kind == cupmMemcpyHostToHost) {
646: // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
647: // than it is to just call the vendor functions. This assumes of course that the stream
648: // accounts for both memory regions being "idle"
649: if (cupmStreamQuery(stream) == cupmSuccess) {
650: PetscCall(PetscMemcpy(dest, src, size));
651: PetscFunctionReturn(PETSC_SUCCESS);
652: }
653: // need to clear the potential cupmErrorNotReady generated by query above...
654: auto cerr = cupmGetLastError();
656: if (PetscUnlikely(cerr != cupmErrorNotReady)) PetscCallCUPM(cerr);
657: }
658: if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
659: PetscCallCUPM(cupmMemcpyAsync(dest, src, size, kind, stream));
660: } else {
661: PetscCallCUPM(cupmMemcpy(dest, src, size, kind));
662: }
664: // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
665: // (yet) or don't know the direction
666: if (kind == cupmMemcpyDeviceToHost) {
667: PetscCall(PetscLogGpuToCpu(size));
668: } else if (kind == cupmMemcpyHostToDevice) {
669: PetscCall(PetscLogCpuToGpu(size));
670: }
671: PetscFunctionReturn(PETSC_SUCCESS);
672: }
674: template <typename D>
675: static PetscErrorCode PetscCUPMMemcpy(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
676: {
677: PetscFunctionBegin;
678: PetscCall(PetscCUPMMemcpyAsync(dest, src, n, kind));
679: PetscFunctionReturn(PETSC_SUCCESS);
680: }
682: template <typename M>
683: static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
684: {
685: static_assert(!std::is_void<M>::value, "");
687: PetscFunctionBegin;
688: if (PetscLikely(n)) {
689: const auto bytes = n * sizeof(M);
691: PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", n);
692: if (stream || use_async) {
693: PetscCallCUPM(cupmMemsetAsync(ptr, value, bytes, stream));
694: } else {
695: PetscCallCUPM(cupmMemset(ptr, value, bytes));
696: }
697: }
698: PetscFunctionReturn(PETSC_SUCCESS);
699: }
701: template <typename M>
702: static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
703: {
704: PetscFunctionBegin;
705: PetscCall(PetscCUPMMemsetAsync(ptr, value, n));
706: PetscFunctionReturn(PETSC_SUCCESS);
707: }
709: // these we can transparently wrap, no need to namespace it to Petsc
710: template <typename M>
711: PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &ptr, cupmStream_t stream = nullptr) noexcept
712: {
713: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
714: static_assert(!std::is_const<M>::value, "");
716: if (ptr) {
717: auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);
719: ptr = nullptr;
720: if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
721: }
722: return cupmSuccess;
723: }
725: PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) { return interface_type::cupmFreeAsync(ptr, stream); }
727: template <typename M>
728: PETSC_NODISCARD static cupmError_t cupmFree(M &ptr) noexcept
729: {
730: return cupmFreeAsync(ptr);
731: }
733: PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) { return cupmFreeAsync(ptr); }
735: template <typename M>
736: PETSC_NODISCARD static cupmError_t cupmFreeHost(M &ptr) noexcept
737: {
738: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
739: const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
740: ptr = nullptr;
741: return cerr;
742: }
744: PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) { return interface_type::cupmFreeHost(ptr); }
746: // specific wrapper for device launch function, as the real function is a C routine and
747: // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
748: // boils down to the fact that ultimately we pass a
749: //
750: // void *args[] = {(void*)&kernel_args...};
751: //
752: // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
753: // happen to the kernel arguments so we must do it ourselves here. This function does this in
754: // 3 stages:
755: // 1. Enumerate the kernel arguments (cupmLaunchKernel)
756: // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
757: // expected by func() using the enumeration above (deduceKernelCall)
758: // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
759: // it. (interface_type::cupmLaunchKernel)
760: template <typename F, typename... Args>
761: PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
762: {
763: return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
764: }
766: template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
767: static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
768: {
769: static_assert(block_size > 0, "");
770: static_assert(warp_size > 0, "");
771: // want block_size to be a multiple of the warp_size
772: static_assert(block_size % warp_size == 0, "");
773: const auto nthread = std::min(n, block_size);
774: const auto nblock = (n + block_size - 1) / block_size;
776: PetscFunctionBegin;
777: // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
778: // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
779: PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
780: PetscCallCUPM(cupmLaunchKernel(std::forward<F>(func), nblock, nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...));
781: PetscFunctionReturn(PETSC_SUCCESS);
782: }
784: private:
785: template <typename S, typename D, typename = void>
786: struct is_static_castable : std::false_type { };
788: template <typename S, typename D>
789: struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };
791: template <typename D, typename S>
792: static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
793: {
794: return static_cast<D>(std::forward<S>(src));
795: }
797: template <typename D, typename S>
798: static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
799: {
800: return const_cast<D>(std::forward<S>(src));
801: }
803: template <typename F, typename... Args, std::size_t... Idx>
804: PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
805: {
806: // clang-format off
807: return interface_type::template cupmLaunchKernel(
808: std::forward<F>(func),
809: std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
810: // can't static_cast() here since the function argument type may be cv-qualified, in
811: // which case we would need to const_cast(). But you can only const_cast() indirect types
812: // (pointers, references). So we need a SFINAE monster that is a static_cast() if
813: // possible, and a const_cast() if not. We could just use a C-style cast which *would*
814: // work here since it tries the following and uses the first one that succeeds:
815: //
816: // 1. const_cast()
817: // 2. static_cast()
818: // 3. static_cast() then const_cast()
819: // 4. reinterpret_cast()...
820: //
821: // the issue however is the final reinterpret_cast(). We absolutely cannot get there
822: // because doing so would silently hide a ton of bugs, for example casting a PetscScalar
823: // * to double * in complex builds, a PetscInt * to int * in 64idx builds, etc.
824: cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
825: );
826: // clang-format on
827: }
828: };
// PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING() - make the Interface<T> API visible in a class
//
// input params:
// base_name - the name under which ::Petsc::device::cupm::impl::Interface<T> is aliased
// T         - the template argument passed to Interface
//
// notes:
// First expands PETSC_CUPM_IMPL_CLASS_HEADER() with PetscConcat(base_name, _impl) and T, then
// declares the base_name alias, the cupmReal_t/cupmScalar_t type aliases and a
// using-declaration for each listed member of base_name. The using-declarations are
// independent of each other and are grouped by purpose.
//
// The last using-declaration intentionally has no trailing semicolon, it is supplied at the
// point of use. Do not place // comments inside the macro body, the line continuations would
// splice the following line into the comment.
#define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(base_name, T) \
  PETSC_CUPM_IMPL_CLASS_HEADER(PetscConcat(base_name, _impl), T); \
  using base_name    = ::Petsc::device::cupm::impl::Interface<T>; \
  using cupmReal_t   = typename base_name::cupmReal_t; \
  using cupmScalar_t = typename base_name::cupmScalar_t; \
  using base_name::cupmScalarCast; \
  using base_name::cupmScalarPtrCast; \
  using base_name::cupmRealPtrCast; \
  using base_name::PetscCUPMGetMemType; \
  using base_name::PetscDeviceCopyModeToCUPMMemcpyKind; \
  using base_name::PetscCUPMMalloc; \
  using base_name::PetscCUPMMallocAsync; \
  using base_name::PetscCUPMMallocHost; \
  using base_name::PetscCUPMMemcpy; \
  using base_name::PetscCUPMMemcpyAsync; \
  using base_name::PetscCUPMMemset; \
  using base_name::PetscCUPMMemsetAsync; \
  using base_name::cupmFree; \
  using base_name::cupmFreeAsync; \
  using base_name::cupmFreeHost; \
  using base_name::cupmLaunchKernel; \
  using base_name::PetscCUPMLaunchKernel1D
853: } // namespace impl
855: } // namespace cupm
857: } // namespace device
859: } // namespace Petsc
861: #endif /* __cplusplus */
863: #endif /* PETSCCUPMINTERFACE_HPP */