Actual source code: veccupmimpl.h
1: #ifndef PETSCVECCUPMIMPL_H
2: #define PETSCVECCUPMIMPL_H
4: #include <petsc/private/vecimpl.h>
5: #include <../src/vec/vec/impls/dvecimpl.h>
// NVSHMEM support. The routines below are only declared when PETSc is configured with
// NVSHMEM; otherwise only a stub for PetscNvshmemFree() is provided so that callers do not
// need to guard their calls with the preprocessor.
#if PetscDefined(HAVE_NVSHMEM)
PETSC_INTERN PetscErrorCode PetscNvshmemInitializeCheck(void);
PETSC_INTERN PetscErrorCode PetscNvshmemMalloc(size_t, void **);
PETSC_INTERN PetscErrorCode PetscNvshmemCalloc(size_t, void **);
PETSC_INTERN PetscErrorCode PetscNvshmemFree_Private(void *);
// Calls PetscNvshmemFree_Private(ptr) if ptr is non-NULL and, only if that call returns zero,
// sets ptr to PETSC_NULLPTR (so ptr must be a modifiable lvalue). A NULL ptr is a no-op.
//
// Note that the result is the value of a logical expression converted to PetscErrorCode: it is
// zero on success and 1 on failure, i.e. the specific nonzero code returned by
// PetscNvshmemFree_Private() is not propagated.
#define PetscNvshmemFree(ptr) ((PetscErrorCode)((ptr) && (PetscNvshmemFree_Private(ptr) || ((ptr) = PETSC_NULLPTR, PETSC_SUCCESS))))
PETSC_INTERN PetscErrorCode PetscNvshmemSum(PetscInt, PetscScalar *, const PetscScalar *);
PETSC_INTERN PetscErrorCode PetscNvshmemMax(PetscInt, PetscReal *, const PetscReal *);
PETSC_INTERN PetscErrorCode VecNormAsync_NVSHMEM(Vec, NormType, PetscReal *);
PETSC_INTERN PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec);
#else
// Stub used without NVSHMEM: expands to PETSC_SUCCESS and does not evaluate or modify ptr
#define PetscNvshmemFree(ptr) PETSC_SUCCESS
#endif
21: #if defined(__cplusplus) && PetscDefined(HAVE_DEVICE)
22: #include <petsc/private/deviceimpl.h>
23: #include <petsc/private/cupmblasinterface.hpp>
25: #include <petsc/private/cpp/functional.hpp>
27: #include <limits> // std::numeric_limits
28: #include <cstring> // std::memset
30: namespace Petsc
31: {
33: namespace vec
34: {
36: namespace cupm
37: {
39: namespace impl
40: {
42: namespace
43: {
45: // ==========================================================================================
46: // UseCUPMHostAlloc_
47: //
48: // A simple RAII helper for PetscMallocSet[CUDA|HIP]Host(). it exists because integrating the
49: // regular versions would be an enormous pain to square with the templated types...
50: // ==========================================================================================
51: template <device::cupm::DeviceType T>
52: class UseCUPMHostAlloc_ : device::cupm::impl::Interface<T> {
53: public:
54: PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(interface_type, T);
56: UseCUPMHostAlloc_(bool) noexcept;
57: ~UseCUPMHostAlloc_() noexcept;
59: PETSC_NODISCARD bool value() const noexcept;
61: private:
62: // would have loved to just do
63: //
64: // const auto oldmalloc = PetscTrMalloc;
65: //
66: // but in order to use auto the member needs to be static; in order to be static it must
67: // also be constexpr -- which in turn requires an initializer (also implicitly required by
68: // auto). But constexpr needs a constant expression initializer, so we can't initialize it
69: // with global (mutable) variables...
70: #define DECLTYPE_AUTO(left, right) decltype(right) left = right
71: const DECLTYPE_AUTO(oldmalloc_, PetscTrMalloc);
72: const DECLTYPE_AUTO(oldfree_, PetscTrFree);
73: const DECLTYPE_AUTO(oldrealloc_, PetscTrRealloc);
74: #undef DECLTYPE_AUTO
75: bool v_;
76: };
78: template <device::cupm::DeviceType T>
79: inline UseCUPMHostAlloc_<T>::UseCUPMHostAlloc_(bool useit) noexcept : v_(useit)
80: {
81: PetscFunctionBegin;
82: if (useit) {
83: // all unused arguments are un-named, this saves having to add PETSC_UNUSED to them all
84: PetscTrMalloc = [](std::size_t sz, PetscBool clear, int, const char *, const char *, void **ptr) {
85: PetscFunctionBegin;
86: PetscCallCUPM(cupmMallocHost(ptr, sz));
87: if (clear) std::memset(*ptr, 0, sz);
88: PetscFunctionReturn(PETSC_SUCCESS);
89: };
90: PetscTrFree = [](void *ptr, int, const char *, const char *) {
91: PetscFunctionBegin;
92: PetscCallCUPM(cupmFreeHost(ptr));
93: PetscFunctionReturn(PETSC_SUCCESS);
94: };
95: PetscTrRealloc = [](std::size_t, int, const char *, const char *, void **) {
96: // REVIEW ME: can be implemented by malloc->copy->free?
97: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "%s has no realloc()", cupmName());
98: };
99: }
100: PetscFunctionReturnVoid();
101: }
103: template <device::cupm::DeviceType T>
104: inline bool UseCUPMHostAlloc_<T>::value() const noexcept
105: {
106: return v_;
107: }
109: template <device::cupm::DeviceType T>
110: inline UseCUPMHostAlloc_<T>::~UseCUPMHostAlloc_() noexcept
111: {
112: PetscFunctionBegin;
113: if (value()) {
114: PetscTrMalloc = oldmalloc_;
115: PetscTrFree = oldfree_;
116: PetscTrRealloc = oldrealloc_;
117: }
118: PetscFunctionReturnVoid();
119: }
121: struct no_op {
122: template <typename... T>
123: constexpr PetscErrorCode operator()(T &&...) const noexcept
124: {
125: return PETSC_SUCCESS;
126: }
127: };
129: template <typename T>
130: struct CooPair {
131: using value_type = T;
132: using size_type = PetscCount;
134: value_type *&device;
135: value_type *&host;
136: size_type size;
137: };
139: template <typename U>
140: static constexpr CooPair<U> make_coo_pair(U *&device, U *&host, PetscCount size) noexcept
141: {
142: return {device, host, size};
143: }
145: } // anonymous namespace
147: // forward declarations
148: template <device::cupm::DeviceType>
149: class VecSeq_CUPM;
150: template <device::cupm::DeviceType>
151: class VecMPI_CUPM;
153: // ==========================================================================================
154: // Vec_CUPMBase
155: //
156: // Base class for the VecSeq and VecMPI CUPM implementations. On top of the usual DeviceType
157: // template parameter it also uses CRTP to be able to use values/calls specific to either
158: // VecSeq or VecMPI. This is in effect "inside-out" polymorphism.
159: // ==========================================================================================
160: template <device::cupm::DeviceType T, typename Derived>
161: class Vec_CUPMBase : device::cupm::impl::BlasInterface<T> {
162: public:
163: PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, T);
164: // ==========================================================================================
165: // Vec_CUPMBase::vector_array
166: //
167: // RAII versions of the get/restore array routines. Determines constness of the pointer type,
// holds the pointer itself and provides the implicit conversion operator
169: // ==========================================================================================
170: template <PetscMemType, PetscMemoryAccessMode>
171: class vector_array;
173: private:
174: // A debug check to ensure that a given pointer-memtype pairing taken from user-land is
175: // actually correct. Errors on mismatch
176: static PetscErrorCode CheckPointerMatchesMemType_(const void *ptr, PetscMemType mtype) noexcept
177: {
178: PetscFunctionBegin;
179: if (PetscDefined(USE_DEBUG) && ptr) {
180: PetscMemType ptr_mtype;
182: PetscCall(PetscCUPMGetMemType(ptr, &ptr_mtype));
183: if (mtype == PETSC_MEMTYPE_HOST) {
184: PetscCheck(PetscMemTypeHost(ptr_mtype), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
185: } else if (mtype == PETSC_MEMTYPE_DEVICE) {
186: // generic "device" memory should only care if the actual memtype is also generically
187: // "device"
188: PetscCheck(PetscMemTypeDevice(ptr_mtype), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
189: } else {
190: PetscCheck(mtype == ptr_mtype, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
191: }
192: }
193: PetscFunctionReturn(PETSC_SUCCESS);
194: }
// The final stop in the GetHandles_/GetHandlesFrom_ chain. This retrieves the various
// compute handles and ensures the given PetscDeviceContext is of the right type
198: static PetscErrorCode GetFromHandleDispatch_(PetscDeviceContext, cupmBlasHandle_t *, cupmStream_t *) noexcept;
199: static PetscErrorCode GetHandleDispatch_(PetscDeviceContext *, cupmBlasHandle_t *, cupmStream_t *) noexcept;
201: protected:
202: static PetscErrorCode VecView_Debug(Vec v, const char *message = "") noexcept
203: {
204: const auto pobj = PetscObjectCast(v);
205: const auto vimpl = VecIMPLCast(v);
206: const auto vcu = VecCUPMCast(v);
207: PetscMemType mtype;
208: MPI_Comm comm;
210: PetscFunctionBegin;
213: PetscCall(PetscObjectGetComm(pobj, &comm));
214: PetscCall(PetscPrintf(comm, "---------- %s ----------\n", message));
215: PetscCall(PetscObjectPrintClassNamePrefixType(pobj, PETSC_VIEWER_STDOUT_(comm)));
216: PetscCall(PetscPrintf(comm, "Address: %p\n", v));
217: PetscCall(PetscPrintf(comm, "Size: %" PetscInt_FMT "\n", v->map->n));
218: PetscCall(PetscPrintf(comm, "Offload mask: %s\n", PetscOffloadMaskToString(v->offloadmask)));
219: PetscCall(PetscPrintf(comm, "Host ptr: %p\n", vimpl->array));
220: PetscCall(PetscPrintf(comm, "Device ptr: %p\n", vcu->array_d));
221: PetscCall(PetscPrintf(comm, "Device alloced ptr: %p\n", vcu->array_allocated_d));
222: PetscCall(PetscCUPMGetMemType(vcu->array_d, &mtype));
223: PetscCall(PetscPrintf(comm, "dptr is device mem? %s\n", PetscBools[static_cast<PetscBool>(PetscMemTypeDevice(mtype))]));
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
227: // Helper routines to retrieve various combinations of handles. The first set (GetHandles_)
228: // gets a PetscDeviceContext along with it, while the second set (GetHandlesFrom_) assumes
229: // you've gotten the PetscDeviceContext already, and retrieves the handles from it. All of
230: // them check that the PetscDeviceContext is of the appropriate type
231: static PetscErrorCode GetHandles_(PetscDeviceContext *, cupmBlasHandle_t * = nullptr, cupmStream_t * = nullptr) noexcept;
232: static PetscErrorCode GetHandles_(PetscDeviceContext *, cupmStream_t *) noexcept;
233: static PetscErrorCode GetHandles_(cupmStream_t *) noexcept;
234: static PetscErrorCode GetHandles_(cupmBlasHandle_t *) noexcept;
236: static PetscErrorCode GetHandlesFrom_(PetscDeviceContext, cupmBlasHandle_t *, cupmStream_t * = nullptr) noexcept;
237: static PetscErrorCode GetHandlesFrom_(PetscDeviceContext, cupmStream_t *) noexcept;
239: // Delete the allocated device array if required and replace it with the given array
240: static PetscErrorCode ResetAllocatedDevicePtr_(PetscDeviceContext, Vec, PetscScalar * = nullptr) noexcept;
// Check either the host or device impl pointer is allocated and allocate it if it
// isn't. CastFunctionType casts the Vec to the required type and returns the pointer
243: template <typename CastFunctionType>
244: static PetscErrorCode VecAllocateCheck_(Vec, void *&, CastFunctionType &&) noexcept;
245: // Check the CUPM part (v->spptr) is allocated, otherwise allocate it
246: static PetscErrorCode VecCUPMAllocateCheck_(Vec) noexcept;
247: // Check the Host part (v->data) is allocated, otherwise allocate it
248: static PetscErrorCode VecIMPLAllocateCheck_(Vec) noexcept;
249: // Check the Host array is allocated, otherwise allocate it
250: static PetscErrorCode HostAllocateCheck_(PetscDeviceContext, Vec) noexcept;
251: // Check the CUPM array is allocated, otherwise allocate it
252: static PetscErrorCode DeviceAllocateCheck_(PetscDeviceContext, Vec) noexcept;
253: // Copy HTOD, allocating device if necessary
254: static PetscErrorCode CopyToDevice_(PetscDeviceContext, Vec, bool = false) noexcept;
255: // Copy DTOH, allocating host if necessary
256: static PetscErrorCode CopyToHost_(PetscDeviceContext, Vec, bool = false) noexcept;
258: public:
// The device-side companion data of a Vec. It is stored in v->spptr and retrieved with
// VecCUPMCast()
struct Vec_CUPM {
  PetscScalar *array_d;           // gpu data, this is the pointer used for the host<->device copies
  PetscScalar *array_allocated_d; // does PETSc own the array ptr? Non-null if so: it is set by
                                  // DeviceAllocateCheck_() and freed by ResetAllocatedDevicePtr_()
  PetscBool    nvshmem;           // is array allocated in nvshmem? It is used to allocate
                                  // Mvctx->lvec in nvshmem. Also selects how
                                  // array_allocated_d is freed

  // COO stuff
  PetscCount *jmap1_d; // [m+1]: i-th entry of the vector has jmap1[i+1]-jmap1[i] repeats
                       // in COO arrays
  PetscCount *perm1_d; // [tot1]: permutation array for local entries
  PetscCount *imap2_d; // [nnz2]: i-th unique entry in recvbuf is imap2[i]-th entry in
                       // the vector
  PetscCount *jmap2_d; // [nnz2+1]
  PetscCount *perm2_d; // [recvlen]
  PetscCount *Cperm_d; // [sendlen]: permutation array to fill sendbuf[]. 'C' for
                       // communication

  // Buffers for remote values in VecSetValuesCOO()
  PetscScalar *sendbuf_d;
  PetscScalar *recvbuf_d;
};
281: // Cast the Vec to its Vec_CUPM struct, i.e. return the result of (Vec_CUPM *)v->spptr
282: PETSC_NODISCARD static Vec_CUPM *VecCUPMCast(Vec) noexcept;
283: // Cast the Vec to its host struct, i.e. return the result of (Vec_Seq *)v->data
284: template <typename U = Derived>
285: PETSC_NODISCARD static constexpr auto VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v));
286: // Get the PetscLogEvents for HTOD and DTOH
287: PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyToGPU() noexcept;
288: PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyFromGPU() noexcept;
289: // Get the VecTypes
290: PETSC_NODISCARD static constexpr VecType VECSEQCUPM() noexcept;
291: PETSC_NODISCARD static constexpr VecType VECMPICUPM() noexcept;
292: // Get the VecType of the calling vector
293: template <typename U = Derived>
294: PETSC_NODISCARD static constexpr VecType VECIMPLCUPM() noexcept;
295: PETSC_NODISCARD static constexpr PetscRandomType PETSCDEVICERAND() noexcept;
297: // Call the host destroy function, i.e. VecDestroy_Seq()
298: static PetscErrorCode VecDestroy_IMPL(Vec) noexcept;
299: // Call the host reset function, i.e. VecResetArray_Seq()
300: static PetscErrorCode VecResetArray_IMPL(Vec) noexcept;
301: // ... you get the idea
302: static PetscErrorCode VecPlaceArray_IMPL(Vec, const PetscScalar *) noexcept;
303: // Call the host creation function, i.e. VecCreate_Seq(), and also initialize the CUPM part
304: // along with it if needed
305: static PetscErrorCode VecCreate_IMPL_Private(Vec, PetscBool *, PetscInt = 0, PetscScalar * = nullptr) noexcept;
307: // Shorthand for creating vector_array's. Need functions to create them, otherwise using them
308: // as an unnamed temporary leads to most vexing parse
309: PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, v});
310: PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
311: PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
312: PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, v});
313: PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
314: PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
316: // disallow implicit conversion
317: template <typename U>
318: PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(U) noexcept = delete;
319: // utility for using cupmHostAlloc()
320: PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(bool) noexcept;
321: PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(PetscBool) noexcept;
323: // ops-table functions
324: static PetscErrorCode create(Vec) noexcept;
325: static PetscErrorCode destroy(Vec) noexcept;
326: template <PetscMemType, PetscMemoryAccessMode, bool = false>
327: static PetscErrorCode getarray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
328: template <PetscMemType, PetscMemoryAccessMode, bool = false>
329: static PetscErrorCode getarray(Vec, PetscScalar **) noexcept;
330: template <PetscMemType, PetscMemoryAccessMode>
331: static PetscErrorCode restorearray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
332: template <PetscMemType, PetscMemoryAccessMode>
333: static PetscErrorCode restorearray(Vec, PetscScalar **) noexcept;
334: template <PetscMemoryAccessMode>
335: static PetscErrorCode getarrayandmemtype(Vec, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
336: template <PetscMemoryAccessMode>
337: static PetscErrorCode getarrayandmemtype(Vec, PetscScalar **, PetscMemType *) noexcept;
338: template <PetscMemoryAccessMode>
339: static PetscErrorCode restorearrayandmemtype(Vec, PetscScalar **, PetscDeviceContext) noexcept;
340: template <PetscMemoryAccessMode>
341: static PetscErrorCode restorearrayandmemtype(Vec, PetscScalar **) noexcept;
342: template <PetscMemType>
343: static PetscErrorCode replacearray(Vec, const PetscScalar *) noexcept;
344: template <PetscMemType>
345: static PetscErrorCode resetarray(Vec) noexcept;
346: template <PetscMemType>
347: static PetscErrorCode placearray(Vec, const PetscScalar *) noexcept;
349: // common ops shared between Seq and MPI
350: static PetscErrorCode Create_CUPM(Vec) noexcept;
351: static PetscErrorCode Create_CUPMBase(MPI_Comm, PetscInt, PetscInt, PetscInt, Vec *, PetscBool, PetscLayout /*reference*/ = nullptr) noexcept;
352: static PetscErrorCode Initialize_CUPMBase(Vec, PetscBool, PetscScalar *, PetscScalar *, PetscDeviceContext) noexcept;
353: template <typename SetupFunctionT = no_op>
354: static PetscErrorCode Duplicate_CUPMBase(Vec, Vec *, PetscDeviceContext, SetupFunctionT && = SetupFunctionT{}) noexcept;
355: static PetscErrorCode BindToCPU_CUPMBase(Vec, PetscBool, PetscDeviceContext) noexcept;
356: static PetscErrorCode GetArrays_CUPMBase(Vec, const PetscScalar **, const PetscScalar **, PetscOffloadMask *, PetscDeviceContext) noexcept;
357: static PetscErrorCode ResetPreallocationCOO_CUPMBase(Vec, PetscDeviceContext) noexcept;
358: template <std::size_t NCount = 0, std::size_t NScal = 0>
359: static PetscErrorCode SetPreallocationCOO_CUPMBase(Vec, PetscCount, const PetscInt[], PetscDeviceContext, const std::array<CooPair<PetscCount>, NCount> & = {}, const std::array<CooPair<PetscScalar>, NScal> & = {}) noexcept;
360: };
362: // ==========================================================================================
363: // Vec_CUPMBase::vector_array
364: //
365: // RAII versions of the get/restore array routines. Determines constness of the pointer type,
366: // holds the pointer itself and provides the implicit conversion operator.
367: //
368: // On construction this calls the moral equivalent of Vec[CUPM]GetArray[Read|Write]()
369: // (depending on PetscMemoryAccessMode) and on destruction automatically restores the array
370: // for you
371: // ==========================================================================================
372: template <device::cupm::DeviceType T, typename D>
373: template <PetscMemType MT, PetscMemoryAccessMode MA>
374: class Vec_CUPMBase<T, D>::vector_array {
375: public:
376: static const auto memory_type = MT;
377: static const auto access_type = MA;
379: using value_type = PetscScalar;
380: using pointer_type = value_type *;
381: using cupm_pointer_type = cupmScalar_t *;
383: vector_array(PetscDeviceContext, Vec) noexcept;
384: ~vector_array() noexcept;
386: constexpr vector_array(vector_array &&) noexcept = default;
387: constexpr vector_array &operator=(vector_array &&) noexcept = default;
389: pointer_type data() const noexcept;
390: cupm_pointer_type cupmdata() const noexcept;
392: operator pointer_type() const noexcept;
393: // in case pointer_type == cupmscalar_pointer_type we don't want this overload to exist, so
394: // we make a dummy template parameter to allow SFINAE to nix it for us
395: template <typename U = pointer_type, typename = util::enable_if_t<!std::is_same<U, cupm_pointer_type>::value>>
396: operator cupm_pointer_type() const noexcept;
398: private:
399: pointer_type ptr_ = nullptr;
400: PetscDeviceContext dctx_ = nullptr;
401: Vec v_ = nullptr;
402: };
404: // ==========================================================================================
405: // Vec_CUPMBase::vector_array - Static Variables
406: // ==========================================================================================
// Namespace-scope definitions of the static data members declared and initialized inside
// vector_array. No initializer is repeated here as the in-class declarations already carry
// one. (Presumably kept so the members have a definition when odr-used with pre-C++17
// compilers -- confirm before removing.)
template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
const PetscMemType Vec_CUPMBase<T, D>::vector_array<MT, MA>::memory_type;

template <device::cupm::DeviceType T, typename D>
template <PetscMemType MT, PetscMemoryAccessMode MA>
const PetscMemoryAccessMode Vec_CUPMBase<T, D>::vector_array<MT, MA>::access_type;
416: // ==========================================================================================
417: // Vec_CUPMBase::vector_array - Public API
418: // ==========================================================================================
420: template <device::cupm::DeviceType T, typename D>
421: template <PetscMemType MT, PetscMemoryAccessMode MA>
422: inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::vector_array(PetscDeviceContext dctx, Vec v) noexcept : dctx_(dctx), v_(v)
423: {
424: PetscFunctionBegin;
425: PetscCallAbort(PETSC_COMM_SELF, getarray<MT, MA, true>(v, &ptr_, dctx));
426: PetscFunctionReturnVoid();
427: }
429: template <device::cupm::DeviceType T, typename D>
430: template <PetscMemType MT, PetscMemoryAccessMode MA>
431: inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::~vector_array() noexcept
432: {
433: PetscFunctionBegin;
434: PetscCallAbort(PETSC_COMM_SELF, restorearray<MT, MA>(v_, &ptr_, dctx_));
435: PetscFunctionReturnVoid();
436: }
438: template <device::cupm::DeviceType T, typename D>
439: template <PetscMemType MT, PetscMemoryAccessMode MA>
440: inline typename Vec_CUPMBase<T, D>::template vector_array<MT, MA>::pointer_type Vec_CUPMBase<T, D>::vector_array<MT, MA>::data() const noexcept
441: {
442: return ptr_;
443: }
445: template <device::cupm::DeviceType T, typename D>
446: template <PetscMemType MT, PetscMemoryAccessMode MA>
447: inline typename Vec_CUPMBase<T, D>::template vector_array<MT, MA>::cupm_pointer_type Vec_CUPMBase<T, D>::vector_array<MT, MA>::cupmdata() const noexcept
448: {
449: return cupmScalarPtrCast(data());
450: }
452: template <device::cupm::DeviceType T, typename D>
453: template <PetscMemType MT, PetscMemoryAccessMode MA>
454: inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::operator pointer_type() const noexcept
455: {
456: return data();
457: }
459: // in case pointer_type == cupmscalar_pointer_type we don't want this overload to exist, so
460: // we make a dummy template parameter to allow SFINAE to nix it for us
461: template <device::cupm::DeviceType T, typename D>
462: template <PetscMemType MT, PetscMemoryAccessMode MA>
463: template <typename U, typename>
464: inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::operator cupm_pointer_type() const noexcept
465: {
466: return cupmdata();
467: }
469: // ==========================================================================================
470: // Vec_CUPMBase - Private API
471: // ==========================================================================================
473: template <device::cupm::DeviceType T, typename D>
474: inline PetscErrorCode Vec_CUPMBase<T, D>::GetFromHandleDispatch_(PetscDeviceContext dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
475: {
476: PetscFunctionBegin;
480: if (PetscDefined(USE_DEBUG)) {
481: PetscDeviceType dtype;
483: PetscCall(PetscDeviceContextGetDeviceType(dctx, &dtype));
484: PetscCheckCompatibleDeviceTypes(PETSC_DEVICE_CUPM(), -1, dtype, 1);
485: }
486: if (handle) PetscCall(PetscDeviceContextGetBLASHandle_Internal(dctx, handle));
487: if (stream) PetscCall(PetscDeviceContextGetStreamHandle_Internal(dctx, stream));
488: PetscFunctionReturn(PETSC_SUCCESS);
489: }
491: template <device::cupm::DeviceType T, typename D>
492: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandleDispatch_(PetscDeviceContext *dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
493: {
494: PetscDeviceContext dctx_loc = nullptr;
496: PetscFunctionBegin;
497: // silence uninitialized variable warnings
498: if (dctx) *dctx = nullptr;
499: PetscCall(PetscDeviceContextGetCurrentContext(&dctx_loc));
500: PetscCall(GetFromHandleDispatch_(dctx_loc, handle, stream));
501: if (dctx) *dctx = dctx_loc;
502: PetscFunctionReturn(PETSC_SUCCESS);
503: }
505: // ==========================================================================================
506: // Vec_CUPMBase - Protected API
507: // ==========================================================================================
509: template <device::cupm::DeviceType T, typename D>
510: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(PetscDeviceContext *dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
511: {
512: return GetHandleDispatch_(dctx, handle, stream);
513: }
515: template <device::cupm::DeviceType T, typename D>
516: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(PetscDeviceContext *dctx, cupmStream_t *stream) noexcept
517: {
518: return GetHandles_(dctx, nullptr, stream);
519: }
521: template <device::cupm::DeviceType T, typename D>
522: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(cupmStream_t *stream) noexcept
523: {
524: return GetHandles_(nullptr, stream);
525: }
527: template <device::cupm::DeviceType T, typename D>
528: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(cupmBlasHandle_t *handle) noexcept
529: {
530: return GetHandles_(nullptr, handle);
531: }
533: template <device::cupm::DeviceType T, typename D>
534: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandlesFrom_(PetscDeviceContext dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
535: {
536: return GetFromHandleDispatch_(dctx, handle, stream);
537: }
539: template <device::cupm::DeviceType T, typename D>
540: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandlesFrom_(PetscDeviceContext dctx, cupmStream_t *stream) noexcept
541: {
542: return GetHandlesFrom_(dctx, nullptr, stream);
543: }
545: template <device::cupm::DeviceType T, typename D>
546: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetAllocatedDevicePtr_(PetscDeviceContext dctx, Vec v, PetscScalar *new_value) noexcept
547: {
548: auto &device_array = VecCUPMCast(v)->array_allocated_d;
550: PetscFunctionBegin;
551: if (device_array) {
552: if (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) {
553: PetscCall(PetscNvshmemFree(device_array));
554: } else {
555: cupmStream_t stream;
557: PetscCall(GetHandlesFrom_(dctx, &stream));
558: PetscCallCUPM(cupmFreeAsync(device_array, stream));
559: }
560: }
561: device_array = new_value;
562: PetscFunctionReturn(PETSC_SUCCESS);
563: }
565: namespace
566: {
568: inline PetscErrorCode VecCUPMCheckMinimumPinnedMemory_Internal(Vec v) noexcept
569: {
570: auto mem = static_cast<PetscInt>(v->minimum_bytes_pinned_memory);
571: PetscBool flg;
573: PetscFunctionBegin;
574: PetscObjectOptionsBegin(PetscObjectCast(v));
575: PetscCall(PetscOptionsRangeInt("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", mem, &mem, &flg, 0, std::numeric_limits<decltype(mem)>::max()));
576: if (flg) v->minimum_bytes_pinned_memory = mem;
577: PetscOptionsEnd();
578: PetscFunctionReturn(PETSC_SUCCESS);
579: }
581: } // anonymous namespace
583: template <device::cupm::DeviceType T, typename D>
584: template <typename CastFunctionType>
585: inline PetscErrorCode Vec_CUPMBase<T, D>::VecAllocateCheck_(Vec v, void *&dest, CastFunctionType &&cast) noexcept
586: {
587: PetscFunctionBegin;
588: if (PetscLikely(dest)) PetscFunctionReturn(PETSC_SUCCESS);
589: // do the check here so we don't have to do it in every function
590: PetscCall(checkCupmBlasIntCast(v->map->n));
591: {
592: auto impl = cast(v);
594: PetscCall(PetscNew(&impl));
595: dest = impl;
596: }
597: PetscFunctionReturn(PETSC_SUCCESS);
598: }
600: template <device::cupm::DeviceType T, typename D>
601: inline PetscErrorCode Vec_CUPMBase<T, D>::VecIMPLAllocateCheck_(Vec v) noexcept
602: {
603: PetscFunctionBegin;
604: PetscCall(VecAllocateCheck_(v, v->data, VecIMPLCast<D>));
605: PetscFunctionReturn(PETSC_SUCCESS);
606: }
608: // allocate the Vec_CUPM struct. this is normally done through DeviceAllocateCheck_(), but in
609: // certain circumstances (such as when the user places the device array) we do not want to do
610: // the full DeviceAllocateCheck_() as it also allocates the array
611: template <device::cupm::DeviceType T, typename D>
612: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCUPMAllocateCheck_(Vec v) noexcept
613: {
614: PetscFunctionBegin;
615: PetscCall(VecAllocateCheck_(v, v->spptr, VecCUPMCast));
616: PetscFunctionReturn(PETSC_SUCCESS);
617: }
619: template <device::cupm::DeviceType T, typename D>
620: inline PetscErrorCode Vec_CUPMBase<T, D>::HostAllocateCheck_(PetscDeviceContext, Vec v) noexcept
621: {
622: PetscFunctionBegin;
623: PetscCall(VecIMPLAllocateCheck_(v));
624: if (auto &alloc = VecIMPLCast(v)->array_allocated) PetscFunctionReturn(PETSC_SUCCESS);
625: else {
626: PetscCall(VecCUPMCheckMinimumPinnedMemory_Internal(v));
627: {
628: const auto n = v->map->n;
629: const auto useit = UseCUPMHostAlloc((n * sizeof(*alloc)) > v->minimum_bytes_pinned_memory);
631: v->pinned_memory = static_cast<decltype(v->pinned_memory)>(useit.value());
632: PetscCall(PetscMalloc1(n, &alloc));
633: }
634: if (!VecIMPLCast(v)->array) VecIMPLCast(v)->array = alloc;
635: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
636: }
637: PetscFunctionReturn(PETSC_SUCCESS);
638: }
640: template <device::cupm::DeviceType T, typename D>
641: inline PetscErrorCode Vec_CUPMBase<T, D>::DeviceAllocateCheck_(PetscDeviceContext dctx, Vec v) noexcept
642: {
643: PetscFunctionBegin;
644: PetscCall(VecCUPMAllocateCheck_(v));
645: if (auto &alloc = VecCUPMCast(v)->array_d) PetscFunctionReturn(PETSC_SUCCESS);
646: else {
647: const auto n = v->map->n;
648: auto &array_allocated_d = VecCUPMCast(v)->array_allocated_d;
649: cupmStream_t stream;
651: PetscCall(GetHandlesFrom_(dctx, &stream));
652: PetscCall(PetscCUPMMallocAsync(&array_allocated_d, n, stream));
653: alloc = array_allocated_d;
654: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
655: const auto vimp = VecIMPLCast(v);
656: v->offloadmask = (vimp && vimp->array) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
657: }
658: }
659: PetscFunctionReturn(PETSC_SUCCESS);
660: }
662: template <device::cupm::DeviceType T, typename D>
663: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToDevice_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
664: {
665: PetscFunctionBegin;
666: PetscCall(DeviceAllocateCheck_(dctx, v));
667: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
668: cupmStream_t stream;
670: v->offloadmask = PETSC_OFFLOAD_BOTH;
671: PetscCall(GetHandlesFrom_(dctx, &stream));
672: PetscCall(PetscLogEventBegin(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
673: PetscCall(PetscCUPMMemcpyAsync(VecCUPMCast(v)->array_d, VecIMPLCast(v)->array, v->map->n, cupmMemcpyHostToDevice, stream, forceasync));
674: PetscCall(PetscLogEventEnd(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
675: }
676: PetscFunctionReturn(PETSC_SUCCESS);
677: }
679: template <device::cupm::DeviceType T, typename D>
680: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToHost_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
681: {
682: PetscFunctionBegin;
683: PetscCall(HostAllocateCheck_(dctx, v));
684: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
685: cupmStream_t stream;
687: v->offloadmask = PETSC_OFFLOAD_BOTH;
688: PetscCall(GetHandlesFrom_(dctx, &stream));
689: PetscCall(PetscLogEventBegin(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
690: PetscCall(PetscCUPMMemcpyAsync(VecIMPLCast(v)->array, VecCUPMCast(v)->array_d, v->map->n, cupmMemcpyDeviceToHost, stream, forceasync));
691: PetscCall(PetscLogEventEnd(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
692: }
693: PetscFunctionReturn(PETSC_SUCCESS);
694: }
696: // ==========================================================================================
697: // Vec_CUPMBase - Public API
698: // ==========================================================================================
700: template <device::cupm::DeviceType T, typename D>
701: inline typename Vec_CUPMBase<T, D>::Vec_CUPM *Vec_CUPMBase<T, D>::VecCUPMCast(Vec v) noexcept
702: {
703: return static_cast<Vec_CUPM *>(v->spptr);
704: }
706: // This is a trick to get around the fact that in CRTP the derived class is not yet fully
707: // defined because Base<Derived> must necessarily be instantiated before Derived is
708: // complete. By using a dummy template parameter we make the type "dependent" and so will
709: // only be determined when the derived class is instantiated (and therefore fully defined)
710: template <device::cupm::DeviceType T, typename D>
711: template <typename U>
712: inline constexpr auto Vec_CUPMBase<T, D>::VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v))
713: {
714: return U::VecIMPLCast_(v);
715: }
717: template <device::cupm::DeviceType T, typename D>
718: inline PetscErrorCode Vec_CUPMBase<T, D>::VecDestroy_IMPL(Vec v) noexcept
719: {
720: return D::VecDestroy_IMPL_(v);
721: }
723: template <device::cupm::DeviceType T, typename D>
724: inline PetscErrorCode Vec_CUPMBase<T, D>::VecResetArray_IMPL(Vec v) noexcept
725: {
726: return D::VecResetArray_IMPL_(v);
727: }
729: template <device::cupm::DeviceType T, typename D>
730: inline PetscErrorCode Vec_CUPMBase<T, D>::VecPlaceArray_IMPL(Vec v, const PetscScalar *a) noexcept
731: {
732: return D::VecPlaceArray_IMPL_(v, a);
733: }
735: template <device::cupm::DeviceType T, typename D>
736: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCreate_IMPL_Private(Vec v, PetscBool *alloc_missing, PetscInt nghost, PetscScalar *host_array) noexcept
737: {
738: return D::VecCreate_IMPL_Private_(v, alloc_missing, nghost, host_array);
739: }
741: template <device::cupm::DeviceType T, typename D>
742: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyToGPU() noexcept
743: {
744: return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyToGPU : VEC_HIPCopyToGPU;
745: }
747: template <device::cupm::DeviceType T, typename D>
748: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyFromGPU() noexcept
749: {
750: return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyFromGPU : VEC_HIPCopyFromGPU;
751: }
753: template <device::cupm::DeviceType T, typename D>
754: inline constexpr VecType Vec_CUPMBase<T, D>::VECSEQCUPM() noexcept
755: {
756: return T == device::cupm::DeviceType::CUDA ? VECSEQCUDA : VECSEQHIP;
757: }
759: template <device::cupm::DeviceType T, typename D>
760: inline constexpr VecType Vec_CUPMBase<T, D>::VECMPICUPM() noexcept
761: {
762: return T == device::cupm::DeviceType::CUDA ? VECMPICUDA : VECMPIHIP;
763: }
765: template <device::cupm::DeviceType T, typename D>
766: template <typename U>
767: inline constexpr VecType Vec_CUPMBase<T, D>::VECIMPLCUPM() noexcept
768: {
769: return U::VECIMPLCUPM_();
770: }
772: template <device::cupm::DeviceType T, typename D>
773: inline constexpr PetscRandomType Vec_CUPMBase<T, D>::PETSCDEVICERAND() noexcept
774: {
775: // REVIEW ME: HIP default rng?
776: return T == device::cupm::DeviceType::CUDA ? PETSCCURAND : PETSCRANDER48;
777: }
779: // utility for using cupmHostAlloc()
780: template <device::cupm::DeviceType T, typename D>
781: inline UseCUPMHostAlloc_<T> Vec_CUPMBase<T, D>::UseCUPMHostAlloc(bool b) noexcept
782: {
783: return {b};
784: }
786: template <device::cupm::DeviceType T, typename D>
787: inline UseCUPMHostAlloc_<T> Vec_CUPMBase<T, D>::UseCUPMHostAlloc(PetscBool b) noexcept
788: {
789: return UseCUPMHostAlloc(static_cast<bool>(b));
790: }
792: // private version that takes a PetscDeviceContext, called by the public variant
793: template <device::cupm::DeviceType T, typename D>
794: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
795: inline PetscErrorCode Vec_CUPMBase<T, D>::getarray(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
796: {
797: constexpr auto hostmem = PetscMemTypeHost(mtype);
798: const auto oldmask = v->offloadmask;
799: auto &mask = v->offloadmask;
800: auto should_sync = false;
802: PetscFunctionBegin;
803: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
804: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
805: if (PetscMemoryAccessRead(access)) {
806: // READ or READ_WRITE
807: if (((oldmask == PETSC_OFFLOAD_GPU) && hostmem) || ((oldmask == PETSC_OFFLOAD_CPU) && !hostmem)) {
808: // if we move the data we should set the flag to synchronize later on
809: should_sync = true;
810: }
811: PetscCall((hostmem ? CopyToHost_ : CopyToDevice_)(dctx, v, force));
812: } else {
813: // WRITE only
814: PetscCall((hostmem ? HostAllocateCheck_ : DeviceAllocateCheck_)(dctx, v));
815: }
816: *a = hostmem ? VecIMPLCast(v)->array : VecCUPMCast(v)->array_d;
817: // if unallocated previously we should zero things out if we intend to read
818: if (PetscMemoryAccessRead(access) && (oldmask == PETSC_OFFLOAD_UNALLOCATED)) {
819: const auto n = v->map->n;
821: if (hostmem) {
822: PetscCall(PetscArrayzero(*a, n));
823: } else {
824: cupmStream_t stream;
826: PetscCall(GetHandlesFrom_(dctx, &stream));
827: PetscCall(PetscCUPMMemsetAsync(*a, 0, n, stream, force));
828: should_sync = true;
829: }
830: }
831: // update the offloadmask if we intend to write, since we assume immediately modified
832: if (PetscMemoryAccessWrite(access)) {
833: PetscCall(VecSetErrorIfLocked(v, 1));
834: // REVIEW ME: this should probably also call PetscObjectStateIncrease() since we assume it
835: // is immediately modified
836: mask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
837: }
838: // if we are a globally blocking stream and we have MOVED data then we should synchronize,
839: // since even doing async calls on the NULL stream is not synchronous
840: if (!force && should_sync) PetscCall(PetscDeviceContextSynchronize(dctx));
841: PetscFunctionReturn(PETSC_SUCCESS);
842: }
844: // v->ops->getarray[read|write] or VecCUPMGetArray[Read|Write]()
845: template <device::cupm::DeviceType T, typename D>
846: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
847: inline PetscErrorCode Vec_CUPMBase<T, D>::getarray(Vec v, PetscScalar **a) noexcept
848: {
849: PetscDeviceContext dctx;
851: PetscFunctionBegin;
852: PetscCall(GetHandles_(&dctx));
853: PetscCall(getarray<mtype, access, force>(v, a, dctx));
854: PetscFunctionReturn(PETSC_SUCCESS);
855: }
857: // private version that takes a PetscDeviceContext, called by the public variant
858: template <device::cupm::DeviceType T, typename D>
859: template <PetscMemType mtype, PetscMemoryAccessMode access>
860: inline PetscErrorCode Vec_CUPMBase<T, D>::restorearray(Vec v, PetscScalar **a, PetscDeviceContext) noexcept
861: {
862: PetscFunctionBegin;
863: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
864: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
865: if (PetscMemoryAccessWrite(access)) {
866: // WRITE or READ_WRITE
867: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
868: v->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
869: }
870: if (a) {
871: PetscCall(CheckPointerMatchesMemType_(*a, mtype));
872: *a = nullptr;
873: }
874: PetscFunctionReturn(PETSC_SUCCESS);
875: }
877: // v->ops->restorearray[read|write] or VecCUPMRestoreArray[Read|Write]()
878: template <device::cupm::DeviceType T, typename D>
879: template <PetscMemType mtype, PetscMemoryAccessMode access>
880: inline PetscErrorCode Vec_CUPMBase<T, D>::restorearray(Vec v, PetscScalar **a) noexcept
881: {
882: PetscDeviceContext dctx;
884: PetscFunctionBegin;
885: PetscCall(GetHandles_(&dctx));
886: PetscCall(restorearray<mtype, access>(v, a, dctx));
887: PetscFunctionReturn(PETSC_SUCCESS);
888: }
890: template <device::cupm::DeviceType T, typename D>
891: template <PetscMemoryAccessMode access>
892: inline PetscErrorCode Vec_CUPMBase<T, D>::getarrayandmemtype(Vec v, PetscScalar **a, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
893: {
894: PetscFunctionBegin;
895: PetscCall(getarray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
896: if (mtype) *mtype = (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUPM();
897: PetscFunctionReturn(PETSC_SUCCESS);
898: }
900: // v->ops->getarrayandmemtype
901: template <device::cupm::DeviceType T, typename D>
902: template <PetscMemoryAccessMode access>
903: inline PetscErrorCode Vec_CUPMBase<T, D>::getarrayandmemtype(Vec v, PetscScalar **a, PetscMemType *mtype) noexcept
904: {
905: PetscDeviceContext dctx;
907: PetscFunctionBegin;
908: PetscCall(GetHandles_(&dctx));
909: PetscCall(getarrayandmemtype<access>(v, a, mtype, dctx));
910: PetscFunctionReturn(PETSC_SUCCESS);
911: }
913: template <device::cupm::DeviceType T, typename D>
914: template <PetscMemoryAccessMode access>
915: inline PetscErrorCode Vec_CUPMBase<T, D>::restorearrayandmemtype(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
916: {
917: PetscFunctionBegin;
918: PetscCall(restorearray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
919: PetscFunctionReturn(PETSC_SUCCESS);
920: }
922: // v->ops->restorearrayandmemtype
923: template <device::cupm::DeviceType T, typename D>
924: template <PetscMemoryAccessMode access>
925: inline PetscErrorCode Vec_CUPMBase<T, D>::restorearrayandmemtype(Vec v, PetscScalar **a) noexcept
926: {
927: PetscDeviceContext dctx;
929: PetscFunctionBegin;
930: PetscCall(GetHandles_(&dctx));
931: PetscCall(restorearrayandmemtype<access>(v, a, dctx));
932: PetscFunctionReturn(PETSC_SUCCESS);
933: }
935: // v->ops->placearray or VecCUPMPlaceArray()
936: template <device::cupm::DeviceType T, typename D>
937: template <PetscMemType mtype>
938: inline PetscErrorCode Vec_CUPMBase<T, D>::placearray(Vec v, const PetscScalar *a) noexcept
939: {
940: PetscDeviceContext dctx;
942: PetscFunctionBegin;
943: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
944: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
945: PetscCall(CheckPointerMatchesMemType_(a, mtype));
946: PetscCall(GetHandles_(&dctx));
947: if (PetscMemTypeHost(mtype)) {
948: PetscCall(CopyToHost_(dctx, v));
949: PetscCall(VecPlaceArray_IMPL(v, a));
950: v->offloadmask = PETSC_OFFLOAD_CPU;
951: } else {
952: PetscCall(VecIMPLAllocateCheck_(v));
953: {
954: auto &backup_array = VecIMPLCast(v)->unplacedarray;
956: PetscCheck(!backup_array, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
957: PetscCall(CopyToDevice_(dctx, v));
958: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
959: backup_array = util::exchange(VecCUPMCast(v)->array_d, const_cast<PetscScalar *>(a));
960: // only update the offload mask if we actually assign a pointer
961: if (a) v->offloadmask = PETSC_OFFLOAD_GPU;
962: }
963: }
964: PetscFunctionReturn(PETSC_SUCCESS);
965: }
967: // v->ops->replacearray or VecCUPMReplaceArray()
968: template <device::cupm::DeviceType T, typename D>
969: template <PetscMemType mtype>
970: inline PetscErrorCode Vec_CUPMBase<T, D>::replacearray(Vec v, const PetscScalar *a) noexcept
971: {
972: const auto aptr = const_cast<PetscScalar *>(a);
973: PetscDeviceContext dctx;
975: PetscFunctionBegin;
976: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
977: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
978: PetscCall(CheckPointerMatchesMemType_(a, mtype));
979: PetscCall(GetHandles_(&dctx));
980: if (PetscMemTypeHost(mtype)) {
981: PetscCall(VecIMPLAllocateCheck_(v));
982: {
983: const auto vimpl = VecIMPLCast(v);
984: auto &host_array = vimpl->array_allocated;
986: // make sure the users array has the latest values.
987: // REVIEW ME: why? we're about to free it
988: if (host_array != vimpl->array) PetscCall(CopyToHost_(dctx, v));
989: if (host_array) {
990: const auto useit = UseCUPMHostAlloc(v->pinned_memory);
992: PetscCall(PetscFree(host_array));
993: }
994: host_array = aptr;
995: vimpl->array = host_array;
996: v->pinned_memory = PETSC_FALSE; // REVIEW ME: we can determine this
997: v->offloadmask = PETSC_OFFLOAD_CPU;
998: }
999: } else {
1000: PetscCall(VecCUPMAllocateCheck_(v));
1001: {
1002: const auto vcu = VecCUPMCast(v);
1004: PetscCall(ResetAllocatedDevicePtr_(dctx, v, aptr));
1005: // don't update the offloadmask if placed pointer is NULL
1006: vcu->array_d = vcu->array_allocated_d /* = aptr */;
1007: if (aptr) v->offloadmask = PETSC_OFFLOAD_GPU;
1008: }
1009: }
1010: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
1011: PetscFunctionReturn(PETSC_SUCCESS);
1012: }
1014: // v->ops->resetarray or VecCUPMResetArray()
1015: template <device::cupm::DeviceType T, typename D>
1016: template <PetscMemType mtype>
1017: inline PetscErrorCode Vec_CUPMBase<T, D>::resetarray(Vec v) noexcept
1018: {
1019: PetscDeviceContext dctx;
1021: PetscFunctionBegin;
1022: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1023: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1024: PetscCall(GetHandles_(&dctx));
1025: // REVIEW ME:
1026: // this is wildly inefficient but must be done if we assume that the placed array must have
1027: // correct values
1028: if (PetscMemTypeHost(mtype)) {
1029: PetscCall(CopyToHost_(dctx, v));
1030: PetscCall(VecResetArray_IMPL(v));
1031: v->offloadmask = PETSC_OFFLOAD_CPU;
1032: } else {
1033: PetscCall(VecIMPLAllocateCheck_(v));
1034: PetscCall(VecCUPMAllocateCheck_(v));
1035: {
1036: const auto vcu = VecCUPMCast(v);
1037: const auto vimpl = VecIMPLCast(v);
1038: auto &host_array = vimpl->unplacedarray;
1040: PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_DEVICE));
1041: PetscCall(CopyToDevice_(dctx, v));
1042: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
1043: // Need to reset the offloadmask. If we had a stashed pointer we are on the GPU,
1044: // otherwise check if the host has a valid pointer. If neither, then we are not
1045: // allocated.
1046: vcu->array_d = host_array;
1047: if (host_array) {
1048: host_array = nullptr;
1049: v->offloadmask = PETSC_OFFLOAD_GPU;
1050: } else if (vimpl->array) {
1051: v->offloadmask = PETSC_OFFLOAD_CPU;
1052: } else {
1053: v->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1054: }
1055: }
1056: }
1057: PetscFunctionReturn(PETSC_SUCCESS);
1058: }
1060: // v->ops->create
1061: template <device::cupm::DeviceType T, typename D>
1062: inline PetscErrorCode Vec_CUPMBase<T, D>::create(Vec v) noexcept
1063: {
1064: PetscBool alloc_missing;
1065: PetscDeviceContext dctx;
1067: PetscFunctionBegin;
1068: PetscCall(VecCreate_IMPL_Private(v, &alloc_missing));
1069: PetscCall(GetHandles_(&dctx));
1070: PetscCall(Initialize_CUPMBase(v, alloc_missing, nullptr, nullptr, dctx));
1071: PetscFunctionReturn(PETSC_SUCCESS);
1072: }
1074: // v->ops->destroy
1075: template <device::cupm::DeviceType T, typename D>
1076: inline PetscErrorCode Vec_CUPMBase<T, D>::destroy(Vec v) noexcept
1077: {
1078: PetscFunctionBegin;
1079: if (const auto vcu = VecCUPMCast(v)) {
1080: PetscDeviceContext dctx;
1082: PetscCall(GetHandles_(&dctx));
1083: PetscCall(ResetAllocatedDevicePtr_(dctx, v));
1084: PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1085: PetscCall(PetscFree(v->spptr));
1086: }
1087: PetscCall(PetscObjectSAWsViewOff(PetscObjectCast(v)));
1088: if (const auto vimpl = VecIMPLCast(v)) {
1089: if (auto &array_allocated = vimpl->array_allocated) {
1090: const auto useit = UseCUPMHostAlloc(v->pinned_memory);
1092: // do this ourselves since we may want to use the cupm functions
1093: PetscCall(PetscFree(array_allocated));
1094: }
1095: }
1096: v->pinned_memory = PETSC_FALSE;
1097: PetscCall(VecDestroy_IMPL(v));
1098: PetscFunctionReturn(PETSC_SUCCESS);
1099: }
1101: // ================================================================================== //
1102: // Common core between Seq and MPI //
1104: // VecCreate_CUPM()
1105: template <device::cupm::DeviceType T, typename D>
1106: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPM(Vec v) noexcept
1107: {
1108: PetscMPIInt size;
1110: PetscFunctionBegin;
1111: PetscCallMPI(MPI_Comm_size(PetscObjectComm(PetscObjectCast(v)), &size));
1112: PetscCall(VecSetType(v, size > 1 ? VECMPICUPM() : VECSEQCUPM()));
1113: PetscFunctionReturn(PETSC_SUCCESS);
1114: }
1116: // VecCreateCUPM()
1117: template <device::cupm::DeviceType T, typename D>
1118: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPMBase(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, Vec *v, PetscBool call_set_type, PetscLayout reference) noexcept
1119: {
1120: PetscFunctionBegin;
1121: PetscCall(VecCreate(comm, v));
1122: if (reference) PetscCall(PetscLayoutReference(reference, &(*v)->map));
1123: PetscCall(VecSetSizes(*v, n, N));
1124: if (bs) PetscCall(VecSetBlockSize(*v, bs));
1125: if (call_set_type) PetscCall(VecSetType(*v, VECIMPLCUPM()));
1126: PetscFunctionReturn(PETSC_SUCCESS);
1127: }
1129: // VecCreateIMPL_CUPM(), called through v->ops->create
1130: template <device::cupm::DeviceType T, typename D>
1131: inline PetscErrorCode Vec_CUPMBase<T, D>::Initialize_CUPMBase(Vec v, PetscBool allocate_missing, PetscScalar *host_array, PetscScalar *device_array, PetscDeviceContext dctx) noexcept
1132: {
1133: PetscFunctionBegin;
1134: // REVIEW ME: perhaps not needed
1135: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
1136: PetscCall(PetscObjectChangeTypeName(PetscObjectCast(v), VECIMPLCUPM()));
1137: PetscCall(D::bindtocpu(v, PETSC_FALSE));
1138: if (device_array) {
1139: PetscCall(CheckPointerMatchesMemType_(device_array, PETSC_MEMTYPE_CUPM()));
1140: PetscCall(VecCUPMAllocateCheck_(v));
1141: VecCUPMCast(v)->array_d = device_array;
1142: }
1143: if (host_array) {
1144: PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_HOST));
1145: VecIMPLCast(v)->array = host_array;
1146: }
1147: if (allocate_missing) {
1148: PetscCall(DeviceAllocateCheck_(dctx, v));
1149: PetscCall(HostAllocateCheck_(dctx, v));
1150: // REVIEW ME: junchao, is this needed with new calloc() branch? VecSet() will call
1151: // set() for reference
1152: // calls device-version
1153: PetscCall(VecSet(v, 0));
1154: // zero the host while device is underway
1155: PetscCall(PetscArrayzero(VecIMPLCast(v)->array, v->map->n));
1156: v->offloadmask = PETSC_OFFLOAD_BOTH;
1157: } else {
1158: if (host_array) {
1159: v->offloadmask = device_array ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
1160: } else {
1161: v->offloadmask = device_array ? PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_UNALLOCATED;
1162: }
1163: }
1164: PetscFunctionReturn(PETSC_SUCCESS);
1165: }
1167: // v->ops->duplicate
1168: template <device::cupm::DeviceType T, typename D>
1169: template <typename SetupFunctionT>
1170: inline PetscErrorCode Vec_CUPMBase<T, D>::Duplicate_CUPMBase(Vec v, Vec *y, PetscDeviceContext dctx, SetupFunctionT &&DerivedCreateIMPLCUPM_Async) noexcept
1171: {
1172: // if the derived setup is the default no_op then we should call VecSetType()
1173: constexpr auto call_set_type = static_cast<PetscBool>(std::is_same<SetupFunctionT, no_op>::value);
1174: const auto vobj = PetscObjectCast(v);
1175: const auto map = v->map;
1176: PetscInt bs;
1178: PetscFunctionBegin;
1179: PetscCall(VecGetBlockSize(v, &bs));
1180: PetscCall(Create_CUPMBase(PetscObjectComm(vobj), bs, map->n, map->N, y, call_set_type, map));
1181: // Derived class can set up the remainder of the data structures here
1182: PetscCall(DerivedCreateIMPLCUPM_Async(*y));
1183: // If the other vector is bound to CPU then the memcpy of the ops struct will give the
1184: // duplicated vector the host "getarray" function which does not lazily allocate the array
1185: // (as it is assumed to always exist). So we force allocation here, before we overwrite the
1186: // ops
1187: if (v->boundtocpu) PetscCall(HostAllocateCheck_(dctx, *y));
1188: // in case the user has done some VecSetOps() tomfoolery
1189: PetscCall(PetscArraycpy((*y)->ops, v->ops, 1));
1190: {
1191: const auto yobj = PetscObjectCast(*y);
1193: PetscCall(PetscObjectListDuplicate(vobj->olist, &yobj->olist));
1194: PetscCall(PetscFunctionListDuplicate(vobj->qlist, &yobj->qlist));
1195: }
1196: (*y)->stash.donotstash = v->stash.donotstash;
1197: (*y)->stash.ignorenegidx = v->stash.ignorenegidx;
1198: (*y)->map->bs = std::abs(v->map->bs);
1199: (*y)->bstash.bs = v->bstash.bs;
1200: PetscFunctionReturn(PETSC_SUCCESS);
1201: }
// Installs one entry of v->ops: op_host when usehost is set, otherwise the device
// implementation given by everything after the second argument. Both `v` and `usehost` must
// be visible at the expansion site. The device implementation is collected through
// __VA_ARGS__, so it may contain commas that are not protected by parentheses (for example a
// template argument list).
#define VecSetOp_CUPM(op_name, op_host, ...) \
  do { \
    if (!usehost) { \
      v->ops->op_name = __VA_ARGS__; \
    } else { \
      v->ops->op_name = op_host; \
    } \
  } while (0)
1212: // v->ops->bindtocpu
1213: template <device::cupm::DeviceType T, typename D>
1214: inline PetscErrorCode Vec_CUPMBase<T, D>::BindToCPU_CUPMBase(Vec v, PetscBool usehost, PetscDeviceContext dctx) noexcept
1215: {
1216: const auto change_default_rand_type = [](PetscRandomType target, char **ptr) {
1217: PetscFunctionBegin;
1220: if (std::strcmp(target, *ptr)) {
1221: PetscCall(PetscFree(*ptr));
1222: PetscCall(PetscStrallocpy(target, ptr));
1223: }
1224: PetscFunctionReturn(PETSC_SUCCESS);
1225: };
1227: PetscFunctionBegin;
1228: v->boundtocpu = usehost;
1229: if (usehost) PetscCall(CopyToHost_(dctx, v));
1230: PetscCall(change_default_rand_type(usehost ? PETSCRANDER48 : PETSCDEVICERAND(), &v->defaultrandtype));
1232: // set the base functions that are guaranteed to be the same for both
1233: v->ops->duplicate = D::duplicate;
1234: v->ops->create = create;
1235: v->ops->destroy = destroy;
1236: v->ops->bindtocpu = D::bindtocpu;
1237: // Note that setting these to NULL on host breaks convergence in certain areas. I don't know
1238: // why, and I don't know how, but it is IMPERATIVE these are set as such!
1239: v->ops->replacearray = replacearray<PETSC_MEMTYPE_HOST>;
1240: v->ops->restorearray = restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>;
1242: // set device-only common functions
1243: VecSetOp_CUPM(dotnorm2, nullptr, D::dotnorm2);
1244: VecSetOp_CUPM(getarray, nullptr, getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1245: VecSetOp_CUPM(getarraywrite, nullptr, getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1246: VecSetOp_CUPM(restorearraywrite, nullptr, restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1248: VecSetOp_CUPM(getarrayread, nullptr, [](Vec v, const PetscScalar **a) { return getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1249: VecSetOp_CUPM(restorearrayread, nullptr, [](Vec v, const PetscScalar **a) { return restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1251: VecSetOp_CUPM(getarrayandmemtype, nullptr, getarrayandmemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
1252: VecSetOp_CUPM(restorearrayandmemtype, nullptr, restorearrayandmemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
1254: VecSetOp_CUPM(getarraywriteandmemtype, nullptr, getarrayandmemtype<PETSC_MEMORY_ACCESS_WRITE>);
1255: VecSetOp_CUPM(restorearraywriteandmemtype, nullptr, [](Vec v, PetscScalar **a, PetscMemType *) { return restorearrayandmemtype<PETSC_MEMORY_ACCESS_WRITE>(v, a); });
1257: VecSetOp_CUPM(getarrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a, PetscMemType *m) { return getarrayandmemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a), m); });
1258: VecSetOp_CUPM(restorearrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a) { return restorearrayandmemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1260: // set the functions that are always sequential
1261: using VecSeq_T = VecSeq_CUPM<T>;
1262: VecSetOp_CUPM(scale, VecScale_Seq, VecSeq_T::scale);
1263: VecSetOp_CUPM(copy, VecCopy_Seq, VecSeq_T::copy);
1264: VecSetOp_CUPM(set, VecSet_Seq, VecSeq_T::set);
1265: VecSetOp_CUPM(swap, VecSwap_Seq, VecSeq_T::swap);
1266: VecSetOp_CUPM(axpy, VecAXPY_Seq, VecSeq_T::axpy);
1267: VecSetOp_CUPM(axpby, VecAXPBY_Seq, VecSeq_T::axpby);
1268: VecSetOp_CUPM(maxpy, VecMAXPY_Seq, VecSeq_T::maxpy);
1269: VecSetOp_CUPM(aypx, VecAYPX_Seq, VecSeq_T::aypx);
1270: VecSetOp_CUPM(waxpy, VecWAXPY_Seq, VecSeq_T::waxpy);
1271: VecSetOp_CUPM(axpbypcz, VecAXPBYPCZ_Seq, VecSeq_T::axpbypcz);
1272: VecSetOp_CUPM(pointwisemult, VecPointwiseMult_Seq, VecSeq_T::pointwisemult);
1273: VecSetOp_CUPM(pointwisedivide, VecPointwiseDivide_Seq, VecSeq_T::pointwisedivide);
1274: VecSetOp_CUPM(setrandom, VecSetRandom_Seq, VecSeq_T::setrandom);
1275: VecSetOp_CUPM(dot_local, VecDot_Seq, VecSeq_T::dot);
1276: VecSetOp_CUPM(tdot_local, VecTDot_Seq, VecSeq_T::tdot);
1277: VecSetOp_CUPM(norm_local, VecNorm_Seq, VecSeq_T::norm);
1278: VecSetOp_CUPM(mdot_local, VecMDot_Seq, VecSeq_T::mdot);
1279: VecSetOp_CUPM(reciprocal, VecReciprocal_Default, VecSeq_T::reciprocal);
1280: VecSetOp_CUPM(shift, nullptr, VecSeq_T::shift);
1281: VecSetOp_CUPM(getlocalvector, nullptr, VecSeq_T::template getlocalvector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1282: VecSetOp_CUPM(restorelocalvector, nullptr, VecSeq_T::template restorelocalvector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1283: VecSetOp_CUPM(getlocalvectorread, nullptr, VecSeq_T::template getlocalvector<PETSC_MEMORY_ACCESS_READ>);
1284: VecSetOp_CUPM(restorelocalvectorread, nullptr, VecSeq_T::template restorelocalvector<PETSC_MEMORY_ACCESS_READ>);
1285: VecSetOp_CUPM(sum, nullptr, VecSeq_T::sum);
1286: PetscFunctionReturn(PETSC_SUCCESS);
1287: }
1289: // Called from VecGetSubVector()
1290: template <device::cupm::DeviceType T, typename D>
1291: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrays_CUPMBase(Vec v, const PetscScalar **host_array, const PetscScalar **device_array, PetscOffloadMask *mask, PetscDeviceContext dctx) noexcept
1292: {
1293: PetscFunctionBegin;
1294: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1295: if (host_array) {
1296: PetscCall(HostAllocateCheck_(dctx, v));
1297: *host_array = VecIMPLCast(v)->array;
1298: }
1299: if (device_array) {
1300: PetscCall(DeviceAllocateCheck_(dctx, v));
1301: *device_array = VecCUPMCast(v)->array_d;
1302: }
1303: if (mask) *mask = v->offloadmask;
1304: PetscFunctionReturn(PETSC_SUCCESS);
1305: }
1307: template <device::cupm::DeviceType T, typename D>
1308: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetPreallocationCOO_CUPMBase(Vec v, PetscDeviceContext dctx) noexcept
1309: {
1310: PetscFunctionBegin;
1311: if (const auto vcu = VecCUPMCast(v)) {
1312: cupmStream_t stream;
1313: // clang-format off
1314: const auto cntptrs = util::make_array(
1315: std::ref(vcu->jmap1_d),
1316: std::ref(vcu->perm1_d),
1317: std::ref(vcu->imap2_d),
1318: std::ref(vcu->jmap2_d),
1319: std::ref(vcu->perm2_d),
1320: std::ref(vcu->Cperm_d)
1321: );
1322: // clang-format on
1324: PetscCall(GetHandlesFrom_(dctx, &stream));
1325: for (auto &&ptr : cntptrs) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1326: for (auto &&ptr : util::make_array(std::ref(vcu->sendbuf_d), std::ref(vcu->recvbuf_d))) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1327: }
1328: PetscFunctionReturn(PETSC_SUCCESS);
1329: }
1331: template <device::cupm::DeviceType T, typename D>
1332: template <std::size_t NCount, std::size_t NScal>
1333: inline PetscErrorCode Vec_CUPMBase<T, D>::SetPreallocationCOO_CUPMBase(Vec v, PetscCount, const PetscInt[], PetscDeviceContext dctx, const std::array<CooPair<PetscCount>, NCount> &extra_cntptrs, const std::array<CooPair<PetscScalar>, NScal> &bufptrs) noexcept
1334: {
1335: const auto vimpl = VecIMPLCast(v);
1337: PetscFunctionBegin;
1338: PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1339: // need to instantiate the private pointer if not already
1340: PetscCall(VecCUPMAllocateCheck_(v));
1341: {
1342: const auto vcu = VecCUPMCast(v);
1343: // clang-fomat off
1344: const auto cntptrs = util::concat_array(util::make_array(make_coo_pair(vcu->jmap1_d, vimpl->jmap1, v->map->n + 1), make_coo_pair(vcu->perm1_d, vimpl->perm1, vimpl->tot1)), extra_cntptrs);
1345: // clang-format on
1346: cupmStream_t stream;
1348: PetscCall(GetHandlesFrom_(dctx, &stream));
1349: // allocate
1350: for (auto &elem : cntptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1351: for (auto &elem : bufptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1352: // copy
1353: for (const auto &elem : cntptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1354: for (const auto &elem : bufptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1355: }
1356: PetscFunctionReturn(PETSC_SUCCESS);
1357: }
// Boilerplate for classes deriving from Vec_CUPMBase. Expanding it inside the class body
//   - declares `name` as an alias for Vec_CUPMBase<Tp, ...> and befriends it
//   - re-exports the base class members listed below with using-declarations
//   - pulls in the cupm BLAS interface typedefs for Tp
// The trailing arguments are forwarded as the remaining template arguments of Vec_CUPMBase.
#define PETSC_VEC_CUPM_BASE_CLASS_HEADER(name, Tp, ...) \
  using name = ::Petsc::vec::cupm::impl::Vec_CUPMBase<Tp, __VA_ARGS__>; \
  friend name; \
  /* casts, type names and debug viewing */ \
  using name::VecCUPMCast; \
  using name::VecIMPLCast; \
  using name::VECIMPLCUPM; \
  using name::VECSEQCUPM; \
  using name::VECMPICUPM; \
  using name::VecView_Debug; \
  /* helpers: handles, allocation checks, host/device copies, vector ops */ \
  using typename name::Vec_CUPM; \
  using name::UseCUPMHostAlloc; \
  using name::GetHandles_; \
  using name::GetHandlesFrom_; \
  using name::VecCUPMAllocateCheck_; \
  using name::VecIMPLAllocateCheck_; \
  using name::HostAllocateCheck_; \
  using name::DeviceAllocateCheck_; \
  using name::CopyToDevice_; \
  using name::CopyToHost_; \
  using name::create; \
  using name::destroy; \
  using name::getarray; \
  using name::restorearray; \
  using name::getarrayandmemtype; \
  using name::restorearrayandmemtype; \
  using name::placearray; \
  using name::replacearray; \
  using name::resetarray; \
  /* shared base-class routines and array accessors */ \
  using name::Create_CUPMBase; \
  using name::Initialize_CUPMBase; \
  using name::Duplicate_CUPMBase; \
  using name::BindToCPU_CUPMBase; \
  using name::Create_CUPM; \
  using name::DeviceArrayRead; \
  using name::DeviceArrayWrite; \
  using name::DeviceArrayReadWrite; \
  using name::HostArrayRead; \
  using name::HostArrayWrite; \
  using name::HostArrayReadWrite; \
  using name::ResetPreallocationCOO_CUPMBase; \
  using name::SetPreallocationCOO_CUPMBase; \
  /* cupm BLAS interface typedefs */ \
  PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, Tp)
1406: } // namespace impl
1408: } // namespace cupm
1410: } // namespace vec
1412: } // namespace Petsc
1414: #endif // __cplusplus && PetscDefined(HAVE_DEVICE)
1416: #endif // PETSCVECCUPMIMPL_H