Actual source code: veccupmimpl.h

  1: #ifndef PETSCVECCUPMIMPL_H
  2: #define PETSCVECCUPMIMPL_H

  4: #include <petsc/private/vecimpl.h>
  5: #include <../src/vec/vec/impls/dvecimpl.h>

  7: #if PetscDefined(HAVE_NVSHMEM)
  8: PETSC_INTERN PetscErrorCode PetscNvshmemInitializeCheck(void);
  9: PETSC_INTERN PetscErrorCode PetscNvshmemMalloc(size_t, void **);
 10: PETSC_INTERN PetscErrorCode PetscNvshmemCalloc(size_t, void **);
 11: PETSC_INTERN PetscErrorCode PetscNvshmemFree_Private(void *);
 12:   #define PetscNvshmemFree(ptr) ((PetscErrorCode)((ptr) && (PetscNvshmemFree_Private(ptr) || ((ptr) = PETSC_NULLPTR, PETSC_SUCCESS))))
 13: PETSC_INTERN PetscErrorCode PetscNvshmemSum(PetscInt, PetscScalar *, const PetscScalar *);
 14: PETSC_INTERN PetscErrorCode PetscNvshmemMax(PetscInt, PetscReal *, const PetscReal *);
 15: PETSC_INTERN PetscErrorCode VecNormAsync_NVSHMEM(Vec, NormType, PetscReal *);
 16: PETSC_INTERN PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec);
 17: #else
 18:   #define PetscNvshmemFree(ptr) PETSC_SUCCESS
 19: #endif

 21: #if defined(__cplusplus) && PetscDefined(HAVE_DEVICE)
 22:   #include <petsc/private/deviceimpl.h>
 23:   #include <petsc/private/cupmblasinterface.hpp>

 25:   #include <petsc/private/cpp/functional.hpp>

 27:   #include <limits>  // std::numeric_limits
 28:   #include <cstring> // std::memset

 30: namespace Petsc
 31: {

 33: namespace vec
 34: {

 36: namespace cupm
 37: {

 39: namespace impl
 40: {

 42: namespace
 43: {

 45: // ==========================================================================================
 46: // UseCUPMHostAlloc_
 47: //
 48: // A simple RAII helper for PetscMallocSet[CUDA|HIP]Host(). It exists because integrating the
 49: // regular versions would be an enormous pain to square with the templated types...
 50: // ==========================================================================================
 51: template <device::cupm::DeviceType T>
 52: class UseCUPMHostAlloc_ : device::cupm::impl::Interface<T> {
 53: public:
 54:   PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(interface_type, T);

 56:   UseCUPMHostAlloc_(bool) noexcept;
 57:   ~UseCUPMHostAlloc_() noexcept;

 59:   PETSC_NODISCARD bool value() const noexcept;

 61: private:
 62:     // would have loved to just do
 63:     //
 64:     // const auto oldmalloc = PetscTrMalloc;
 65:     //
 66:     // but in order to use auto the member needs to be static; in order to be static it must
 67:     // also be constexpr -- which in turn requires an initializer (also implicitly required by
 68:     // auto). But constexpr needs a constant expression initializer, so we can't initialize it
 69:     // with global (mutable) variables...
 70:   #define DECLTYPE_AUTO(left, right) decltype(right) left = right
 71:   const DECLTYPE_AUTO(oldmalloc_, PetscTrMalloc);
 72:   const DECLTYPE_AUTO(oldfree_, PetscTrFree);
 73:   const DECLTYPE_AUTO(oldrealloc_, PetscTrRealloc);
 74:   #undef DECLTYPE_AUTO
 75:   bool v_;
 76: };

 78: template <device::cupm::DeviceType T>
 79: inline UseCUPMHostAlloc_<T>::UseCUPMHostAlloc_(bool useit) noexcept : v_(useit)
 80: {
 81:   PetscFunctionBegin;
 82:   if (useit) {
 83:     // all unused arguments are unnamed; this saves having to add PETSC_UNUSED to them all
 84:     PetscTrMalloc = [](std::size_t sz, PetscBool clear, int, const char *, const char *, void **ptr) {
 85:       PetscFunctionBegin;
 86:       PetscCallCUPM(cupmMallocHost(ptr, sz));
 87:       if (clear) std::memset(*ptr, 0, sz);
 88:       PetscFunctionReturn(PETSC_SUCCESS);
 89:     };
 90:     PetscTrFree = [](void *ptr, int, const char *, const char *) {
 91:       PetscFunctionBegin;
 92:       PetscCallCUPM(cupmFreeHost(ptr));
 93:       PetscFunctionReturn(PETSC_SUCCESS);
 94:     };
 95:     PetscTrRealloc = [](std::size_t, int, const char *, const char *, void **) {
 96:       // REVIEW ME: can be implemented by malloc->copy->free?
 97:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "%s has no realloc()", cupmName());
 98:     };
 99:   }
100:   PetscFunctionReturnVoid();
101: }

103: template <device::cupm::DeviceType T>
104: inline bool UseCUPMHostAlloc_<T>::value() const noexcept
105: {
106:   return v_;
107: }

109: template <device::cupm::DeviceType T>
110: inline UseCUPMHostAlloc_<T>::~UseCUPMHostAlloc_() noexcept
111: {
112:   PetscFunctionBegin;
113:   if (value()) {
114:     PetscTrMalloc  = oldmalloc_;
115:     PetscTrFree    = oldfree_;
116:     PetscTrRealloc = oldrealloc_;
117:   }
118:   PetscFunctionReturnVoid();
119: }
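
// Illustrative usage (a sketch, not code from this file; use_pinned_memory, n, and
// host_array are placeholder names): construct the guard before a host allocation that
// should be pinned, and let its destructor restore the original
// PetscTrMalloc/PetscTrFree/PetscTrRealloc hooks:
//
//   {
//     const auto useit = UseCUPMHostAlloc_<T>(use_pinned_memory);
//
//     PetscCall(PetscMalloc1(n, &host_array)); // backed by cupmMallocHost() if use_pinned_memory
//   } // hooks restored here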

121: struct no_op {
122:   template <typename... T>
123:   constexpr PetscErrorCode operator()(T &&...) const noexcept
124:   {
125:     return PETSC_SUCCESS;
126:   }
127: };

129: template <typename T>
130: struct CooPair {
131:   using value_type = T;
132:   using size_type  = PetscCount;

134:   value_type *&device;
135:   value_type *&host;
136:   size_type    size;
137: };

139: template <typename U>
140: static constexpr CooPair<U> make_coo_pair(U *&device, U *&host, PetscCount size) noexcept
141: {
142:   return {device, host, size};
143: }
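
// Illustrative usage (a sketch; the host-side names are placeholders): make_coo_pair()
// bundles a device pointer, its host mirror, and a common length so that COO setup code
// can loop over such pairs generically, e.g.
//
//   auto jmap1 = make_coo_pair(vcu->jmap1_d, jmap1_host, m + 1);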

145: } // anonymous namespace

147: // forward declarations
148: template <device::cupm::DeviceType>
149: class VecSeq_CUPM;
150: template <device::cupm::DeviceType>
151: class VecMPI_CUPM;

153: // ==========================================================================================
154: // Vec_CUPMBase
155: //
156: // Base class for the VecSeq and VecMPI CUPM implementations. On top of the usual DeviceType
157: // template parameter it also uses CRTP to be able to use values/calls specific to either
158: // VecSeq or VecMPI. This is in effect "inside-out" polymorphism.
159: // ==========================================================================================
160: template <device::cupm::DeviceType T, typename Derived>
161: class Vec_CUPMBase : device::cupm::impl::BlasInterface<T> {
162: public:
163:   PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, T);
164:   // ==========================================================================================
165:   // Vec_CUPMBase::vector_array
166:   //
167:   // RAII versions of the get/restore array routines. Determines constness of the pointer type,
168:   // holds the pointer itself and provides the implicit conversion operator
169:   // ==========================================================================================
170:   template <PetscMemType, PetscMemoryAccessMode>
171:   class vector_array;

173: private:
174:   // A debug check to ensure that a given pointer-memtype pairing taken from user-land is
175:   // actually correct. Errors on mismatch
176:   static PetscErrorCode CheckPointerMatchesMemType_(const void *ptr, PetscMemType mtype) noexcept
177:   {
178:     PetscFunctionBegin;
179:     if (PetscDefined(USE_DEBUG) && ptr) {
180:       PetscMemType ptr_mtype;

182:       PetscCall(PetscCUPMGetMemType(ptr, &ptr_mtype));
183:       if (mtype == PETSC_MEMTYPE_HOST) {
184:         PetscCheck(PetscMemTypeHost(ptr_mtype), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
185:       } else if (mtype == PETSC_MEMTYPE_DEVICE) {
186:         // generic "device" memory should only care if the actual memtype is also generically
187:         // "device"
188:         PetscCheck(PetscMemTypeDevice(ptr_mtype), PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
189:       } else {
190:         PetscCheck(mtype == ptr_mtype, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Pointer %p declared as %s does not match actual memtype %s", ptr, PetscMemTypeToString(mtype), PetscMemTypeToString(ptr_mtype));
191:       }
192:     }
193:     PetscFunctionReturn(PETSC_SUCCESS);
194:   }

196:   // The final stop in the GetHandles_/GetHandlesFrom_ chain. This retrieves the various
197:   // compute handles and ensures the given PetscDeviceContext is of the right type
198:   static PetscErrorCode GetFromHandleDispatch_(PetscDeviceContext, cupmBlasHandle_t *, cupmStream_t *) noexcept;
199:   static PetscErrorCode GetHandleDispatch_(PetscDeviceContext *, cupmBlasHandle_t *, cupmStream_t *) noexcept;

201: protected:
202:   static PetscErrorCode VecView_Debug(Vec v, const char *message = "") noexcept
203:   {
204:     const auto   pobj  = PetscObjectCast(v);
205:     const auto   vimpl = VecIMPLCast(v);
206:     const auto   vcu   = VecCUPMCast(v);
207:     PetscMemType mtype;
208:     MPI_Comm     comm;

210:     PetscFunctionBegin;
213:     PetscCall(PetscObjectGetComm(pobj, &comm));
214:     PetscCall(PetscPrintf(comm, "---------- %s ----------\n", message));
215:     PetscCall(PetscObjectPrintClassNamePrefixType(pobj, PETSC_VIEWER_STDOUT_(comm)));
216:     PetscCall(PetscPrintf(comm, "Address:             %p\n", v));
217:     PetscCall(PetscPrintf(comm, "Size:                %" PetscInt_FMT "\n", v->map->n));
218:     PetscCall(PetscPrintf(comm, "Offload mask:        %s\n", PetscOffloadMaskToString(v->offloadmask)));
219:     PetscCall(PetscPrintf(comm, "Host ptr:            %p\n", vimpl->array));
220:     PetscCall(PetscPrintf(comm, "Device ptr:          %p\n", vcu->array_d));
221:     PetscCall(PetscPrintf(comm, "Device alloced ptr:  %p\n", vcu->array_allocated_d));
222:     PetscCall(PetscCUPMGetMemType(vcu->array_d, &mtype));
223:     PetscCall(PetscPrintf(comm, "dptr is device mem?  %s\n", PetscBools[static_cast<PetscBool>(PetscMemTypeDevice(mtype))]));
224:     PetscFunctionReturn(PETSC_SUCCESS);
225:   }

227:   // Helper routines to retrieve various combinations of handles. The first set (GetHandles_)
228:   // gets a PetscDeviceContext along with it, while the second set (GetHandlesFrom_) assumes
229:   // you've gotten the PetscDeviceContext already, and retrieves the handles from it. All of
230:   // them check that the PetscDeviceContext is of the appropriate type
231:   static PetscErrorCode GetHandles_(PetscDeviceContext *, cupmBlasHandle_t * = nullptr, cupmStream_t * = nullptr) noexcept;
232:   static PetscErrorCode GetHandles_(PetscDeviceContext *, cupmStream_t *) noexcept;
233:   static PetscErrorCode GetHandles_(cupmStream_t *) noexcept;
234:   static PetscErrorCode GetHandles_(cupmBlasHandle_t *) noexcept;

236:   static PetscErrorCode GetHandlesFrom_(PetscDeviceContext, cupmBlasHandle_t *, cupmStream_t * = nullptr) noexcept;
237:   static PetscErrorCode GetHandlesFrom_(PetscDeviceContext, cupmStream_t *) noexcept;

239:   // Delete the allocated device array if required and replace it with the given array
240:   static PetscErrorCode ResetAllocatedDevicePtr_(PetscDeviceContext, Vec, PetscScalar * = nullptr) noexcept;
241:   // Check whether either the host or device impl pointer is allocated, and allocate it if
242:   // it isn't. CastFunctionType casts the Vec to the required type and returns the pointer
243:   template <typename CastFunctionType>
244:   static PetscErrorCode VecAllocateCheck_(Vec, void *&, CastFunctionType &&) noexcept;
245:   // Check the CUPM part (v->spptr) is allocated, otherwise allocate it
246:   static PetscErrorCode VecCUPMAllocateCheck_(Vec) noexcept;
247:   // Check the Host part (v->data) is allocated, otherwise allocate it
248:   static PetscErrorCode VecIMPLAllocateCheck_(Vec) noexcept;
249:   // Check the Host array is allocated, otherwise allocate it
250:   static PetscErrorCode HostAllocateCheck_(PetscDeviceContext, Vec) noexcept;
251:   // Check the CUPM array is allocated, otherwise allocate it
252:   static PetscErrorCode DeviceAllocateCheck_(PetscDeviceContext, Vec) noexcept;
253:   // Copy HTOD, allocating device if necessary
254:   static PetscErrorCode CopyToDevice_(PetscDeviceContext, Vec, bool = false) noexcept;
255:   // Copy DTOH, allocating host if necessary
256:   static PetscErrorCode CopyToHost_(PetscDeviceContext, Vec, bool = false) noexcept;

258: public:
259:   struct Vec_CUPM {
260:     PetscScalar *array_d;           // gpu data
261:     PetscScalar *array_allocated_d; // device array allocated by PETSc (tracks whether PETSc owns array_d)
262:     PetscBool    nvshmem;           // is array allocated in nvshmem? It is used to allocate
263:                                     // Mvctx->lvec in nvshmem

265:     // COO stuff
266:     PetscCount *jmap1_d; // [m+1]: i-th entry of the vector has jmap1[i+1]-jmap1[i] repeats
267:                          // in COO arrays
268:     PetscCount *perm1_d; // [tot1]: permutation array for local entries
269:     PetscCount *imap2_d; // [nnz2]: i-th unique entry in recvbuf is imap2[i]-th entry in
270:                          // the vector
271:     PetscCount *jmap2_d; // [nnz2+1]
272:     PetscCount *perm2_d; // [recvlen]
273:     PetscCount *Cperm_d; // [sendlen]: permutation array to fill sendbuf[]. 'C' for
274:                          // communication

276:     // Buffers for remote values in VecSetValuesCOO()
277:     PetscScalar *sendbuf_d;
278:     PetscScalar *recvbuf_d;
279:   };
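
  // Illustrative example of the jmap1/perm1 encoding documented above (a sketch based on
  // those comments, not data from this file): for a local vector of length m = 3 and COO
  // row indices {0, 2, 0, 1}, entry 0 is hit twice (COO positions 0 and 2) and entries 1
  // and 2 once each, so jmap1 = {0, 2, 3, 4} and perm1 = {0, 2, 3, 1} with tot1 = 4.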

281:   // Cast the Vec to its Vec_CUPM struct, i.e. return the result of (Vec_CUPM *)v->spptr
282:   PETSC_NODISCARD static Vec_CUPM *VecCUPMCast(Vec) noexcept;
283:   // Cast the Vec to its host struct, i.e. return the result of (Vec_Seq *)v->data
284:   template <typename U = Derived>
285:   PETSC_NODISCARD static constexpr auto VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v));
286:   // Get the PetscLogEvents for HTOD and DTOH
287:   PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyToGPU() noexcept;
288:   PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyFromGPU() noexcept;
289:   // Get the VecTypes
290:   PETSC_NODISCARD static constexpr VecType VECSEQCUPM() noexcept;
291:   PETSC_NODISCARD static constexpr VecType VECMPICUPM() noexcept;
292:   // Get the VecType of the calling vector
293:   template <typename U = Derived>
294:   PETSC_NODISCARD static constexpr VecType         VECIMPLCUPM() noexcept;
295:   PETSC_NODISCARD static constexpr PetscRandomType PETSCDEVICERAND() noexcept;

297:   // Call the host destroy function, i.e. VecDestroy_Seq()
298:   static PetscErrorCode VecDestroy_IMPL(Vec) noexcept;
299:   // Call the host reset function, i.e. VecResetArray_Seq()
300:   static PetscErrorCode VecResetArray_IMPL(Vec) noexcept;
301:   // ... you get the idea
302:   static PetscErrorCode VecPlaceArray_IMPL(Vec, const PetscScalar *) noexcept;
303:   // Call the host creation function, i.e. VecCreate_Seq(), and also initialize the CUPM part
304:   // along with it if needed
305:   static PetscErrorCode VecCreate_IMPL_Private(Vec, PetscBool *, PetscInt = 0, PetscScalar * = nullptr) noexcept;

307:   // Shorthand for creating vector_array's. Need functions to create them, otherwise using them
308:   // as unnamed temporaries leads to the most vexing parse
309:   PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, v});
310:   PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
311:   PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
312:   PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, v});
313:   PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
314:   PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(vector_array<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});

316:   // disallow implicit conversion
317:   template <typename U>
318:   PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(U) noexcept = delete;
319:   // utility for using cupmHostAlloc()
320:   PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(bool) noexcept;
321:   PETSC_NODISCARD static UseCUPMHostAlloc_<T> UseCUPMHostAlloc(PetscBool) noexcept;

323:   // ops-table functions
324:   static PetscErrorCode create(Vec) noexcept;
325:   static PetscErrorCode destroy(Vec) noexcept;
326:   template <PetscMemType, PetscMemoryAccessMode, bool = false>
327:   static PetscErrorCode getarray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
328:   template <PetscMemType, PetscMemoryAccessMode, bool = false>
329:   static PetscErrorCode getarray(Vec, PetscScalar **) noexcept;
330:   template <PetscMemType, PetscMemoryAccessMode>
331:   static PetscErrorCode restorearray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
332:   template <PetscMemType, PetscMemoryAccessMode>
333:   static PetscErrorCode restorearray(Vec, PetscScalar **) noexcept;
334:   template <PetscMemoryAccessMode>
335:   static PetscErrorCode getarrayandmemtype(Vec, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
336:   template <PetscMemoryAccessMode>
337:   static PetscErrorCode getarrayandmemtype(Vec, PetscScalar **, PetscMemType *) noexcept;
338:   template <PetscMemoryAccessMode>
339:   static PetscErrorCode restorearrayandmemtype(Vec, PetscScalar **, PetscDeviceContext) noexcept;
340:   template <PetscMemoryAccessMode>
341:   static PetscErrorCode restorearrayandmemtype(Vec, PetscScalar **) noexcept;
342:   template <PetscMemType>
343:   static PetscErrorCode replacearray(Vec, const PetscScalar *) noexcept;
344:   template <PetscMemType>
345:   static PetscErrorCode resetarray(Vec) noexcept;
346:   template <PetscMemType>
347:   static PetscErrorCode placearray(Vec, const PetscScalar *) noexcept;

349:   // common ops shared between Seq and MPI
350:   static PetscErrorCode Create_CUPM(Vec) noexcept;
351:   static PetscErrorCode Create_CUPMBase(MPI_Comm, PetscInt, PetscInt, PetscInt, Vec *, PetscBool, PetscLayout /*reference*/ = nullptr) noexcept;
352:   static PetscErrorCode Initialize_CUPMBase(Vec, PetscBool, PetscScalar *, PetscScalar *, PetscDeviceContext) noexcept;
353:   template <typename SetupFunctionT = no_op>
354:   static PetscErrorCode Duplicate_CUPMBase(Vec, Vec *, PetscDeviceContext, SetupFunctionT && = SetupFunctionT{}) noexcept;
355:   static PetscErrorCode BindToCPU_CUPMBase(Vec, PetscBool, PetscDeviceContext) noexcept;
356:   static PetscErrorCode GetArrays_CUPMBase(Vec, const PetscScalar **, const PetscScalar **, PetscOffloadMask *, PetscDeviceContext) noexcept;
357:   static PetscErrorCode ResetPreallocationCOO_CUPMBase(Vec, PetscDeviceContext) noexcept;
358:   template <std::size_t NCount = 0, std::size_t NScal = 0>
359:   static PetscErrorCode SetPreallocationCOO_CUPMBase(Vec, PetscCount, const PetscInt[], PetscDeviceContext, const std::array<CooPair<PetscCount>, NCount> & = {}, const std::array<CooPair<PetscScalar>, NScal> & = {}) noexcept;
360: };
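
// To make the CRTP comment above concrete, a derived class passes itself as the second
// template parameter, roughly (a sketch only, not the actual definitions, which live
// elsewhere in the source tree):
//
//   template <device::cupm::DeviceType T>
//   class VecSeq_CUPM : Vec_CUPMBase<T, VecSeq_CUPM<T>> { /* Seq-specific pieces */ };
//
//   template <device::cupm::DeviceType T>
//   class VecMPI_CUPM : Vec_CUPMBase<T, VecMPI_CUPM<T>> { /* MPI-specific pieces */ };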

362: // ==========================================================================================
363: // Vec_CUPMBase::vector_array
364: //
365: // RAII versions of the get/restore array routines. Determines constness of the pointer type,
366: // holds the pointer itself and provides the implicit conversion operator.
367: //
368: // On construction this calls the moral equivalent of Vec[CUPM]GetArray[Read|Write]()
369: // (depending on PetscMemoryAccessMode) and on destruction automatically restores the array
370: // for you
371: // ==========================================================================================
372: template <device::cupm::DeviceType T, typename D>
373: template <PetscMemType MT, PetscMemoryAccessMode MA>
374: class Vec_CUPMBase<T, D>::vector_array {
375: public:
376:   static const auto memory_type = MT;
377:   static const auto access_type = MA;

379:   using value_type        = PetscScalar;
380:   using pointer_type      = value_type *;
381:   using cupm_pointer_type = cupmScalar_t *;

383:   vector_array(PetscDeviceContext, Vec) noexcept;
384:   ~vector_array() noexcept;

386:   constexpr vector_array(vector_array &&) noexcept            = default;
387:   constexpr vector_array &operator=(vector_array &&) noexcept = default;

389:   pointer_type      data() const noexcept;
390:   cupm_pointer_type cupmdata() const noexcept;

392:   operator pointer_type() const noexcept;
393:   // in case pointer_type == cupm_pointer_type we don't want this overload to exist, so
394:   // we make a dummy template parameter to allow SFINAE to nix it for us
395:   template <typename U = pointer_type, typename = util::enable_if_t<!std::is_same<U, cupm_pointer_type>::value>>
396:   operator cupm_pointer_type() const noexcept;

398: private:
399:   pointer_type       ptr_  = nullptr;
400:   PetscDeviceContext dctx_ = nullptr;
401:   Vec                v_    = nullptr;
402: };
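
// Illustrative usage (a sketch; dctx/xin/yout are placeholder names): the DeviceArrayRead()
// and DeviceArrayWrite() shorthands declared in Vec_CUPMBase return these RAII objects, so
// a typical operation body looks like
//
//   {
//     const auto xarr = DeviceArrayRead(dctx, xin);   // getarray<DEVICE, READ>
//     const auto yarr = DeviceArrayWrite(dctx, yout); // getarray<DEVICE, WRITE>
//     // ... launch kernels or BLAS calls using xarr.cupmdata() and yarr.cupmdata() ...
//   } // both arrays are restored automatically here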

404: // ==========================================================================================
405: // Vec_CUPMBase::vector_array - Static Variables
406: // ==========================================================================================

408: template <device::cupm::DeviceType T, typename D>
409: template <PetscMemType MT, PetscMemoryAccessMode MA>
410: const PetscMemType Vec_CUPMBase<T, D>::vector_array<MT, MA>::memory_type;

412: template <device::cupm::DeviceType T, typename D>
413: template <PetscMemType MT, PetscMemoryAccessMode MA>
414: const PetscMemoryAccessMode Vec_CUPMBase<T, D>::vector_array<MT, MA>::access_type;

416: // ==========================================================================================
417: // Vec_CUPMBase::vector_array - Public API
418: // ==========================================================================================

420: template <device::cupm::DeviceType T, typename D>
421: template <PetscMemType MT, PetscMemoryAccessMode MA>
422: inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::vector_array(PetscDeviceContext dctx, Vec v) noexcept : dctx_(dctx), v_(v)
423: {
424:   PetscFunctionBegin;
425:   PetscCallAbort(PETSC_COMM_SELF, getarray<MT, MA, true>(v, &ptr_, dctx));
426:   PetscFunctionReturnVoid();
427: }

429: template <device::cupm::DeviceType T, typename D>
430: template <PetscMemType MT, PetscMemoryAccessMode MA>
431: inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::~vector_array() noexcept
432: {
433:   PetscFunctionBegin;
434:   PetscCallAbort(PETSC_COMM_SELF, restorearray<MT, MA>(v_, &ptr_, dctx_));
435:   PetscFunctionReturnVoid();
436: }

438: template <device::cupm::DeviceType T, typename D>
439: template <PetscMemType MT, PetscMemoryAccessMode MA>
440: inline typename Vec_CUPMBase<T, D>::template vector_array<MT, MA>::pointer_type Vec_CUPMBase<T, D>::vector_array<MT, MA>::data() const noexcept
441: {
442:   return ptr_;
443: }

445: template <device::cupm::DeviceType T, typename D>
446: template <PetscMemType MT, PetscMemoryAccessMode MA>
447: inline typename Vec_CUPMBase<T, D>::template vector_array<MT, MA>::cupm_pointer_type Vec_CUPMBase<T, D>::vector_array<MT, MA>::cupmdata() const noexcept
448: {
449:   return cupmScalarPtrCast(data());
450: }

452: template <device::cupm::DeviceType T, typename D>
453: template <PetscMemType MT, PetscMemoryAccessMode MA>
454: inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::operator pointer_type() const noexcept
455: {
456:   return data();
457: }

459: // in case pointer_type == cupm_pointer_type we don't want this overload to exist, so
460: // we make a dummy template parameter to allow SFINAE to nix it for us
461: template <device::cupm::DeviceType T, typename D>
462: template <PetscMemType MT, PetscMemoryAccessMode MA>
463: template <typename U, typename>
464: inline Vec_CUPMBase<T, D>::vector_array<MT, MA>::operator cupm_pointer_type() const noexcept
465: {
466:   return cupmdata();
467: }

469: // ==========================================================================================
470: // Vec_CUPMBase - Private API
471: // ==========================================================================================

473: template <device::cupm::DeviceType T, typename D>
474: inline PetscErrorCode Vec_CUPMBase<T, D>::GetFromHandleDispatch_(PetscDeviceContext dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
475: {
476:   PetscFunctionBegin;
480:   if (PetscDefined(USE_DEBUG)) {
481:     PetscDeviceType dtype;

483:     PetscCall(PetscDeviceContextGetDeviceType(dctx, &dtype));
484:     PetscCheckCompatibleDeviceTypes(PETSC_DEVICE_CUPM(), -1, dtype, 1);
485:   }
486:   if (handle) PetscCall(PetscDeviceContextGetBLASHandle_Internal(dctx, handle));
487:   if (stream) PetscCall(PetscDeviceContextGetStreamHandle_Internal(dctx, stream));
488:   PetscFunctionReturn(PETSC_SUCCESS);
489: }

491: template <device::cupm::DeviceType T, typename D>
492: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandleDispatch_(PetscDeviceContext *dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
493: {
494:   PetscDeviceContext dctx_loc = nullptr;

496:   PetscFunctionBegin;
497:   // silence uninitialized variable warnings
498:   if (dctx) *dctx = nullptr;
499:   PetscCall(PetscDeviceContextGetCurrentContext(&dctx_loc));
500:   PetscCall(GetFromHandleDispatch_(dctx_loc, handle, stream));
501:   if (dctx) *dctx = dctx_loc;
502:   PetscFunctionReturn(PETSC_SUCCESS);
503: }

505: // ==========================================================================================
506: // Vec_CUPMBase - Protected API
507: // ==========================================================================================

509: template <device::cupm::DeviceType T, typename D>
510: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(PetscDeviceContext *dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
511: {
512:   return GetHandleDispatch_(dctx, handle, stream);
513: }

515: template <device::cupm::DeviceType T, typename D>
516: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(PetscDeviceContext *dctx, cupmStream_t *stream) noexcept
517: {
518:   return GetHandles_(dctx, nullptr, stream);
519: }

521: template <device::cupm::DeviceType T, typename D>
522: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(cupmStream_t *stream) noexcept
523: {
524:   return GetHandles_(nullptr, stream);
525: }

527: template <device::cupm::DeviceType T, typename D>
528: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandles_(cupmBlasHandle_t *handle) noexcept
529: {
530:   return GetHandles_(nullptr, handle);
531: }

533: template <device::cupm::DeviceType T, typename D>
534: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandlesFrom_(PetscDeviceContext dctx, cupmBlasHandle_t *handle, cupmStream_t *stream) noexcept
535: {
536:   return GetFromHandleDispatch_(dctx, handle, stream);
537: }

539: template <device::cupm::DeviceType T, typename D>
540: inline PetscErrorCode Vec_CUPMBase<T, D>::GetHandlesFrom_(PetscDeviceContext dctx, cupmStream_t *stream) noexcept
541: {
542:   return GetHandlesFrom_(dctx, nullptr, stream);
543: }
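
// Illustrative call pattern for the helpers above (variable names are placeholders):
//
//   PetscDeviceContext dctx;
//   cupmBlasHandle_t   handle;
//   cupmStream_t       stream;
//
//   PetscCall(GetHandles_(&dctx, &handle, &stream)); // current context plus both handles
//   PetscCall(GetHandlesFrom_(dctx, &stream));       // context already in hand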

545: template <device::cupm::DeviceType T, typename D>
546: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetAllocatedDevicePtr_(PetscDeviceContext dctx, Vec v, PetscScalar *new_value) noexcept
547: {
548:   auto &device_array = VecCUPMCast(v)->array_allocated_d;

550:   PetscFunctionBegin;
551:   if (device_array) {
552:     if (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) {
553:       PetscCall(PetscNvshmemFree(device_array));
554:     } else {
555:       cupmStream_t stream;

557:       PetscCall(GetHandlesFrom_(dctx, &stream));
558:       PetscCallCUPM(cupmFreeAsync(device_array, stream));
559:     }
560:   }
561:   device_array = new_value;
562:   PetscFunctionReturn(PETSC_SUCCESS);
563: }

565: namespace
566: {

568: inline PetscErrorCode VecCUPMCheckMinimumPinnedMemory_Internal(Vec v) noexcept
569: {
570:   auto      mem = static_cast<PetscInt>(v->minimum_bytes_pinned_memory);
571:   PetscBool flg;

573:   PetscFunctionBegin;
574:   PetscObjectOptionsBegin(PetscObjectCast(v));
575:   PetscCall(PetscOptionsRangeInt("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", mem, &mem, &flg, 0, std::numeric_limits<decltype(mem)>::max()));
576:   if (flg) v->minimum_bytes_pinned_memory = mem;
577:   PetscOptionsEnd();
578:   PetscFunctionReturn(PETSC_SUCCESS);
579: }
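
// For example, running with "-vec_pinned_memory_min 4096" (the option registered above)
// means that only host allocations larger than 4096 bytes for this vector use pinned
// (cupmMallocHost()) memory; HostAllocateCheck_() below is where the threshold is applied.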

581: } // anonymous namespace

583: template <device::cupm::DeviceType T, typename D>
584: template <typename CastFunctionType>
585: inline PetscErrorCode Vec_CUPMBase<T, D>::VecAllocateCheck_(Vec v, void *&dest, CastFunctionType &&cast) noexcept
586: {
587:   PetscFunctionBegin;
588:   if (PetscLikely(dest)) PetscFunctionReturn(PETSC_SUCCESS);
589:   // do the check here so we don't have to do it in every function
590:   PetscCall(checkCupmBlasIntCast(v->map->n));
591:   {
592:     auto impl = cast(v);

594:     PetscCall(PetscNew(&impl));
595:     dest = impl;
596:   }
597:   PetscFunctionReturn(PETSC_SUCCESS);
598: }

600: template <device::cupm::DeviceType T, typename D>
601: inline PetscErrorCode Vec_CUPMBase<T, D>::VecIMPLAllocateCheck_(Vec v) noexcept
602: {
603:   PetscFunctionBegin;
604:   PetscCall(VecAllocateCheck_(v, v->data, VecIMPLCast<D>));
605:   PetscFunctionReturn(PETSC_SUCCESS);
606: }

608: // Allocate the Vec_CUPM struct. This is normally done through DeviceAllocateCheck_(), but in
609: // certain circumstances (such as when the user places the device array) we do not want to do
610: // the full DeviceAllocateCheck_() as it also allocates the array
611: template <device::cupm::DeviceType T, typename D>
612: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCUPMAllocateCheck_(Vec v) noexcept
613: {
614:   PetscFunctionBegin;
615:   PetscCall(VecAllocateCheck_(v, v->spptr, VecCUPMCast));
616:   PetscFunctionReturn(PETSC_SUCCESS);
617: }

619: template <device::cupm::DeviceType T, typename D>
620: inline PetscErrorCode Vec_CUPMBase<T, D>::HostAllocateCheck_(PetscDeviceContext, Vec v) noexcept
621: {
622:   PetscFunctionBegin;
623:   PetscCall(VecIMPLAllocateCheck_(v));
624:   if (auto &alloc = VecIMPLCast(v)->array_allocated) PetscFunctionReturn(PETSC_SUCCESS);
625:   else {
626:     PetscCall(VecCUPMCheckMinimumPinnedMemory_Internal(v));
627:     {
628:       const auto n     = v->map->n;
629:       const auto useit = UseCUPMHostAlloc((n * sizeof(*alloc)) > v->minimum_bytes_pinned_memory);

631:       v->pinned_memory = static_cast<decltype(v->pinned_memory)>(useit.value());
632:       PetscCall(PetscMalloc1(n, &alloc));
633:     }
634:     if (!VecIMPLCast(v)->array) VecIMPLCast(v)->array = alloc;
635:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
636:   }
637:   PetscFunctionReturn(PETSC_SUCCESS);
638: }

640: template <device::cupm::DeviceType T, typename D>
641: inline PetscErrorCode Vec_CUPMBase<T, D>::DeviceAllocateCheck_(PetscDeviceContext dctx, Vec v) noexcept
642: {
643:   PetscFunctionBegin;
644:   PetscCall(VecCUPMAllocateCheck_(v));
645:   if (auto &alloc = VecCUPMCast(v)->array_d) PetscFunctionReturn(PETSC_SUCCESS);
646:   else {
647:     const auto   n                 = v->map->n;
648:     auto        &array_allocated_d = VecCUPMCast(v)->array_allocated_d;
649:     cupmStream_t stream;

651:     PetscCall(GetHandlesFrom_(dctx, &stream));
652:     PetscCall(PetscCUPMMallocAsync(&array_allocated_d, n, stream));
653:     alloc = array_allocated_d;
654:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
655:       const auto vimp = VecIMPLCast(v);
656:       v->offloadmask  = (vimp && vimp->array) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
657:     }
658:   }
659:   PetscFunctionReturn(PETSC_SUCCESS);
660: }

662: template <device::cupm::DeviceType T, typename D>
663: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToDevice_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
664: {
665:   PetscFunctionBegin;
666:   PetscCall(DeviceAllocateCheck_(dctx, v));
667:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
668:     cupmStream_t stream;

670:     v->offloadmask = PETSC_OFFLOAD_BOTH;
671:     PetscCall(GetHandlesFrom_(dctx, &stream));
672:     PetscCall(PetscLogEventBegin(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
673:     PetscCall(PetscCUPMMemcpyAsync(VecCUPMCast(v)->array_d, VecIMPLCast(v)->array, v->map->n, cupmMemcpyHostToDevice, stream, forceasync));
674:     PetscCall(PetscLogEventEnd(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
675:   }
676:   PetscFunctionReturn(PETSC_SUCCESS);
677: }

679: template <device::cupm::DeviceType T, typename D>
680: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToHost_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
681: {
682:   PetscFunctionBegin;
683:   PetscCall(HostAllocateCheck_(dctx, v));
684:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
685:     cupmStream_t stream;

687:     v->offloadmask = PETSC_OFFLOAD_BOTH;
688:     PetscCall(GetHandlesFrom_(dctx, &stream));
689:     PetscCall(PetscLogEventBegin(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
690:     PetscCall(PetscCUPMMemcpyAsync(VecIMPLCast(v)->array, VecCUPMCast(v)->array_d, v->map->n, cupmMemcpyDeviceToHost, stream, forceasync));
691:     PetscCall(PetscLogEventEnd(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
692:   }
693:   PetscFunctionReturn(PETSC_SUCCESS);
694: }

696: // ==========================================================================================
697: // Vec_CUPMBase - Public API
698: // ==========================================================================================

700: template <device::cupm::DeviceType T, typename D>
701: inline typename Vec_CUPMBase<T, D>::Vec_CUPM *Vec_CUPMBase<T, D>::VecCUPMCast(Vec v) noexcept
702: {
703:   return static_cast<Vec_CUPM *>(v->spptr);
704: }

706: // This is a trick to get around the fact that in CRTP the derived class is not yet fully
707: // defined because Base<Derived> must necessarily be instantiated before Derived is
708: // complete. By using a dummy template parameter we make the type "dependent" and so will
709: // only be determined when the derived class is instantiated (and therefore fully defined)
710: template <device::cupm::DeviceType T, typename D>
711: template <typename U>
712: inline constexpr auto Vec_CUPMBase<T, D>::VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v))
713: {
714:   return U::VecIMPLCast_(v);
715: }

717: template <device::cupm::DeviceType T, typename D>
718: inline PetscErrorCode Vec_CUPMBase<T, D>::VecDestroy_IMPL(Vec v) noexcept
719: {
720:   return D::VecDestroy_IMPL_(v);
721: }

723: template <device::cupm::DeviceType T, typename D>
724: inline PetscErrorCode Vec_CUPMBase<T, D>::VecResetArray_IMPL(Vec v) noexcept
725: {
726:   return D::VecResetArray_IMPL_(v);
727: }

729: template <device::cupm::DeviceType T, typename D>
730: inline PetscErrorCode Vec_CUPMBase<T, D>::VecPlaceArray_IMPL(Vec v, const PetscScalar *a) noexcept
731: {
732:   return D::VecPlaceArray_IMPL_(v, a);
733: }

735: template <device::cupm::DeviceType T, typename D>
736: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCreate_IMPL_Private(Vec v, PetscBool *alloc_missing, PetscInt nghost, PetscScalar *host_array) noexcept
737: {
738:   return D::VecCreate_IMPL_Private_(v, alloc_missing, nghost, host_array);
739: }

741: template <device::cupm::DeviceType T, typename D>
742: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyToGPU() noexcept
743: {
744:   return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyToGPU : VEC_HIPCopyToGPU;
745: }

747: template <device::cupm::DeviceType T, typename D>
748: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyFromGPU() noexcept
749: {
750:   return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyFromGPU : VEC_HIPCopyFromGPU;
751: }

753: template <device::cupm::DeviceType T, typename D>
754: inline constexpr VecType Vec_CUPMBase<T, D>::VECSEQCUPM() noexcept
755: {
756:   return T == device::cupm::DeviceType::CUDA ? VECSEQCUDA : VECSEQHIP;
757: }

759: template <device::cupm::DeviceType T, typename D>
760: inline constexpr VecType Vec_CUPMBase<T, D>::VECMPICUPM() noexcept
761: {
762:   return T == device::cupm::DeviceType::CUDA ? VECMPICUDA : VECMPIHIP;
763: }

765: template <device::cupm::DeviceType T, typename D>
766: template <typename U>
767: inline constexpr VecType Vec_CUPMBase<T, D>::VECIMPLCUPM() noexcept
768: {
769:   return U::VECIMPLCUPM_();
770: }

772: template <device::cupm::DeviceType T, typename D>
773: inline constexpr PetscRandomType Vec_CUPMBase<T, D>::PETSCDEVICERAND() noexcept
774: {
775:   // REVIEW ME: HIP default rng?
776:   return T == device::cupm::DeviceType::CUDA ? PETSCCURAND : PETSCRANDER48;
777: }

779: // utility for using cupmHostAlloc()
780: template <device::cupm::DeviceType T, typename D>
781: inline UseCUPMHostAlloc_<T> Vec_CUPMBase<T, D>::UseCUPMHostAlloc(bool b) noexcept
782: {
783:   return {b};
784: }

786: template <device::cupm::DeviceType T, typename D>
787: inline UseCUPMHostAlloc_<T> Vec_CUPMBase<T, D>::UseCUPMHostAlloc(PetscBool b) noexcept
788: {
789:   return UseCUPMHostAlloc(static_cast<bool>(b));
790: }

792: // private version that takes a PetscDeviceContext, called by the public variant
793: template <device::cupm::DeviceType T, typename D>
794: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
795: inline PetscErrorCode Vec_CUPMBase<T, D>::getarray(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
796: {
797:   constexpr auto hostmem     = PetscMemTypeHost(mtype);
798:   const auto     oldmask     = v->offloadmask;
799:   auto          &mask        = v->offloadmask;
800:   auto           should_sync = false;

802:   PetscFunctionBegin;
803:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
804:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
805:   if (PetscMemoryAccessRead(access)) {
806:     // READ or READ_WRITE
807:     if (((oldmask == PETSC_OFFLOAD_GPU) && hostmem) || ((oldmask == PETSC_OFFLOAD_CPU) && !hostmem)) {
808:       // if we move the data we should set the flag to synchronize later on
809:       should_sync = true;
810:     }
811:     PetscCall((hostmem ? CopyToHost_ : CopyToDevice_)(dctx, v, force));
812:   } else {
813:     // WRITE only
814:     PetscCall((hostmem ? HostAllocateCheck_ : DeviceAllocateCheck_)(dctx, v));
815:   }
816:   *a = hostmem ? VecIMPLCast(v)->array : VecCUPMCast(v)->array_d;
817:   // if unallocated previously we should zero things out if we intend to read
818:   if (PetscMemoryAccessRead(access) && (oldmask == PETSC_OFFLOAD_UNALLOCATED)) {
819:     const auto n = v->map->n;

821:     if (hostmem) {
822:       PetscCall(PetscArrayzero(*a, n));
823:     } else {
824:       cupmStream_t stream;

826:       PetscCall(GetHandlesFrom_(dctx, &stream));
827:       PetscCall(PetscCUPMMemsetAsync(*a, 0, n, stream, force));
828:       should_sync = true;
829:     }
830:   }
831:   // update the offloadmask if we intend to write, since we assume the data is immediately modified
832:   if (PetscMemoryAccessWrite(access)) {
833:     PetscCall(VecSetErrorIfLocked(v, 1));
834:     // REVIEW ME: this should probably also call PetscObjectStateIncrease() since we assume it
835:     // is immediately modified
836:     mask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
837:   }
838:   // if we are a globally blocking stream and we have MOVED data then we should synchronize,
839:   // since even doing async calls on the NULL stream is not synchronous
840:   if (!force && should_sync) PetscCall(PetscDeviceContextSynchronize(dctx));
841:   PetscFunctionReturn(PETSC_SUCCESS);
842: }

844: // v->ops->getarray[read|write] or VecCUPMGetArray[Read|Write]()
845: template <device::cupm::DeviceType T, typename D>
846: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
847: inline PetscErrorCode Vec_CUPMBase<T, D>::getarray(Vec v, PetscScalar **a) noexcept
848: {
849:   PetscDeviceContext dctx;

851:   PetscFunctionBegin;
852:   PetscCall(GetHandles_(&dctx));
853:   PetscCall(getarray<mtype, access, force>(v, a, dctx));
854:   PetscFunctionReturn(PETSC_SUCCESS);
855: }

857: // private version that takes a PetscDeviceContext, called by the public variant
858: template <device::cupm::DeviceType T, typename D>
859: template <PetscMemType mtype, PetscMemoryAccessMode access>
860: inline PetscErrorCode Vec_CUPMBase<T, D>::restorearray(Vec v, PetscScalar **a, PetscDeviceContext) noexcept
861: {
862:   PetscFunctionBegin;
863:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
864:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
865:   if (PetscMemoryAccessWrite(access)) {
866:     // WRITE or READ_WRITE
867:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
868:     v->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
869:   }
870:   if (a) {
871:     PetscCall(CheckPointerMatchesMemType_(*a, mtype));
872:     *a = nullptr;
873:   }
874:   PetscFunctionReturn(PETSC_SUCCESS);
875: }

877: // v->ops->restorearray[read|write] or VecCUPMRestoreArray[Read|Write]()
878: template <device::cupm::DeviceType T, typename D>
879: template <PetscMemType mtype, PetscMemoryAccessMode access>
880: inline PetscErrorCode Vec_CUPMBase<T, D>::restorearray(Vec v, PetscScalar **a) noexcept
881: {
882:   PetscDeviceContext dctx;

884:   PetscFunctionBegin;
885:   PetscCall(GetHandles_(&dctx));
886:   PetscCall(restorearray<mtype, access>(v, a, dctx));
887:   PetscFunctionReturn(PETSC_SUCCESS);
888: }

890: template <device::cupm::DeviceType T, typename D>
891: template <PetscMemoryAccessMode access>
892: inline PetscErrorCode Vec_CUPMBase<T, D>::getarrayandmemtype(Vec v, PetscScalar **a, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
893: {
894:   PetscFunctionBegin;
895:   PetscCall(getarray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
896:   if (mtype) *mtype = (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUPM();
897:   PetscFunctionReturn(PETSC_SUCCESS);
898: }

900: // v->ops->getarrayandmemtype
901: template <device::cupm::DeviceType T, typename D>
902: template <PetscMemoryAccessMode access>
903: inline PetscErrorCode Vec_CUPMBase<T, D>::getarrayandmemtype(Vec v, PetscScalar **a, PetscMemType *mtype) noexcept
904: {
905:   PetscDeviceContext dctx;

907:   PetscFunctionBegin;
908:   PetscCall(GetHandles_(&dctx));
909:   PetscCall(getarrayandmemtype<access>(v, a, mtype, dctx));
910:   PetscFunctionReturn(PETSC_SUCCESS);
911: }

913: template <device::cupm::DeviceType T, typename D>
914: template <PetscMemoryAccessMode access>
915: inline PetscErrorCode Vec_CUPMBase<T, D>::restorearrayandmemtype(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
916: {
917:   PetscFunctionBegin;
918:   PetscCall(restorearray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
919:   PetscFunctionReturn(PETSC_SUCCESS);
920: }

922: // v->ops->restorearrayandmemtype
923: template <device::cupm::DeviceType T, typename D>
924: template <PetscMemoryAccessMode access>
925: inline PetscErrorCode Vec_CUPMBase<T, D>::restorearrayandmemtype(Vec v, PetscScalar **a) noexcept
926: {
927:   PetscDeviceContext dctx;

929:   PetscFunctionBegin;
930:   PetscCall(GetHandles_(&dctx));
931:   PetscCall(restorearrayandmemtype<access>(v, a, dctx));
932:   PetscFunctionReturn(PETSC_SUCCESS);
933: }

935: // v->ops->placearray or VecCUPMPlaceArray()
936: template <device::cupm::DeviceType T, typename D>
937: template <PetscMemType mtype>
938: inline PetscErrorCode Vec_CUPMBase<T, D>::placearray(Vec v, const PetscScalar *a) noexcept
939: {
940:   PetscDeviceContext dctx;

942:   PetscFunctionBegin;
943:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
944:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
945:   PetscCall(CheckPointerMatchesMemType_(a, mtype));
946:   PetscCall(GetHandles_(&dctx));
947:   if (PetscMemTypeHost(mtype)) {
948:     PetscCall(CopyToHost_(dctx, v));
949:     PetscCall(VecPlaceArray_IMPL(v, a));
950:     v->offloadmask = PETSC_OFFLOAD_CPU;
951:   } else {
952:     PetscCall(VecIMPLAllocateCheck_(v));
953:     {
954:       auto &backup_array = VecIMPLCast(v)->unplacedarray;

956:       PetscCheck(!backup_array, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
957:       PetscCall(CopyToDevice_(dctx, v));
958:       PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
959:       backup_array = util::exchange(VecCUPMCast(v)->array_d, const_cast<PetscScalar *>(a));
960:       // only update the offload mask if we actually assign a pointer
961:       if (a) v->offloadmask = PETSC_OFFLOAD_GPU;
962:     }
963:   }
964:   PetscFunctionReturn(PETSC_SUCCESS);
965: }

967: // v->ops->replacearray or VecCUPMReplaceArray()
968: template <device::cupm::DeviceType T, typename D>
969: template <PetscMemType mtype>
970: inline PetscErrorCode Vec_CUPMBase<T, D>::replacearray(Vec v, const PetscScalar *a) noexcept
971: {
972:   const auto         aptr = const_cast<PetscScalar *>(a);
973:   PetscDeviceContext dctx;

975:   PetscFunctionBegin;
976:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
977:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
978:   PetscCall(CheckPointerMatchesMemType_(a, mtype));
979:   PetscCall(GetHandles_(&dctx));
980:   if (PetscMemTypeHost(mtype)) {
981:     PetscCall(VecIMPLAllocateCheck_(v));
982:     {
983:       const auto vimpl      = VecIMPLCast(v);
984:       auto      &host_array = vimpl->array_allocated;

986:       // make sure the user's array has the latest values.
987:       // REVIEW ME: why? we're about to free it
988:       if (host_array != vimpl->array) PetscCall(CopyToHost_(dctx, v));
989:       if (host_array) {
990:         const auto useit = UseCUPMHostAlloc(v->pinned_memory);

992:         PetscCall(PetscFree(host_array));
993:       }
994:       host_array       = aptr;
995:       vimpl->array     = host_array;
996:       v->pinned_memory = PETSC_FALSE; // REVIEW ME: we can determine this
997:       v->offloadmask   = PETSC_OFFLOAD_CPU;
998:     }
999:   } else {
1000:     PetscCall(VecCUPMAllocateCheck_(v));
1001:     {
1002:       const auto vcu = VecCUPMCast(v);

1004:       PetscCall(ResetAllocatedDevicePtr_(dctx, v, aptr));
1005:       // don't update the offloadmask if the replacement pointer is NULL
1006:       vcu->array_d = vcu->array_allocated_d /* = aptr */;
1007:       if (aptr) v->offloadmask = PETSC_OFFLOAD_GPU;
1008:     }
1009:   }
1010:   PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
1011:   PetscFunctionReturn(PETSC_SUCCESS);
1012: }

1014: // v->ops->resetarray or VecCUPMResetArray()
1015: template <device::cupm::DeviceType T, typename D>
1016: template <PetscMemType mtype>
1017: inline PetscErrorCode Vec_CUPMBase<T, D>::resetarray(Vec v) noexcept
1018: {
1019:   PetscDeviceContext dctx;

1021:   PetscFunctionBegin;
1022:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1023:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1024:   PetscCall(GetHandles_(&dctx));
1025:   // REVIEW ME:
1026:   // this is wildly inefficient but must be done if we assume that the placed array must have
1027:   // correct values
1028:   if (PetscMemTypeHost(mtype)) {
1029:     PetscCall(CopyToHost_(dctx, v));
1030:     PetscCall(VecResetArray_IMPL(v));
1031:     v->offloadmask = PETSC_OFFLOAD_CPU;
1032:   } else {
1033:     PetscCall(VecIMPLAllocateCheck_(v));
1034:     PetscCall(VecCUPMAllocateCheck_(v));
1035:     {
1036:       const auto vcu        = VecCUPMCast(v);
1037:       const auto vimpl      = VecIMPLCast(v);
1038:       auto      &host_array = vimpl->unplacedarray;

1040:       PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_DEVICE));
1041:       PetscCall(CopyToDevice_(dctx, v));
1042:       PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
1043:       // Need to reset the offloadmask. If we had a stashed pointer we are on the GPU,
1044:       // otherwise check if the host has a valid pointer. If neither, then we are not
1045:       // allocated.
1046:       vcu->array_d = host_array;
1047:       if (host_array) {
1048:         host_array     = nullptr;
1049:         v->offloadmask = PETSC_OFFLOAD_GPU;
1050:       } else if (vimpl->array) {
1051:         v->offloadmask = PETSC_OFFLOAD_CPU;
1052:       } else {
1053:         v->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1054:       }
1055:     }
1056:   }
1057:   PetscFunctionReturn(PETSC_SUCCESS);
1058: }

1060: // v->ops->create
1061: template <device::cupm::DeviceType T, typename D>
1062: inline PetscErrorCode Vec_CUPMBase<T, D>::create(Vec v) noexcept
1063: {
1064:   PetscBool          alloc_missing;
1065:   PetscDeviceContext dctx;

1067:   PetscFunctionBegin;
1068:   PetscCall(VecCreate_IMPL_Private(v, &alloc_missing));
1069:   PetscCall(GetHandles_(&dctx));
1070:   PetscCall(Initialize_CUPMBase(v, alloc_missing, nullptr, nullptr, dctx));
1071:   PetscFunctionReturn(PETSC_SUCCESS);
1072: }

1074: // v->ops->destroy
1075: template <device::cupm::DeviceType T, typename D>
1076: inline PetscErrorCode Vec_CUPMBase<T, D>::destroy(Vec v) noexcept
1077: {
1078:   PetscFunctionBegin;
1079:   if (const auto vcu = VecCUPMCast(v)) {
1080:     PetscDeviceContext dctx;

1082:     PetscCall(GetHandles_(&dctx));
1083:     PetscCall(ResetAllocatedDevicePtr_(dctx, v));
1084:     PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1085:     PetscCall(PetscFree(v->spptr));
1086:   }
1087:   PetscCall(PetscObjectSAWsViewOff(PetscObjectCast(v)));
1088:   if (const auto vimpl = VecIMPLCast(v)) {
1089:     if (auto &array_allocated = vimpl->array_allocated) {
1090:       const auto useit = UseCUPMHostAlloc(v->pinned_memory);

1092:       // do this ourselves since we may want to use the cupm functions
1093:       PetscCall(PetscFree(array_allocated));
1094:     }
1095:   }
1096:   v->pinned_memory = PETSC_FALSE;
1097:   PetscCall(VecDestroy_IMPL(v));
1098:   PetscFunctionReturn(PETSC_SUCCESS);
1099: }

1101: // ================================================================================== //
1102: //                      Common core between Seq and MPI                               //

1104: // VecCreate_CUPM()
1105: template <device::cupm::DeviceType T, typename D>
1106: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPM(Vec v) noexcept
1107: {
1108:   PetscMPIInt size;

1110:   PetscFunctionBegin;
1111:   PetscCallMPI(MPI_Comm_size(PetscObjectComm(PetscObjectCast(v)), &size));
1112:   PetscCall(VecSetType(v, size > 1 ? VECMPICUPM() : VECSEQCUPM()));
1113:   PetscFunctionReturn(PETSC_SUCCESS);
1114: }

1116: // VecCreateCUPM()
1117: template <device::cupm::DeviceType T, typename D>
1118: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPMBase(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, Vec *v, PetscBool call_set_type, PetscLayout reference) noexcept
1119: {
1120:   PetscFunctionBegin;
1121:   PetscCall(VecCreate(comm, v));
1122:   if (reference) PetscCall(PetscLayoutReference(reference, &(*v)->map));
1123:   PetscCall(VecSetSizes(*v, n, N));
1124:   if (bs) PetscCall(VecSetBlockSize(*v, bs));
1125:   if (call_set_type) PetscCall(VecSetType(*v, VECIMPLCUPM()));
1126:   PetscFunctionReturn(PETSC_SUCCESS);
1127: }

1129: // VecCreateIMPL_CUPM(), called through v->ops->create
1130: template <device::cupm::DeviceType T, typename D>
1131: inline PetscErrorCode Vec_CUPMBase<T, D>::Initialize_CUPMBase(Vec v, PetscBool allocate_missing, PetscScalar *host_array, PetscScalar *device_array, PetscDeviceContext dctx) noexcept
1132: {
1133:   PetscFunctionBegin;
1134:   // REVIEW ME: perhaps not needed
1135:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
1136:   PetscCall(PetscObjectChangeTypeName(PetscObjectCast(v), VECIMPLCUPM()));
1137:   PetscCall(D::bindtocpu(v, PETSC_FALSE));
1138:   if (device_array) {
1139:     PetscCall(CheckPointerMatchesMemType_(device_array, PETSC_MEMTYPE_CUPM()));
1140:     PetscCall(VecCUPMAllocateCheck_(v));
1141:     VecCUPMCast(v)->array_d = device_array;
1142:   }
1143:   if (host_array) {
1144:     PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_HOST));
1145:     VecIMPLCast(v)->array = host_array;
1146:   }
1147:   if (allocate_missing) {
1148:     PetscCall(DeviceAllocateCheck_(dctx, v));
1149:     PetscCall(HostAllocateCheck_(dctx, v));
1150:     // REVIEW ME: junchao, is this needed with new calloc() branch? VecSet() will call
1151:     // set() for reference
1152:     // calls device-version
1153:     PetscCall(VecSet(v, 0));
1154:     // zero the host while device is underway
1155:     PetscCall(PetscArrayzero(VecIMPLCast(v)->array, v->map->n));
1156:     v->offloadmask = PETSC_OFFLOAD_BOTH;
1157:   } else {
1158:     if (host_array) {
1159:       v->offloadmask = device_array ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
1160:     } else {
1161:       v->offloadmask = device_array ? PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_UNALLOCATED;
1162:     }
1163:   }
1164:   PetscFunctionReturn(PETSC_SUCCESS);
1165: }

1167: // v->ops->duplicate
1168: template <device::cupm::DeviceType T, typename D>
1169: template <typename SetupFunctionT>
1170: inline PetscErrorCode Vec_CUPMBase<T, D>::Duplicate_CUPMBase(Vec v, Vec *y, PetscDeviceContext dctx, SetupFunctionT &&DerivedCreateIMPLCUPM_Async) noexcept
1171: {
1172:   // if the derived setup is the default no_op then we should call VecSetType()
1173:   constexpr auto call_set_type = static_cast<PetscBool>(std::is_same<SetupFunctionT, no_op>::value);
1174:   const auto     vobj          = PetscObjectCast(v);
1175:   const auto     map           = v->map;
1176:   PetscInt       bs;

1178:   PetscFunctionBegin;
1179:   PetscCall(VecGetBlockSize(v, &bs));
1180:   PetscCall(Create_CUPMBase(PetscObjectComm(vobj), bs, map->n, map->N, y, call_set_type, map));
1181:   // Derived class can set up the remainder of the data structures here
1182:   PetscCall(DerivedCreateIMPLCUPM_Async(*y));
1183:   // If the other vector is bound to CPU then the memcpy of the ops struct will give the
1184:   // duplicated vector the host "getarray" function which does not lazily allocate the array
1185:   // (as it is assumed to always exist). So we force allocation here, before we overwrite the
1186:   // ops
1187:   if (v->boundtocpu) PetscCall(HostAllocateCheck_(dctx, *y));
1188:   // in case the user has done some VecSetOps() tomfoolery
1189:   PetscCall(PetscArraycpy((*y)->ops, v->ops, 1));
1190:   {
1191:     const auto yobj = PetscObjectCast(*y);

1193:     PetscCall(PetscObjectListDuplicate(vobj->olist, &yobj->olist));
1194:     PetscCall(PetscFunctionListDuplicate(vobj->qlist, &yobj->qlist));
1195:   }
1196:   (*y)->stash.donotstash   = v->stash.donotstash;
1197:   (*y)->stash.ignorenegidx = v->stash.ignorenegidx;
1198:   (*y)->map->bs            = std::abs(v->map->bs);
1199:   (*y)->bstash.bs          = v->bstash.bs;
1200:   PetscFunctionReturn(PETSC_SUCCESS);
1201: }
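
// Illustrative use from a derived class (a sketch; the lambda body is a placeholder): the
// trailing functor lets the derived class finish setting up the duplicate itself, in which
// case VecSetType() is not called here:
//
//   PetscCall(Duplicate_CUPMBase(v, y, dctx, [](Vec z) {
//     PetscFunctionBegin;
//     // ... derived-class-specific initialization of z ...
//     PetscFunctionReturn(PETSC_SUCCESS);
//   }));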

1203:   #define VecSetOp_CUPM(op_name, op_host, ...) \
1204:     do { \
1205:       if (usehost) { \
1206:         v->ops->op_name = op_host; \
1207:       } else { \
1208:         v->ops->op_name = __VA_ARGS__; \
1209:       } \
1210:     } while (0)

1212: // v->ops->bindtocpu
1213: template <device::cupm::DeviceType T, typename D>
1214: inline PetscErrorCode Vec_CUPMBase<T, D>::BindToCPU_CUPMBase(Vec v, PetscBool usehost, PetscDeviceContext dctx) noexcept
1215: {
1216:   const auto change_default_rand_type = [](PetscRandomType target, char **ptr) {
1217:     PetscFunctionBegin;
1220:     if (std::strcmp(target, *ptr)) {
1221:       PetscCall(PetscFree(*ptr));
1222:       PetscCall(PetscStrallocpy(target, ptr));
1223:     }
1224:     PetscFunctionReturn(PETSC_SUCCESS);
1225:   };

1227:   PetscFunctionBegin;
1228:   v->boundtocpu = usehost;
1229:   if (usehost) PetscCall(CopyToHost_(dctx, v));
1230:   PetscCall(change_default_rand_type(usehost ? PETSCRANDER48 : PETSCDEVICERAND(), &v->defaultrandtype));

1232:   // set the base functions that are guaranteed to be the same for both
1233:   v->ops->duplicate = D::duplicate;
1234:   v->ops->create    = create;
1235:   v->ops->destroy   = destroy;
1236:   v->ops->bindtocpu = D::bindtocpu;
1237:   // Note that setting these to NULL on host breaks convergence in certain areas. I don't know
1238:   // why, and I don't know how, but it is IMPERATIVE these are set as such!
1239:   v->ops->replacearray = replacearray<PETSC_MEMTYPE_HOST>;
1240:   v->ops->restorearray = restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>;

1242:   // set device-only common functions
1243:   VecSetOp_CUPM(dotnorm2, nullptr, D::dotnorm2);
1244:   VecSetOp_CUPM(getarray, nullptr, getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1245:   VecSetOp_CUPM(getarraywrite, nullptr, getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1246:   VecSetOp_CUPM(restorearraywrite, nullptr, restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);

1248:   VecSetOp_CUPM(getarrayread, nullptr, [](Vec v, const PetscScalar **a) { return getarray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1249:   VecSetOp_CUPM(restorearrayread, nullptr, [](Vec v, const PetscScalar **a) { return restorearray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });

1251:   VecSetOp_CUPM(getarrayandmemtype, nullptr, getarrayandmemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
1252:   VecSetOp_CUPM(restorearrayandmemtype, nullptr, restorearrayandmemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);

1254:   VecSetOp_CUPM(getarraywriteandmemtype, nullptr, getarrayandmemtype<PETSC_MEMORY_ACCESS_WRITE>);
1255:   VecSetOp_CUPM(restorearraywriteandmemtype, nullptr, [](Vec v, PetscScalar **a, PetscMemType *) { return restorearrayandmemtype<PETSC_MEMORY_ACCESS_WRITE>(v, a); });

1257:   VecSetOp_CUPM(getarrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a, PetscMemType *m) { return getarrayandmemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a), m); });
1258:   VecSetOp_CUPM(restorearrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a) { return restorearrayandmemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });

1260:   // set the functions that are always sequential
1261:   using VecSeq_T = VecSeq_CUPM<T>;
1262:   VecSetOp_CUPM(scale, VecScale_Seq, VecSeq_T::scale);
1263:   VecSetOp_CUPM(copy, VecCopy_Seq, VecSeq_T::copy);
1264:   VecSetOp_CUPM(set, VecSet_Seq, VecSeq_T::set);
1265:   VecSetOp_CUPM(swap, VecSwap_Seq, VecSeq_T::swap);
1266:   VecSetOp_CUPM(axpy, VecAXPY_Seq, VecSeq_T::axpy);
1267:   VecSetOp_CUPM(axpby, VecAXPBY_Seq, VecSeq_T::axpby);
1268:   VecSetOp_CUPM(maxpy, VecMAXPY_Seq, VecSeq_T::maxpy);
1269:   VecSetOp_CUPM(aypx, VecAYPX_Seq, VecSeq_T::aypx);
1270:   VecSetOp_CUPM(waxpy, VecWAXPY_Seq, VecSeq_T::waxpy);
1271:   VecSetOp_CUPM(axpbypcz, VecAXPBYPCZ_Seq, VecSeq_T::axpbypcz);
1272:   VecSetOp_CUPM(pointwisemult, VecPointwiseMult_Seq, VecSeq_T::pointwisemult);
1273:   VecSetOp_CUPM(pointwisedivide, VecPointwiseDivide_Seq, VecSeq_T::pointwisedivide);
1274:   VecSetOp_CUPM(setrandom, VecSetRandom_Seq, VecSeq_T::setrandom);
1275:   VecSetOp_CUPM(dot_local, VecDot_Seq, VecSeq_T::dot);
1276:   VecSetOp_CUPM(tdot_local, VecTDot_Seq, VecSeq_T::tdot);
1277:   VecSetOp_CUPM(norm_local, VecNorm_Seq, VecSeq_T::norm);
1278:   VecSetOp_CUPM(mdot_local, VecMDot_Seq, VecSeq_T::mdot);
1279:   VecSetOp_CUPM(reciprocal, VecReciprocal_Default, VecSeq_T::reciprocal);
1280:   VecSetOp_CUPM(shift, nullptr, VecSeq_T::shift);
1281:   VecSetOp_CUPM(getlocalvector, nullptr, VecSeq_T::template getlocalvector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1282:   VecSetOp_CUPM(restorelocalvector, nullptr, VecSeq_T::template restorelocalvector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1283:   VecSetOp_CUPM(getlocalvectorread, nullptr, VecSeq_T::template getlocalvector<PETSC_MEMORY_ACCESS_READ>);
1284:   VecSetOp_CUPM(restorelocalvectorread, nullptr, VecSeq_T::template restorelocalvector<PETSC_MEMORY_ACCESS_READ>);
1285:   VecSetOp_CUPM(sum, nullptr, VecSeq_T::sum);
1286:   PetscFunctionReturn(PETSC_SUCCESS);
1287: }
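// From user code the dispatch above is reached through the public API; an illustrative
// sequence (CUDA spelling shown, the HIP equivalent is analogous):
//
//   Vec x;
//
//   PetscCall(VecCreateSeqCUDA(PETSC_COMM_SELF, 128, &x));
//   PetscCall(VecBindToCPU(x, PETSC_TRUE));  /* ops now point at the VecXXX_Seq host kernels  */
//   PetscCall(VecBindToCPU(x, PETSC_FALSE)); /* ops revert to the VecSeq_CUPM device variants */
//   PetscCall(VecDestroy(&x));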

1289: // Called from VecGetSubVector()
1290: template <device::cupm::DeviceType T, typename D>
1291: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrays_CUPMBase(Vec v, const PetscScalar **host_array, const PetscScalar **device_array, PetscOffloadMask *mask, PetscDeviceContext dctx) noexcept
1292: {
1293:   PetscFunctionBegin;
1294:   PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1295:   if (host_array) {
1296:     PetscCall(HostAllocateCheck_(dctx, v));
1297:     *host_array = VecIMPLCast(v)->array;
1298:   }
1299:   if (device_array) {
1300:     PetscCall(DeviceAllocateCheck_(dctx, v));
1301:     *device_array = VecCUPMCast(v)->array_d;
1302:   }
1303:   if (mask) *mask = v->offloadmask;
1304:   PetscFunctionReturn(PETSC_SUCCESS);
1305: }
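// Illustrative call-site sketch (not the actual VecGetSubVector() code): the arrays are
// handed out after being allocated if necessary, but no host/device copy is performed and
// the offload mask is left untouched, so the caller must consult the returned mask:
//
//   const PetscScalar *harray, *darray;
//   PetscOffloadMask   mask;
//
//   PetscCall(GetArrays_CUPMBase(v, &harray, &darray, &mask, dctx));
//   if (mask == PETSC_OFFLOAD_GPU || mask == PETSC_OFFLOAD_BOTH) {
//     /* darray holds the up-to-date values */
//   }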

1307: template <device::cupm::DeviceType T, typename D>
1308: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetPreallocationCOO_CUPMBase(Vec v, PetscDeviceContext dctx) noexcept
1309: {
1310:   PetscFunctionBegin;
1311:   if (const auto vcu = VecCUPMCast(v)) {
1312:     cupmStream_t stream;
1313:     // clang-format off
1314:     const auto   cntptrs = util::make_array(
1315:       std::ref(vcu->jmap1_d),
1316:       std::ref(vcu->perm1_d),
1317:       std::ref(vcu->imap2_d),
1318:       std::ref(vcu->jmap2_d),
1319:       std::ref(vcu->perm2_d),
1320:       std::ref(vcu->Cperm_d)
1321:     );
1322:     // clang-format on

1324:     PetscCall(GetHandlesFrom_(dctx, &stream));
1325:     for (auto &&ptr : cntptrs) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1326:     for (auto &&ptr : util::make_array(std::ref(vcu->sendbuf_d), std::ref(vcu->recvbuf_d))) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1327:   }
1328:   PetscFunctionReturn(PETSC_SUCCESS);
1329: }

1331: template <device::cupm::DeviceType T, typename D>
1332: template <std::size_t NCount, std::size_t NScal>
1333: inline PetscErrorCode Vec_CUPMBase<T, D>::SetPreallocationCOO_CUPMBase(Vec v, PetscCount, const PetscInt[], PetscDeviceContext dctx, const std::array<CooPair<PetscCount>, NCount> &extra_cntptrs, const std::array<CooPair<PetscScalar>, NScal> &bufptrs) noexcept
1334: {
1335:   const auto vimpl = VecIMPLCast(v);

1337:   PetscFunctionBegin;
1338:   PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1339:   // need to instantiate the private pointer if it does not already exist
1340:   PetscCall(VecCUPMAllocateCheck_(v));
1341:   {
1342:     const auto vcu = VecCUPMCast(v);
1343:     // clang-format off
1344:     const auto cntptrs = util::concat_array(util::make_array(make_coo_pair(vcu->jmap1_d, vimpl->jmap1, v->map->n + 1), make_coo_pair(vcu->perm1_d, vimpl->perm1, vimpl->tot1)), extra_cntptrs);
1345:     // clang-format on
1346:     cupmStream_t stream;

1348:     PetscCall(GetHandlesFrom_(dctx, &stream));
1349:     // allocate
1350:     for (auto &elem : cntptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1351:     for (auto &elem : bufptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1352:     // copy
1353:     for (const auto &elem : cntptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1354:     for (const auto &elem : bufptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1355:   }
1356:   PetscFunctionReturn(PETSC_SUCCESS);
1357: }
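// The CooPair batches built above pair a device pointer (reallocated here) with its host
// source and length. A derived class extends the base set by passing additional pairs, e.g.
// (a hedged sketch; names other than make_coo_pair and util::make_array are placeholders):
//
//   const auto extra = util::make_array(make_coo_pair(device_count_ptr, host_count_ptr, count_len));
//   const auto bufs  = util::make_array(make_coo_pair(device_scal_ptr, host_scal_ptr, scal_len));
//
//   PetscCall(SetPreallocationCOO_CUPMBase(v, coo_n, coo_i, dctx, extra, bufs));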

1359:   #define PETSC_VEC_CUPM_BASE_CLASS_HEADER(name, Tp, ...) \
1360:     using name = ::Petsc::vec::cupm::impl::Vec_CUPMBase<Tp, __VA_ARGS__>; \
1361:     friend name; \
1362:     /* introspection */ \
1363:     using name::VecCUPMCast; \
1364:     using name::VecIMPLCast; \
1365:     using name::VECIMPLCUPM; \
1366:     using name::VECSEQCUPM; \
1367:     using name::VECMPICUPM; \
1368:     using name::VecView_Debug; \
1369:     /* utility */ \
1370:     using typename name::Vec_CUPM; \
1371:     using name::UseCUPMHostAlloc; \
1372:     using name::GetHandles_; \
1373:     using name::GetHandlesFrom_; \
1374:     using name::VecCUPMAllocateCheck_; \
1375:     using name::VecIMPLAllocateCheck_; \
1376:     using name::HostAllocateCheck_; \
1377:     using name::DeviceAllocateCheck_; \
1378:     using name::CopyToDevice_; \
1379:     using name::CopyToHost_; \
1380:     using name::create; \
1381:     using name::destroy; \
1382:     using name::getarray; \
1383:     using name::restorearray; \
1384:     using name::getarrayandmemtype; \
1385:     using name::restorearrayandmemtype; \
1386:     using name::placearray; \
1387:     using name::replacearray; \
1388:     using name::resetarray; \
1389:     /* base functions */ \
1390:     using name::Create_CUPMBase; \
1391:     using name::Initialize_CUPMBase; \
1392:     using name::Duplicate_CUPMBase; \
1393:     using name::BindToCPU_CUPMBase; \
1394:     using name::Create_CUPM; \
1395:     using name::DeviceArrayRead; \
1396:     using name::DeviceArrayWrite; \
1397:     using name::DeviceArrayReadWrite; \
1398:     using name::HostArrayRead; \
1399:     using name::HostArrayWrite; \
1400:     using name::HostArrayReadWrite; \
1401:     using name::ResetPreallocationCOO_CUPMBase; \
1402:     using name::SetPreallocationCOO_CUPMBase; \
1403:     /* blas interface */ \
1404:     PETSC_CUPMBLAS_INHERIT_INTERFACE_TYPEDEFS_USING(cupmBlasInterface_t, Tp)
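// Illustrative sketch of how a concrete vector class consumes this header macro (see the
// actual VecSeq_CUPM/VecMPI_CUPM declarations for the real versions; the exact base-class
// spelling below is an assumption):
//
//   template <device::cupm::DeviceType T>
//   class VecSeq_CUPM : Vec_CUPMBase<T, VecSeq_CUPM<T>> {
//   public:
//     PETSC_VEC_CUPM_BASE_CLASS_HEADER(base_type, T, VecSeq_CUPM<T>);
//
//     // static member functions installed into v->ops by BindToCPU_CUPMBase() above
//     static PetscErrorCode duplicate(Vec, Vec *) noexcept;
//     static PetscErrorCode bindtocpu(Vec, PetscBool) noexcept;
//     // ...
//   };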

1406: } // namespace impl

1408: } // namespace cupm

1410: } // namespace vec

1412: } // namespace Petsc

1414: #endif // __cplusplus && PetscDefined(HAVE_DEVICE)

1416: #endif // PETSCVECCUPMIMPL_H