Actual source code: sfimpl.h
petsc-master 2020-08-25
#if !defined(PETSCSFIMPL_H)
#define PETSCSFIMPL_H

#include <petscsf.h>
#include <petsc/private/petscimpl.h>
#include <petscviewer.h>

#if defined(PETSC_HAVE_CUDA)
#include <cuda_runtime.h>
#include <petsc/private/cudavecimpl.h>
#endif

#if defined(PETSC_HAVE_HIP)
#include <hip/hip_runtime.h>
#endif
PETSC_EXTERN PetscLogEvent PETSCSF_SetGraph;
PETSC_EXTERN PetscLogEvent PETSCSF_SetUp;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastAndOpBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastAndOpEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_EmbedSF;
PETSC_EXTERN PetscLogEvent PETSCSF_DistSect;
PETSC_EXTERN PetscLogEvent PETSCSF_SectSF;
PETSC_EXTERN PetscLogEvent PETSCSF_RemoteOff;
PETSC_EXTERN PetscLogEvent PETSCSF_Pack;
PETSC_EXTERN PetscLogEvent PETSCSF_Unpack;
typedef enum {PETSCSF_ROOT2LEAF=0, PETSCSF_LEAF2ROOT} PetscSFDirection;
typedef enum {PETSCSF_BCAST=0, PETSCSF_REDUCE, PETSCSF_FETCH} PetscSFOperation;
typedef enum {PETSC_MEMTYPE_HOST=0, PETSC_MEMTYPE_DEVICE} PetscMemType;
struct _PetscSFOps {
  PetscErrorCode (*Reset)(PetscSF);
  PetscErrorCode (*Destroy)(PetscSF);
  PetscErrorCode (*SetUp)(PetscSF);
  PetscErrorCode (*SetFromOptions)(PetscOptionItems*,PetscSF);
  PetscErrorCode (*View)(PetscSF,PetscViewer);
  PetscErrorCode (*Duplicate)(PetscSF,PetscSFDuplicateOption,PetscSF);
  PetscErrorCode (*BcastAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
  PetscErrorCode (*BcastAndOpEnd)  (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
  PetscErrorCode (*ReduceBegin)    (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
  PetscErrorCode (*ReduceEnd)      (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
  PetscErrorCode (*FetchAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,void*,PetscMemType,const void*,void*,MPI_Op);
  PetscErrorCode (*FetchAndOpEnd)  (PetscSF,MPI_Datatype,void*,const void*,void*,MPI_Op);
  PetscErrorCode (*BcastToZero)    (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*); /* For internal use only */
  PetscErrorCode (*GetRootRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**,const PetscInt**);
  PetscErrorCode (*GetLeafRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
  PetscErrorCode (*CreateLocalSF)(PetscSF,PetscSF*);
  PetscErrorCode (*GetGraph)(PetscSF,PetscInt*,PetscInt*,const PetscInt**,const PetscSFNode**);
  PetscErrorCode (*CreateEmbeddedSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);
  PetscErrorCode (*CreateEmbeddedLeafSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);
};
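
A hypothetical sketch of how an implementation might populate this ops table from its type constructor; the "Example" type and its callbacks are illustrative only and are not part of PETSc. Such code would live in an implementation source file that includes this header.

/* Illustrative only: a hypothetical SF implementation wiring itself into the ops table */
static PetscErrorCode PetscSFReset_Example(PetscSF sf)
{
  return(0);
}

static PetscErrorCode PetscSFSetUp_Example(PetscSF sf)
{
  return(0);
}

PETSC_EXTERN PetscErrorCode PetscSFCreate_Example(PetscSF sf)
{
  sf->ops->Reset = PetscSFReset_Example;
  sf->ops->SetUp = PetscSFSetUp_Example;
  sf->data       = NULL; /* would point to a private struct holding communication state */
  return(0);
}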
typedef struct _n_PetscSFPackOpt *PetscSFPackOpt;

struct _p_PetscSF {
  PETSCHEADER(struct _PetscSFOps);
  PetscInt        nroots;          /* Number of root vertices on current process (candidates for incoming edges) */
  PetscInt        nleaves;         /* Number of leaf vertices on current process (this process specifies a root for each leaf) */
  PetscInt        *mine;           /* Location of leaves in leafdata arrays provided to the communication routines */
  PetscInt        *mine_alloc;
  PetscInt        minleaf,maxleaf;
  PetscSFNode     *remote;         /* Remote references to roots for each local leaf */
  PetscSFNode     *remote_alloc;
  PetscInt        nranks;          /* Number of ranks owning roots connected to my leaves */
  PetscInt        ndranks;         /* Number of ranks in the distinguished group holding roots connected to my leaves */
  PetscMPIInt     *ranks;          /* List of ranks referenced by "remote" */
  PetscInt        *roffset;        /* Array of length nranks+1, offset in rmine/rremote for each rank */
  PetscInt        *rmine;          /* Concatenated array holding local indices referencing each remote rank */
  PetscInt        *rmine_d[2];     /* A copy of rmine[local/remote] in device memory if needed */

  /* Some results useful in packing, obtained by analyzing rmine[] */
  PetscInt        leafbuflen[2];   /* Length (in units) of leaf buffers, in layout of [PETSCSF_LOCAL/REMOTE] */
  PetscBool       leafcontig[2];   /* True means indices in rmine[self part] or rmine[remote part] are contiguous, and they start from ... */
  PetscInt        leafstart[2];    /* ... leafstart[0] and leafstart[1] respectively */
  PetscSFPackOpt  leafpackopt[2];  /* Optimization plans to (un)pack leaves connected to remote roots, based on index patterns in rmine[]. NULL for no optimization */
  PetscSFPackOpt  leafpackopt_d[2];/* Copy of leafpackopt[] on device if needed */
  PetscBool       leafdups[2];     /* Do indices in rmine[] for self(0)/remote(1) communication have duplicates? TRUE implies threads working on them in parallel may have a data race */

  PetscInt        nleafreqs;       /* Number of MPI requests for leaves */
  PetscInt        *rremote;        /* Concatenated array holding remote indices referenced for each remote rank */
  PetscBool       degreeknown;     /* The degree is currently known, so it does not have to be recomputed */
  PetscInt        *degree;         /* Degree of each of my root vertices */
  PetscInt        *degreetmp;      /* Temporary local array for computing degree */
  PetscBool       rankorder;       /* Sort ranks for gather and scatter operations */
  MPI_Group       ingroup;         /* Group of processes connected to my roots */
  MPI_Group       outgroup;        /* Group of processes connected to my leaves */
  PetscSF         multi;           /* Internal graph used to implement gather and scatter operations */
  PetscBool       graphset;        /* Flag indicating that the graph has been set, required before calling communication routines */
  PetscBool       setupcalled;     /* Type and communication structures have been set up */
  PetscSFPattern  pattern;         /* Pattern of the graph */
  PetscBool       persistent;      /* Does this SF use MPI persistent requests for communication */
  PetscLayout     map;             /* Layout of leaves over all processes when building a patterned graph */
  PetscBool       use_default_stream;  /* If true, SF assumes root/leafdata is on the default stream upon input and will also leave them there upon output */
  PetscBool       use_gpu_aware_mpi;   /* If true, SF assumes it can pass GPU pointers to MPI */
  PetscBool       use_stream_aware_mpi;/* If true, SF assumes the underlying MPI is CUDA-stream aware and we won't sync streams for send/recv buffers passed to MPI */
#if defined(PETSC_HAVE_CUDA)
  PetscInt        maxResidentThreadsPerGPU;
#endif
  void *data; /* Pointer to implementation */
};
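
The fields above are what the generic layer consults before dispatching to an implementation. A simplified, hypothetical sketch of that pattern (not the actual PETSc routine):

/* Illustrative only: dispatch through the ops table when present, otherwise return the cached graph */
PETSC_STATIC_INLINE PetscErrorCode PetscSFGetGraph_Sketch(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
{
  if (!sf->graphset) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Graph has not been set with PetscSFSetGraph()");
  if (sf->ops->GetGraph) return (*sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);
  if (nroots)  *nroots  = sf->nroots;
  if (nleaves) *nleaves = sf->nleaves;
  if (ilocal)  *ilocal  = sf->mine;
  if (iremote) *iremote = sf->remote;
  return(0);
}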
PETSC_EXTERN PetscBool PetscSFRegisterAllCalled;
PETSC_EXTERN PetscErrorCode PetscSFRegisterAll(void);

PETSC_INTERN PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFBcastToZero_Private(PetscSF,MPI_Datatype,const void*,void*);

PETSC_EXTERN PetscErrorCode MPIPetsc_Type_unwrap(MPI_Datatype,MPI_Datatype*,PetscBool*);
PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare(MPI_Datatype,MPI_Datatype,PetscBool*);
PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare_contig(MPI_Datatype,MPI_Datatype,PetscInt*);
#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
#define MPIU_Iscatter(a,b,c,d,e,f,g,h,req)    MPI_Iscatter(a,b,c,d,e,f,g,h,req)
#define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req) MPI_Iscatterv(a,b,c,d,e,f,g,h,i,req)
#define MPIU_Igather(a,b,c,d,e,f,g,h,req)     MPI_Igather(a,b,c,d,e,f,g,h,req)
#define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req)  MPI_Igatherv(a,b,c,d,e,f,g,h,i,req)
#define MPIU_Iallgather(a,b,c,d,e,f,g,req)    MPI_Iallgather(a,b,c,d,e,f,g,req)
#define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req) MPI_Iallgatherv(a,b,c,d,e,f,g,h,req)
#define MPIU_Ialltoall(a,b,c,d,e,f,g,req)     MPI_Ialltoall(a,b,c,d,e,f,g,req)
#else
/* Ignore req, the MPI_Request argument, and use blocking MPI collectives. One should initialize req
   to MPI_REQUEST_NULL so that MPI_Wait(req,status) can be called regardless of whether the underlying
   call is blocking or not.
 */
#define MPIU_Iscatter(a,b,c,d,e,f,g,h,req)    MPI_Scatter(a,b,c,d,e,f,g,h)
#define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req) MPI_Scatterv(a,b,c,d,e,f,g,h,i)
#define MPIU_Igather(a,b,c,d,e,f,g,h,req)     MPI_Gather(a,b,c,d,e,f,g,h)
#define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req)  MPI_Gatherv(a,b,c,d,e,f,g,h,i)
#define MPIU_Iallgather(a,b,c,d,e,f,g,req)    MPI_Allgather(a,b,c,d,e,f,g)
#define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req) MPI_Allgatherv(a,b,c,d,e,f,g,h)
#define MPIU_Ialltoall(a,b,c,d,e,f,g,req)     MPI_Alltoall(a,b,c,d,e,f,g)
#endif
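
A hedged usage sketch for these wrappers (the function name is hypothetical): initializing req to MPI_REQUEST_NULL, as the comment above requires, makes the MPI_Wait valid whether the nonblocking branch or the blocking fallback was compiled in.

/* Illustrative only: gather one integer from every rank through the MPIU_I* wrappers */
PETSC_STATIC_INLINE PetscErrorCode PetscSFAllgatherOneInt_Sketch(MPI_Comm comm,PetscMPIInt myval,PetscMPIInt *allvals)
{
  PetscErrorCode ierr;
  MPI_Request    req = MPI_REQUEST_NULL; /* makes the MPI_Wait below a no-op when the blocking fallback is used */

  ierr = MPIU_Iallgather(&myval,1,MPI_INT,allvals,1,MPI_INT,comm,&req);CHKERRQ(ierr);
  /* ... other work could be overlapped here when the nonblocking version is available ... */
  ierr = MPI_Wait(&req,MPI_STATUS_IGNORE);CHKERRQ(ierr);
  return(0);
}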
PETSC_STATIC_INLINE PetscErrorCode PetscGetMemType(const void *data,PetscMemType *type)
{
  *type = PETSC_MEMTYPE_HOST;
#if defined(PETSC_HAVE_CUDA)
  if (PetscCUDAInitialized && data) {
    cudaError_t                  cerr;
    struct cudaPointerAttributes attr;
    enum cudaMemoryType          mtype;
    cerr = cudaPointerGetAttributes(&attr,data); /* Do not check the error since before CUDA 11.0, passing a host pointer returns cudaErrorInvalidValue */
    cudaGetLastError(); /* Reset the last error */
#if (CUDART_VERSION < 10000)
    mtype = attr.memoryType;
#else
    mtype = attr.type;
#endif
    if (cerr == cudaSuccess && mtype == cudaMemoryTypeDevice) {
      *type = PETSC_MEMTYPE_DEVICE;
      return(0);
    }
  }
#endif

#if defined(PETSC_HAVE_HIP)
  if (PetscHIPInitialized && data) {
    hipError_t                   cerr;
    struct hipPointerAttribute_t attr;
    enum hipMemoryType           mtype;
    cerr = hipPointerGetAttributes(&attr,data);
    hipGetLastError(); /* Reset the last error */
    mtype = attr.memoryType;
    if (cerr == hipSuccess && mtype == hipMemoryTypeDevice) {
      *type = PETSC_MEMTYPE_DEVICE;
      return(0);
    }
  }
#endif
  return(0);
}
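
A hypothetical sketch of how a caller might act on the result of PetscGetMemType(); the helper name and the staging policy shown are illustrative only.

/* Illustrative only: decide whether a user buffer must be staged through host memory */
PETSC_STATIC_INLINE PetscErrorCode PetscSFNeedHostStaging_Sketch(PetscSF sf,const void *data,PetscBool *stage)
{
  PetscErrorCode ierr;
  PetscMemType   mtype;

  ierr   = PetscGetMemType(data,&mtype);CHKERRQ(ierr);
  /* Device-resident data must be staged through a host buffer unless MPI is GPU-aware */
  *stage = (PetscBool)(mtype == PETSC_MEMTYPE_DEVICE && !sf->use_gpu_aware_mpi);
  return(0);
}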
PETSC_STATIC_INLINE PetscErrorCode PetscMallocWithMemType(PetscMemType mtype,size_t size,void** ptr)
{
  if (mtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMalloc(ptr,size);CHKERRCUDA(err);}
#elif defined(PETSC_HAVE_HIP)
  else if (mtype == PETSC_MEMTYPE_DEVICE) {hipError_t err = hipMalloc(ptr,size);CHKERRQ(err==hipSuccess?0:PETSC_ERR_LIB);}
#endif
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  return(0);
}
PETSC_STATIC_INLINE PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype,void* ptr)
{
  if (mtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
#if defined(PETSC_HAVE_CUDA)
  else if (mtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaFree(ptr);CHKERRCUDA(err);}
#elif defined(PETSC_HAVE_HIP)
  else if (mtype == PETSC_MEMTYPE_DEVICE) {hipError_t err = hipFree(ptr);CHKERRQ(err==hipSuccess?0:PETSC_ERR_LIB);}
#endif
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  return(0);
}
/* Free memory and set ptr to NULL on success */
#define PetscFreeWithMemType(t,p) ((p) && (PetscFreeWithMemType_Private((t),(p)) || ((p)=NULL,0)))
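
A hypothetical usage sketch pairing the two helpers above; the function name and buffer are illustrative only.

/* Illustrative only: allocate a buffer in the requested memory space, use it, then free it */
PETSC_STATIC_INLINE PetscErrorCode PetscSFBufferDemo_Sketch(PetscMemType mtype,size_t nbytes)
{
  PetscErrorCode ierr;
  void           *buf = NULL;

  ierr = PetscMallocWithMemType(mtype,nbytes,&buf);CHKERRQ(ierr);
  /* ... pack into or unpack from buf here ... */
  ierr = PetscFreeWithMemType(mtype,buf);CHKERRQ(ierr); /* frees buf and sets it to NULL on success */
  return(0);
}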
#endif