Actual source code: sfpack.c
petsc-master 2020-08-25
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
2: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4: #if defined(PETSC_HAVE_CUDA)
5: #include <cuda_runtime.h>
6: #endif
7: /*
8: * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
9: * therefore we pack data types manually. This file defines packing routines for the standard data types.
10: */
12: #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d
14: /* Operations working like s += t */
15: #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */
16: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */
17: #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
18: #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0)
19: /* Ref MPI MAXLOC */
20: #define OP_XLOC(op,s,t) \
21: do { \
22: if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23: else if (!((s).u op (t).u)) s = t; \
24: } while (0)
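/* Worked example (added for exposition; not part of the original file): with op = '>', OP_XLOC gives
   MPI_MAXLOC semantics on a {value u, index i} pair, as used by the pair types defined later in this file:
     s = {5,3}, t = {7,0}  =>  s becomes {7,0}   (t.u wins under '>', so s takes t entirely)
     s = {5,3}, t = {5,1}  =>  s becomes {5,1}   (tie on u, keep the smaller index)
     s = {5,3}, t = {2,9}  =>  s unchanged       (s.u already wins)
   MPI_MINLOC is obtained the same way with op = '<'. */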
26: /* DEF_PackFunc - macro defining a Pack routine
28: Arguments of the macro:
29: +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30: .BS Block size for vectorization. It is a factor of bs.
31: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
33: Arguments of the Pack routine:
34: +count Number of indices in idx[].
35: .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36: .opt Per-pack optimization plan. NULL means no such plan.
37: .idx Indices of entries to be packed.
38: .link Provides a context for the current call, such as link->bs, the number of basic types in an entry. E.g., if unit is MPI_2INT, then bs=2 and the basic type is int.
39: .unpacked Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
40: -packed Address of the packed data.
41: */
42: #define DEF_PackFunc(Type,BS,EQ) \
43: static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44: { \
46: const Type *u = (const Type*)unpacked,*u2; \
47: Type *p = (Type*)packed,*p2; \
48: PetscInt i,j,k,X,Y,r,bs=link->bs; \
49: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
50: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
52: if (!idx) {PetscArraycpy(p,u+start*MBS,MBS*count);}/* idx[] are contiguous */ \
53: else if (opt) { /* has optimizations available */ \
54: p2 = p; \
55: for (r=0; r<opt->n; r++) { \
56: u2 = u + opt->start[r]*MBS; \
57: X = opt->X[r]; \
58: Y = opt->Y[r]; \
59: for (k=0; k<opt->dz[r]; k++) \
60: for (j=0; j<opt->dy[r]; j++) { \
61: PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS); \
62: p2 += opt->dx[r]*MBS; \
63: } \
64: } \
65: } else { \
66: for (i=0; i<count; i++) \
67: for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
68: for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
69: p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
70: } \
71: return(0); \
72: }
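/* Example (added for exposition; not part of the original file): DEF_PackFunc(PetscReal,2,1) generates
   Pack_PetscReal_2_1, used when unit is two contiguous MPIU_REALs (bs = BS = 2, hence M = 1, MBS = 2).
   Its indexed (final else) branch is then equivalent to the hand-written gather loop

       for (i=0; i<count; i++)
         for (k=0; k<2; k++) p[i*2+k] = u[idx[i]*2+k];

   i.e. entry idx[i] of the sparse root/leaf array lands in slot i of the contiguous buffer, and the
   fixed trip count lets the compiler vectorize the inner loop. */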
74: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
75: and inserts into a sparse array.
77: Arguments:
78: .Type Type of the data
79: .BS Block size for vectorization
80: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
82: Notes:
83: This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
84: */
85: #define DEF_UnpackFunc(Type,BS,EQ) \
86: static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87: { \
89: Type *u = (Type*)unpacked,*u2; \
90: const Type *p = (const Type*)packed; \
91: PetscInt i,j,k,X,Y,r,bs=link->bs; \
92: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
93: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
95: if (!idx) { \
96: u += start*MBS; \
97: if (u != p) {PetscArraycpy(u,p,count*MBS);} \
98: } else if (opt) { /* has optimizations available */ \
99: for (r=0; r<opt->n; r++) { \
100: u2 = u + opt->start[r]*MBS; \
101: X = opt->X[r]; \
102: Y = opt->Y[r]; \
103: for (k=0; k<opt->dz[r]; k++) \
104: for (j=0; j<opt->dy[r]; j++) { \
105: PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS); \
106: p += opt->dx[r]*MBS; \
107: } \
108: } \
109: } else { \
110: for (i=0; i<count; i++) \
111: for (j=0; j<M; j++) \
112: for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
113: } \
114: return(0); \
115: }
117: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
119: Arguments:
120: +Opname Name of the Op, such as Add, Mult, LAND, etc.
121: .Type Type of the data
122: .BS Block size for vectorization
123: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
124: .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125: .OpApply Macro defining the application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
126: */
127: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128: static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129: { \
130: Type *u = (Type*)unpacked,*u2; \
131: const Type *p = (const Type*)packed; \
132: PetscInt i,j,k,X,Y,r,bs=link->bs; \
133: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
134: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
136: if (!idx) { \
137: u += start*MBS; \
138: for (i=0; i<count; i++) \
139: for (j=0; j<M; j++) \
140: for (k=0; k<BS; k++) \
141: OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
142: } else if (opt) { /* idx[] has patterns */ \
143: for (r=0; r<opt->n; r++) { \
144: u2 = u + opt->start[r]*MBS; \
145: X = opt->X[r]; \
146: Y = opt->Y[r]; \
147: for (k=0; k<opt->dz[r]; k++) \
148: for (j=0; j<opt->dy[r]; j++) { \
149: for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
150: p += opt->dx[r]*MBS; \
151: } \
152: } \
153: } else { \
154: for (i=0; i<count; i++) \
155: for (j=0; j<M; j++) \
156: for (k=0; k<BS; k++) \
157: OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
158: } \
159: return(0); \
160: }
162: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163: static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164: { \
165: Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
166: PetscInt i,j,k,r,l,bs=link->bs; \
167: const PetscInt M = (EQ) ? 1 : bs/BS; \
168: const PetscInt MBS = M*BS; \
170: for (i=0; i<count; i++) { \
171: r = (!idx ? start+i : idx[i])*MBS; \
172: l = i*MBS; \
173: for (j=0; j<M; j++) \
174: for (k=0; k<BS; k++) { \
175: tmp = u[r+j*BS+k]; \
176: OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
177: p[l+j*BS+k] = tmp; \
178: } \
179: } \
180: return(0); \
181: }
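/* Example (added for exposition; not part of the original file): DEF_FetchAndOp(Type,BS,EQ,Add,+,OP_BINARY)
   generates FetchAndAdd_<Type>_<BS>_<EQ>, which for each packed entry performs a "fetch old value, then add":

       tmp      = u[dest];    // old value in the unpacked (root) array
       u[dest] += p[src];     // the array accumulates the incoming value
       p[src]   = tmp;        // the buffer hands the old value back

   This is the building block PetscSFFetchAndOpBegin/End use to return pre-update root values to the leaves. */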
183: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184: static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185: { \
187: const Type *u = (const Type*)src; \
188: Type *v = (Type*)dst; \
189: PetscInt i,j,k,s,t,X,Y,bs = link->bs; \
190: const PetscInt M = (EQ) ? 1 : bs/BS; \
191: const PetscInt MBS = M*BS; \
193: if (!srcIdx) { /* src is contiguous */ \
194: u += srcStart*MBS; \
195: CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u); \
196: } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
197: u += srcOpt->start[0]*MBS; \
198: v += dstStart*MBS; \
199: X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
200: for (k=0; k<srcOpt->dz[0]; k++) \
201: for (j=0; j<srcOpt->dy[0]; j++) { \
202: for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
203: v += srcOpt->dx[0]*MBS; \
204: } \
205: } else { /* all other cases */ \
206: for (i=0; i<count; i++) { \
207: s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
208: t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
209: for (j=0; j<M; j++) \
210: for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
211: } \
212: } \
213: return(0); \
214: }
216: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217: static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218: { \
219: Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
220: const Type *ldata = (const Type*)leafdata; \
221: PetscInt i,j,k,r,l,bs = link->bs; \
222: const PetscInt M = (EQ) ? 1 : bs/BS; \
223: const PetscInt MBS = M*BS; \
225: for (i=0; i<count; i++) { \
226: r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
227: l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
228: for (j=0; j<M; j++) \
229: for (k=0; k<BS; k++) { \
230: lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
231: OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
232: } \
233: } \
234: return(0); \
235: }
237: /* Pack, Unpack/Fetch ops */
238: #define DEF_Pack(Type,BS,EQ) \
239: DEF_PackFunc(Type,BS,EQ) \
240: DEF_UnpackFunc(Type,BS,EQ) \
241: DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
242: static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
243: link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \
244: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
245: link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
246: }
248: /* Add, Mult ops */
249: #define DEF_Add(Type,BS,EQ) \
250: DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
251: DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
252: DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
253: DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
254: DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
255: DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
256: static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
257: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \
258: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \
259: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \
260: link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \
261: link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \
262: link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
263: }
265: /* Max, Min ops */
266: #define DEF_Cmp(Type,BS,EQ) \
267: DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
268: DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
269: DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
270: DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
271: static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
272: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
273: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
274: link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \
275: link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \
276: }
278: /* Logical ops.
279: The operator passed for OP_LXOR should be empty, but we pass || (it is not used); it is only there to avoid
280: the compilation warning "empty macro arguments are undefined in ISO C90"
281: */
282: #define DEF_Log(Type,BS,EQ) \
283: DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
284: DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
285: DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
286: DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
287: DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
288: DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
289: static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
290: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
291: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \
292: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
293: link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
294: link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
295: link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
296: }
298: /* Bitwise ops */
299: #define DEF_Bit(Type,BS,EQ) \
300: DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
301: DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
302: DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
303: DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
304: DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
305: DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
306: static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
307: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
308: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \
309: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
310: link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
311: link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
312: link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
313: }
315: /* Maxloc, Minloc ops */
316: #define DEF_Xloc(Type,BS,EQ) \
317: DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
318: DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
319: DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
320: DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
321: static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
322: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
323: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
324: link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
325: link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
326: }
328: #define DEF_IntegerType(Type,BS,EQ) \
329: DEF_Pack(Type,BS,EQ) \
330: DEF_Add(Type,BS,EQ) \
331: DEF_Cmp(Type,BS,EQ) \
332: DEF_Log(Type,BS,EQ) \
333: DEF_Bit(Type,BS,EQ) \
334: static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
335: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
336: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
337: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
338: CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
339: CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
340: }
342: #define DEF_RealType(Type,BS,EQ) \
343: DEF_Pack(Type,BS,EQ) \
344: DEF_Add(Type,BS,EQ) \
345: DEF_Cmp(Type,BS,EQ) \
346: static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
347: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
348: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
349: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
350: }
352: #if defined(PETSC_HAVE_COMPLEX)
353: #define DEF_ComplexType(Type,BS,EQ) \
354: DEF_Pack(Type,BS,EQ) \
355: DEF_Add(Type,BS,EQ) \
356: static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
357: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
358: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
359: }
360: #endif
362: #define DEF_DumbType(Type,BS,EQ) \
363: DEF_Pack(Type,BS,EQ) \
364: static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
365: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
366: }
368: /* Maxloc, Minloc */
369: #define DEF_PairType(Type,BS,EQ) \
370: DEF_Pack(Type,BS,EQ) \
371: DEF_Xloc(Type,BS,EQ) \
372: static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
373: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
374: CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
375: }
377: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */
378: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
386: #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
387: DEF_IntegerType(int,1,1)
388: DEF_IntegerType(int,2,1)
389: DEF_IntegerType(int,4,1)
390: DEF_IntegerType(int,8,1)
391: DEF_IntegerType(int,1,0)
392: DEF_IntegerType(int,2,0)
393: DEF_IntegerType(int,4,0)
394: DEF_IntegerType(int,8,0)
395: #endif
397: /* The typedefs are used to get a typename without spaces that CPPJoin can handle */
398: typedef signed char SignedChar;
399: DEF_IntegerType(SignedChar,1,1)
400: DEF_IntegerType(SignedChar,2,1)
401: DEF_IntegerType(SignedChar,4,1)
402: DEF_IntegerType(SignedChar,8,1)
403: DEF_IntegerType(SignedChar,1,0)
404: DEF_IntegerType(SignedChar,2,0)
405: DEF_IntegerType(SignedChar,4,0)
406: DEF_IntegerType(SignedChar,8,0)
408: typedef unsigned char UnsignedChar;
409: DEF_IntegerType(UnsignedChar,1,1)
410: DEF_IntegerType(UnsignedChar,2,1)
411: DEF_IntegerType(UnsignedChar,4,1)
412: DEF_IntegerType(UnsignedChar,8,1)
413: DEF_IntegerType(UnsignedChar,1,0)
414: DEF_IntegerType(UnsignedChar,2,0)
415: DEF_IntegerType(UnsignedChar,4,0)
416: DEF_IntegerType(UnsignedChar,8,0)
418: DEF_RealType(PetscReal,1,1)
419: DEF_RealType(PetscReal,2,1)
420: DEF_RealType(PetscReal,4,1)
421: DEF_RealType(PetscReal,8,1)
422: DEF_RealType(PetscReal,1,0)
423: DEF_RealType(PetscReal,2,0)
424: DEF_RealType(PetscReal,4,0)
425: DEF_RealType(PetscReal,8,0)
427: #if defined(PETSC_HAVE_COMPLEX)
428: DEF_ComplexType(PetscComplex,1,1)
429: DEF_ComplexType(PetscComplex,2,1)
430: DEF_ComplexType(PetscComplex,4,1)
431: DEF_ComplexType(PetscComplex,8,1)
432: DEF_ComplexType(PetscComplex,1,0)
433: DEF_ComplexType(PetscComplex,2,0)
434: DEF_ComplexType(PetscComplex,4,0)
435: DEF_ComplexType(PetscComplex,8,0)
436: #endif
438: #define PairType(Type1,Type2) Type1##_##Type2
439: typedef struct {int u; int i;} PairType(int,int);
440: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441: DEF_PairType(PairType(int,int),1,1)
442: DEF_PairType(PairType(PetscInt,PetscInt),1,1)
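/* Note (added for exposition; not part of the original file): these pair types back the MPI_MAXLOC and
   MPI_MINLOC reductions. For unit = MPI_2INT each entry is a {value u, index i} pair of ints, and the
   generated UnpackAndMax_int_int_1_1 (installed as link->h_UnpackAndMaxloc) applies OP_XLOC with '>'
   entry by entry, matching what MPI_Reduce_local would do with MPI_MAXLOC on the same pairs. */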
444: /* If we don't know the basic type, we treat it as a stream of chars or ints */
445: DEF_DumbType(char,1,1)
446: DEF_DumbType(char,2,1)
447: DEF_DumbType(char,4,1)
448: DEF_DumbType(char,1,0)
449: DEF_DumbType(char,2,0)
450: DEF_DumbType(char,4,0)
452: typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
453: DEF_DumbType(DumbInt,1,1)
454: DEF_DumbType(DumbInt,2,1)
455: DEF_DumbType(DumbInt,4,1)
456: DEF_DumbType(DumbInt,8,1)
457: DEF_DumbType(DumbInt,1,0)
458: DEF_DumbType(DumbInt,2,0)
459: DEF_DumbType(DumbInt,4,0)
460: DEF_DumbType(DumbInt,8,0)
462: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
463: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
464: {
465: int ierr;
466: ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
467: ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
468: return MPI_SUCCESS;
469: }
470: #endif
472: /*
473: This routine creates a communication link for the given operation. It first looks in its link cache. If
474: there is a free & suitable one, it uses it. Otherwise it creates a new one.
476: A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
477: root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
478: can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
479: those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.
481: The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but the data is on the GPU.
483: In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.
485: The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
486: need to pack/unpack data.
487: */
488: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
489: {
490: PetscErrorCode ierr;
491: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
492: PetscInt i,j,k,nrootreqs,nleafreqs,nreqs;
493: PetscSFLink *p,link;
494: PetscSFDirection direction;
495: MPI_Request *reqs = NULL;
496: PetscBool match,rootdirect[2],leafdirect[2];
497: PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
498: PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
501: PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);
503: /* Can we directly use root/leafdirect with the given sf, sfop and op? */
504: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
505: if (sfop == PETSCSF_BCAST) {
506: rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
507: leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
508: } else if (sfop == PETSCSF_REDUCE) {
509: leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
510: rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
511: } else { /* PETSCSF_FETCH */
512: rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
513: leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
514: }
515: }
517: if (sf->use_gpu_aware_mpi) {
518: rootmtype_mpi = rootmtype;
519: leafmtype_mpi = leafmtype;
520: } else {
521: rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
522: }
523: /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
524: rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
525: leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
527: direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
528: nrootreqs = bas->nrootreqs;
529: nleafreqs = sf->nleafreqs;
531: /* Look for free links in cache */
532: for (p=&bas->avail; (link=*p); p=&link->next) {
533: MPIPetsc_Type_compare(unit,link->unit,&match);
534: if (match) {
535: /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
536: If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests().
537: */
538: if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
539: reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
540: for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
541: link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
542: }
543: if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
544: reqs = link->leafreqs[direction][leafmtype][1];
545: for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
546: link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
547: }
548: *p = link->next; /* Remove from available list */
549: goto found;
550: }
551: }
553: PetscNew(&link);
554: PetscSFLinkSetUp_Host(sf,link,unit);
555: PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag); /* One tag per link */
557: nreqs = (nrootreqs+nleafreqs)*8;
558: PetscMalloc1(nreqs,&link->reqs);
559: for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
561: for (i=0; i<2; i++) { /* Two communication directions */
562: for (j=0; j<2; j++) { /* Two memory types */
563: for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
564: link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
565: link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
566: }
567: }
568: }
570: found:
571: if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {PetscSFLinkSetUp_Device(sf,link,unit);}
573: /* Allocate buffers for root/leafdata */
574: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
575: /* For local communication, buffers are only needed when roots and leaves have different mtypes */
576: if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
577: if (bas->rootbuflen[i]) {
578: if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
579: link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
580: } else { /* Have to have a separate rootbuf */
581: if (!link->rootbuf_alloc[i][rootmtype]) {
582: PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);
583: }
584: link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
585: }
586: }
588: if (sf->leafbuflen[i]) {
589: if (leafdirect[i]) {
590: link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
591: } else {
592: if (!link->leafbuf_alloc[i][leafmtype]) {
593: PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);
594: }
595: link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
596: }
597: }
598: }
600: /* Allocate buffers on host for buffering device data in case gpu-aware MPI is not used */
601: if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
602: if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
603: PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
604: }
605: link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
606: }
607: if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
608: if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
609: PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
610: }
611: link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
612: }
614: /* Set the `current` state of the link. These fields may change between different SF invocations with the same link */
615: if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
616: if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
617: if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
618: }
620: link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
621: link->leafdata = leafdata;
622: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
623: link->rootdirect[i] = rootdirect[i];
624: link->leafdirect[i] = leafdirect[i];
625: }
626: link->rootdirect_mpi = rootdirect_mpi;
627: link->leafdirect_mpi = leafdirect_mpi;
628: link->rootmtype = rootmtype;
629: link->leafmtype = leafmtype;
630: link->rootmtype_mpi = rootmtype_mpi;
631: link->leafmtype_mpi = leafmtype_mpi;
633: link->next = bas->inuse;
634: bas->inuse = link;
635: *mylink = link;
636: return(0);
637: }
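/* Example of the lazy allocation above (added for exposition; not part of the original file): for a
   PETSCSF_BCAST with op = MPIU_REPLACE on host data where both the remote roots and the remote leaves
   are contiguous, rootdirect and leafdirect are true, so link->rootbuf/leafbuf simply alias into
   rootdata/leafdata and nothing is malloc'ed; separate buffers (and the host mirror buffers for
   device data without gpu-aware MPI) are only allocated for the combinations that need them. */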
639: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
640: If the sf uses persistent requests and the requests have not been initialized, then initialize them.
641: */
642: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
643: {
644: PetscErrorCode ierr;
645: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
646: PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
647: const PetscInt *rootoffset,*leafoffset;
648: PetscMPIInt n;
649: MPI_Aint disp;
650: MPI_Comm comm = PetscObjectComm((PetscObject)sf);
651: MPI_Datatype unit = link->unit;
652: const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
653: const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
656: /* Init persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI requests */
657: if (sf->persistent) {
658: if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
659: PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
660: if (direction == PETSCSF_LEAF2ROOT) {
661: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
662: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
663: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
664: MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
665: }
666: } else { /* PETSCSF_ROOT2LEAF */
667: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
668: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
669: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
670: MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
671: }
672: }
673: link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
674: }
676: if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
677: PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
678: if (direction == PETSCSF_LEAF2ROOT) {
679: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
680: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
681: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
682: MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
683: }
684: } else { /* PETSCSF_ROOT2LEAF */
685: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
686: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
687: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
688: MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
689: }
690: }
691: link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
692: }
693: }
694: if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
695: if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
696: if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
697: if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
698: return(0);
699: }
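/* Typical use of the routine above (a sketch added for exposition; not part of the original file):
   a communication-begin routine in SFBasic does roughly

       PetscSFLinkCreate(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,PETSCSF_BCAST,&link);
       PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
       MPI_Startall(sf->nleafreqs,leafreqs);    // start the persistent receives into leafbuf
       ... pack rootdata into rootbuf ...
       MPI_Startall(bas->nrootreqs,rootreqs);   // start the persistent sends from rootbuf

   with the matching MPI_Waitall and the unpack of leafbuf happening in the corresponding -End routine. */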
702: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
703: {
704: PetscErrorCode ierr;
705: PetscSFLink link,*p;
706: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
709: /* Look for types in cache */
710: for (p=&bas->inuse; (link=*p); p=&link->next) {
711: PetscBool match;
712: MPIPetsc_Type_compare(unit,link->unit,&match);
713: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
714: switch (cmode) {
715: case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
716: case PETSC_USE_POINTER: break;
717: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
718: }
719: *mylink = link;
720: return(0);
721: }
722: }
723: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
724: return(0);
725: }
727: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
728: {
729: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
732: (*link)->rootdata = NULL;
733: (*link)->leafdata = NULL;
734: (*link)->next = bas->avail;
735: bas->avail = *link;
736: *link = NULL;
737: return(0);
738: }
740: /* Destroy all links chained in 'avail' */
741: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
742: {
743: PetscErrorCode ierr;
744: PetscSFLink link = *avail,next;
745: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
746: PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
749: for (; link; link=next) {
750: next = link->next;
751: if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
752: for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
753: if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
754: }
755: PetscFree(link->reqs);
756: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
757: #if defined(PETSC_HAVE_CUDA)
758: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
759: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
760: if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
761: #endif
762: PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
763: PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
764: }
765: PetscFree(link);
766: }
767: *avail = NULL;
768: return(0);
769: }
771: /* Error out on unsupported overlapped communications */
772: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
773: {
774: PetscErrorCode ierr;
775: PetscSFLink link,*p;
776: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
777: PetscBool match;
780: if (!PetscDefined(USE_DEBUG)) return(0);
781: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
782: the potential overlapping since this process does not participate in communication. Overlapping is harmless.
783: */
784: if (rootdata || leafdata) {
785: for (p=&bas->inuse; (link=*p); p=&link->next) {
786: MPIPetsc_Type_compare(unit,link->unit,&match);
787: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
788: }
789: }
790: return(0);
791: }
793: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
794: {
796: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
797: PetscBool is2Int,is2PetscInt;
798: PetscMPIInt ni,na,nd,combiner;
799: #if defined(PETSC_HAVE_COMPLEX)
800: PetscInt nPetscComplex=0;
801: #endif
804: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
805: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
806: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
807: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
808: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
809: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
810: #if defined(PETSC_HAVE_COMPLEX)
811: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
812: #endif
813: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
814: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
815: /* TODO: shall we also handle Fortran MPI_2REAL? */
816: MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
817: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
818: link->bs = 1; /* default */
820: if (is2Int) {
821: PackInit_PairType_int_int_1_1(link);
822: link->bs = 1;
823: link->unitbytes = 2*sizeof(int);
824: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
825: link->basicunit = MPI_2INT;
826: link->unit = MPI_2INT;
827: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
828: PackInit_PairType_PetscInt_PetscInt_1_1(link);
829: link->bs = 1;
830: link->unitbytes = 2*sizeof(PetscInt);
831: link->basicunit = MPIU_2INT;
832: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
833: link->unit = MPIU_2INT;
834: } else if (nPetscReal) {
835: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
836: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
837: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
838: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
839: link->bs = nPetscReal;
840: link->unitbytes = nPetscReal*sizeof(PetscReal);
841: link->basicunit = MPIU_REAL;
842: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
843: } else if (nPetscInt) {
844: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
845: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
846: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
847: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
848: link->bs = nPetscInt;
849: link->unitbytes = nPetscInt*sizeof(PetscInt);
850: link->basicunit = MPIU_INT;
851: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
852: #if defined(PETSC_USE_64BIT_INDICES)
853: } else if (nInt) {
854: if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
855: else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
856: else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
857: else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
858: link->bs = nInt;
859: link->unitbytes = nInt*sizeof(int);
860: link->basicunit = MPI_INT;
861: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
862: #endif
863: } else if (nSignedChar) {
864: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
865: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
866: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
867: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
868: link->bs = nSignedChar;
869: link->unitbytes = nSignedChar*sizeof(SignedChar);
870: link->basicunit = MPI_SIGNED_CHAR;
871: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
872: } else if (nUnsignedChar) {
873: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
874: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
875: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
876: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
877: link->bs = nUnsignedChar;
878: link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
879: link->basicunit = MPI_UNSIGNED_CHAR;
880: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
881: #if defined(PETSC_HAVE_COMPLEX)
882: } else if (nPetscComplex) {
883: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
884: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
885: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
886: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
887: link->bs = nPetscComplex;
888: link->unitbytes = nPetscComplex*sizeof(PetscComplex);
889: link->basicunit = MPIU_COMPLEX;
890: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
891: #endif
892: } else {
893: MPI_Aint lb,nbyte;
894: MPI_Type_get_extent(unit,&lb,&nbyte);
895: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
896: if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
897: if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
898: else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
899: else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
900: link->bs = nbyte;
901: link->unitbytes = nbyte;
902: link->basicunit = MPI_BYTE;
903: } else {
904: nInt = nbyte / sizeof(int);
905: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
906: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
907: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
908: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
909: link->bs = nInt;
910: link->unitbytes = nbyte;
911: link->basicunit = MPI_INT;
912: }
913: if (link->isbuiltin) link->unit = unit;
914: }
916: if (!link->isbuiltin) {MPI_Type_dup(unit,&link->unit);}
917: return(0);
918: }
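/* Worked example of the dispatch above (added for exposition; not part of the original file): if unit
   is a contiguous datatype of 6 MPIU_REALs, then nPetscReal = 6 and the chain selects
   PackInit_RealType_PetscReal_2_0 (6 is not 8 or 4, nor divisible by them, but is divisible by 2),
   i.e. BS = 2 with EQ = 0; link->bs becomes 6, link->unitbytes = 6*sizeof(PetscReal) and
   link->basicunit = MPIU_REAL. The generated routines then run the M = bs/BS = 3 middle loop over
   vectorizable blocks of 2 reals each. */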
920: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
921: {
923: *UnpackAndOp = NULL;
924: if (mtype == PETSC_MEMTYPE_HOST) {
925: if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
926: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
927: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
928: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
929: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
930: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
931: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
932: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
933: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
934: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
935: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
936: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
937: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
938: }
939: #if defined(PETSC_HAVE_CUDA)
940: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
941: if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
942: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
943: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
944: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
945: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
946: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
947: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
948: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
949: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
950: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
951: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
952: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
953: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
954: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
955: if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
956: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
957: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
958: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
959: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
960: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
961: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
962: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
963: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
964: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
965: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
966: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
967: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
968: }
969: #endif
970: return(0);
971: }
973: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
974: {
976: *ScatterAndOp = NULL;
977: if (mtype == PETSC_MEMTYPE_HOST) {
978: if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
979: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
980: else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
981: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
982: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
983: else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
984: else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
985: else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
986: else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
987: else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
988: else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
989: else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
990: else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
991: }
992: #if defined(PETSC_HAVE_CUDA)
993: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
994: if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
995: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
996: else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
997: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
998: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
999: else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
1000: else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
1001: else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
1002: else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
1003: else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
1004: else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
1005: else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
1006: else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
1007: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1008: if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
1009: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1010: else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
1011: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1012: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1013: else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
1014: else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
1015: else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
1016: else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
1017: else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
1018: else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
1019: else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
1020: else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
1021: }
1022: #endif
1023: return(0);
1024: }
1026: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1027: {
1029: *FetchAndOp = NULL;
1030: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1031: if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1032: #if defined(PETSC_HAVE_CUDA)
1033: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1034: else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd;
1035: #endif
1036: return(0);
1037: }
1039: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1040: {
1042: *FetchAndOpLocal = NULL;
1043: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1044: if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1045: #if defined(PETSC_HAVE_CUDA)
1046: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1047: else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
1048: #endif
1049: return(0);
1050: }
1052: /*=============================================================================
1053: A set of helper routines for Pack/Unpack/Scatter on GPUs
1054: ============================================================================*/
1055: #if defined(PETSC_HAVE_CUDA)
1056: /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1057: make sure the data is ready for packing.
1058: */
1059: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1060: {
1062: if (sf->use_default_stream) return(0);
1063: if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1064: cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1065: }
1066: return(0);
1067: }
1069: /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
1070: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1071: {
1072: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1075: /* Do nothing if we use stream-aware MPI or there is nothing for the remote */
1076: if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) return(0);
1077: /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
1078: if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1079: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1080: }
1081: return(0);
1082: }
1083: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1084: {
1086: /* See comments above */
1087: if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) return(0);
1088: if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1089: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1090: }
1091: return(0);
1092: }
1094: /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1095: does not know which stream the callers will use.
1096: */
1097: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1098: {
1099: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1100: PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1103: /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
1104: if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) return(0);
1106: /* If rootmtype is HOST or DEVICE:
1107: If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it;
1108: If we have data from remote && no rootdirect (i.e., we called an unpack kernel), then we must also sync it (if rootdirect,
1109: i.e., there is no unpack kernel after MPI, MPI guarantees rootbuf is ready to use so that we do not need the sync).
1111: Note a tricky case is when leafmtype=DEVICE and rootmtype=HOST on a uni-processor: we must sync the stream, otherwise the
1112: CPU thread might use the yet-to-be-updated rootdata pending in the stream.
1113: */
1114: if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1115: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1116: }
1117: return(0);
1118: }
1119: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1120: {
1121: PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1124: /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/
1125: if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) return(0);
1126: if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1127: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1128: }
1129: return(0);
1130: }
1132: /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1133: to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1134: */
1135: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1136: {
1138: cudaError_t cerr;
1139: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1142: if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1143: void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1144: void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1145: size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1146: if (device2host) {
1147: cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1148: PetscLogGpuToCpu(count);
1149: } else {
1150: cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1151: PetscLogCpuToGpu(count);
1152: }
1153: }
1154: return(0);
1155: }
1157: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1158: {
1160: cudaError_t cerr;
1163: if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1164: void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1165: void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1166: size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1167: if (device2host) {
1168: cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1169: PetscLogGpuToCpu(count);
1170: } else {
1171: cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1172: PetscLogCpuToGpu(count);
1173: }
1174: }
1175: return(0);
1176: }
1177: #else
1179: #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0
1180: #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0
1181: #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0
1182: #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0
1183: #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0
1184: #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1185: #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1187: #endif
1189: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1190: {
1192: PetscLogDouble flops;
1193: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1197: if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1198: flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1199: #if defined(PETSC_HAVE_CUDA)
1200: if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1201: #endif
1202: {PetscLogFlops(flops);}
1203: }
1204: return(0);
1205: }
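/* Example (illustrative): with a unit of 3 PetscScalars (link->bs=3), op=MPIU_SUM and bas->rootbuflen[scope]=1000,
   we log 3000 flops, attributed to the GPU or the CPU depending on where the unpack kernel ran. */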
1207: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1208: {
1209: PetscLogDouble flops;
1213: if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1214: flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
1215: #if defined(PETSC_HAVE_CUDA)
1216: if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1217: #endif
1218: {PetscLogFlops(flops);}
1219: }
1220: return(0);
1221: }
1223: /* When SF cannot find a proper UnpackAndOp() from the link, it falls back to MPI_Reduce_local.
1224: Input Arguments:
1225: +sf - The StarForest
1226: .link - The link
1227: .count - Number of entries to unpack
1228: .start - The first index, significant only when indices=NULL
1229: .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1230: .buf - A contiguous buffer to unpack from
1231: -op - Operation after unpack
1233: Output Arguments:
1234: .data - The data to unpack to
1235: */
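/* Example (an illustrative assumption): if the caller passes a user-defined MPI_Op (created with MPI_Op_create)
   for which SF has no specialized kernel, each of the <count> entries is reduced into <data> one unit at a time:

     MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);
*/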
1236: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1237: {
1239: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1240: {
1242: PetscInt i;
1243: PetscMPIInt n;
1244: if (indices) {
1245: /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1246: basic units of a root/leaf element-wise. Otherwise, it is meant to operate on a whole root/leaf.
1247: */
1248: for (i=0; i<count; i++) {MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);}
1249: } else {
1250: PetscMPIIntCast(count,&n);
1251: MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);
1252: }
1253: }
1254: #else
1255: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1256: #endif
1257: return(0);
1258: }
1260: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1261: {
1263: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1264: {
1266: PetscInt i,disp;
1267: if (!srcIdx) {
1268: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
1269: } else {
1270: for (i=0; i<count; i++) {
1271: disp = dstIdx? dstIdx[i] : dstStart + i;
1272: MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
1273: }
1274: }
1275: }
1276: #else
1277: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1278: #endif
1279: return(0);
1280: }
1282: /*=============================================================================
1283: Pack/Unpack/Fetch/Scatter routines
1284: ============================================================================*/
1286: /* Pack rootdata to rootbuf
1287: Input Arguments:
1288: + sf - The SF this packing works on.
1289: . link - It gives the memtype of the roots and also provides root buffer.
1290: . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1291: - rootdata - Where to read the roots.
1293: Notes:
1294: When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, the root data is
1295: in a place where the underlying MPI can access it (whether use_gpu_aware_mpi is true or not)
1296: */
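/* A rough usage sketch (illustrative, not a verbatim call sequence from the communication code): for a broadcast,
   the root side packs and hands the buffer to MPI, and leaves are unpacked after MPI completes:

     PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);                 // roots -> link->rootbuf
     // ... MPI sends/receives the remote buffers (done elsewhere) ...
     PetscSFLinkUnpackLeafData(sf,link,PETSCSF_REMOTE,leafdata,MPIU_REPLACE);  // link->leafbuf -> leaves
*/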
1297: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1298: {
1299: PetscErrorCode ierr;
1300: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1301: const PetscInt *rootindices = NULL;
1302: PetscInt count,start;
1303: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1304: PetscMemType rootmtype = link->rootmtype;
1305: PetscSFPackOpt opt = NULL;
1309: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1310: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1311: if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1312: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1313: PetscSFLinkGetPack(link,rootmtype,&Pack);
1314: (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1315: }
1316: if (scope == PETSCSF_REMOTE) {
1317: PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1318: PetscSFLinkSyncStreamAfterPackRootData(sf,link);
1319: }
1320: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1321: return(0);
1322: }
1324: /* Pack leafdata to leafbuf */
1325: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1326: {
1327: PetscErrorCode ierr;
1328: const PetscInt *leafindices = NULL;
1329: PetscInt count,start;
1330: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1331: PetscMemType leafmtype = link->leafmtype;
1332: PetscSFPackOpt opt = NULL;
1335: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1336: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1337: if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1338: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1339: PetscSFLinkGetPack(link,leafmtype,&Pack);
1340: (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1341: }
1342: if (scope == PETSCSF_REMOTE) {
1343: PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1344: PetscSFLinkSyncStreamAfterPackLeafData(sf,link);
1345: }
1346: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1347: return(0);
1348: }
1350: /* Unpack rootbuf to rootdata */
1351: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1352: {
1353: PetscErrorCode ierr;
1354: const PetscInt *rootindices = NULL;
1355: PetscInt count,start;
1356: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1357: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1358: PetscMemType rootmtype = link->rootmtype;
1359: PetscSFPackOpt opt = NULL;
1362: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1363: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1364: if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1365: PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1366: if (UnpackAndOp) {
1367: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1368: (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1369: } else {
1370: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1371: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1372: }
1373: }
1374: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);}
1375: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1376: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1377: return(0);
1378: }
1380: /* Unpack leafbuf to leafdata */
1381: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1382: {
1383: PetscErrorCode ierr;
1384: const PetscInt *leafindices = NULL;
1385: PetscInt count,start;
1386: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1387: PetscMemType leafmtype = link->leafmtype;
1388: PetscSFPackOpt opt = NULL;
1391: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1392: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1393: if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1394: PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1395: if (UnpackAndOp) {
1396: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1397: (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1398: } else {
1399: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1400: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1401: }
1402: }
1403: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);}
1404: PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1405: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1406: return(0);
1407: }
1409: /* FetchAndOp rootdata with rootbuf */
1410: PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1411: {
1412: PetscErrorCode ierr;
1413: const PetscInt *rootindices = NULL;
1414: PetscInt count,start;
1415: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1416: PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1417: PetscMemType rootmtype = link->rootmtype;
1418: PetscSFPackOpt opt = NULL;
1421: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1422: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1423: if (bas->rootbuflen[scope]) {
1424: /* Do FetchAndOp on rootdata with rootbuf */
1425: PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);
1426: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1427: (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1428: }
1429: if (scope == PETSCSF_REMOTE) {
1430: PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);
1431: PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);
1432: }
1433: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1434: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1435: return(0);
1436: }
1438: /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1439: PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1440: {
1441: PetscErrorCode ierr;
1442: const PetscInt *rootindices = NULL,*leafindices = NULL;
1443: PetscInt count,rootstart,leafstart;
1444: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1445: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1446: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1447: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1450: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1451: if (rootmtype != leafmtype) { /* Uncommon case */
1452: /* The local communication has to go through pack and unpack */
1453: PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1454: PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);
1455: PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1456: } else {
1457: PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);
1458: if (ScatterAndOp) {
1459: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1460: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1461: (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1462: } else {
1463: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1464: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1465: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1466: }
1467: }
1468: return(0);
1469: }
1471: /* Reduce leafdata to rootdata locally */
1472: PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1473: {
1474: PetscErrorCode ierr;
1475: const PetscInt *rootindices = NULL,*leafindices = NULL;
1476: PetscInt count,rootstart,leafstart;
1477: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1478: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1479: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1480: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1483: if (!sf->leafbuflen[PETSCSF_LOCAL]) return(0);
1484: if (rootmtype != leafmtype) {
1485: /* The local communication has to go through pack and unpack */
1486: PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1487: PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);
1488: PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1489: } else {
1490: PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);
1491: if (ScatterAndOp) {
1492: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1493: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1494: (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1495: } else {
1496: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1497: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1498: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1499: }
1500: }
1501: return(0);
1502: }
1504: /* Fetch rootdata into leafupdate and reduce leafdata into rootdata locally */
1505: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1506: {
1507: PetscErrorCode ierr;
1508: const PetscInt *rootindices = NULL,*leafindices = NULL;
1509: PetscInt count,rootstart,leafstart;
1510: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1511: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1512: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1513: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1516: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1517: if (rootmtype != leafmtype) {
1518: /* The local communication has to go through pack and unpack */
1519: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1520: } else {
1521: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1522: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1523: PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1524: (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1525: }
1526: return(0);
1527: }
1529: /*
1530: Create per-rank pack/unpack optimizations based on index patterns
1532: Input Parameters:
1533: + n - Number of destination ranks
1534: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1535: - idx - [*] Array storing indices
1537: Output Parameters:
1538: . out - Pack optimizations. NULL if no optimization was found.
1539: */
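/* Worked example (illustrative): suppose one rank's indices are

     idx[] = {100,101,102,103, 110,111,112,113, 120,121,122,123}

   i.e., a 4x3 sub-block of a 2D array stored with row stride 10. The analysis below finds
   start=100, dx=4, dy=3, dz=1, X=10, Y=3, so packing this rank's entries becomes 3 contiguous copies
   of 4 entries each instead of 12 indexed loads.
*/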
1540: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1541: {
1543: PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1544: PetscBool optimizable = PETSC_TRUE;
1545: PetscSFPackOpt opt;
1548: PetscMalloc1(1,&opt);
1549: PetscMalloc1(7*n+2,&opt->array);
1550: opt->n = opt->array[0] = n;
1551: opt->offset = opt->array + 1;
1552: opt->start = opt->array + n + 2;
1553: opt->dx = opt->array + 2*n + 2;
1554: opt->dy = opt->array + 3*n + 2;
1555: opt->dz = opt->array + 4*n + 2;
1556: opt->X = opt->array + 5*n + 2;
1557: opt->Y = opt->array + 6*n + 2;
1559: for (r=0; r<n; r++) { /* For each destination rank */
1560: m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1561: p = offset[r];
1562: start = idx[p]; /* First index for this rank */
1563: p++;
1565: /* Search in X dimension */
1566: for (dx=1; dx<m; dx++,p++) {
1567: if (start+dx != idx[p]) break;
1568: }
1570: dydz = m/dx;
1571: X = dydz > 1 ? (idx[p]-start) : dx;
1572: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1573: if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1574: for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1575: for (i=0; i<dx; i++,p++) {
1576: if (start+X*dy+i != idx[p]) {
1577: if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1578: else goto Z_dimension;
1579: }
1580: }
1581: }
1583: Z_dimension:
1584: dz = m/(dx*dy);
1585: Y = dz > 1 ? (idx[p]-start)/X : dy;
1586: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1587: if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1588: for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1589: for (j=0; j<dy; j++) {
1590: for (i=0; i<dx; i++,p++) {
1591: if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1592: }
1593: }
1594: }
1595: opt->start[r] = start;
1596: opt->dx[r] = dx;
1597: opt->dy[r] = dy;
1598: opt->dz[r] = dz;
1599: opt->X[r] = X;
1600: opt->Y[r] = Y;
1601: }
1603: finish:
1604: /* If not optimizable, free arrays to save memory */
1605: if (!n || !optimizable) {
1606: PetscFree(opt->array);
1607: PetscFree(opt);
1608: *out = NULL;
1609: } else {
1610: opt->offset[0] = 0;
1611: for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1612: *out = opt;
1613: }
1614: return(0);
1615: }
1617: PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
1618: {
1620: PetscSFPackOpt opt = *out;
1623: if (opt) {
1624: if (mtype == PETSC_MEMTYPE_HOST) {PetscFree(opt->array);}
1625: #if defined(PETSC_HAVE_CUDA)
1626: else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;}
1627: #endif
1628: PetscFree(opt);
1629: *out = NULL;
1630: }
1631: return(0);
1632: }
1634: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1635: {
1637: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1638: PetscInt i,j;
1641: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1642: for (i=0; i<2; i++) { /* Set defaults */
1643: sf->leafstart[i] = 0;
1644: sf->leafcontig[i] = PETSC_TRUE;
1645: sf->leafdups[i] = PETSC_FALSE;
1646: bas->rootstart[i] = 0;
1647: bas->rootcontig[i] = PETSC_TRUE;
1648: bas->rootdups[i] = PETSC_FALSE;
1649: }
1651: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1652: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1654: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1655: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1657: /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1658: for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1659: if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1660: }
1661: for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1662: if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1663: }
1665: /* If not, see if we can have per-rank optimizations by doing index analysis */
1666: if (!sf->leafcontig[0]) {PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);}
1667: if (!sf->leafcontig[1]) {PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);}
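/* Example (illustrative): if the remote part of sf->rmine[] is {7,8,9,10}, then leafstart[1]=7 and leafcontig[1]
   stays PETSC_TRUE, so packing that scope is one contiguous copy. With irregular indices such as {3,1,4,1}, the
   contiguity test fails and PetscSFCreatePackOpt() finds no strided pattern either, so leafpackopt[1] is NULL and
   the generic indexed pack is used. */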
1669: /* Are root indices for self and remote contiguous? */
1670: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1671: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1673: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1674: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1676: for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1677: if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1678: }
1679: for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1680: if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1681: }
1683: if (!bas->rootcontig[0]) {PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);}
1684: if (!bas->rootcontig[1]) {PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);}
1686: #if defined(PETSC_HAVE_CUDA)
1687: /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1692: #endif
1694: return(0);
1695: }
1697: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1698: {
1700: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1701: PetscInt i;
1704: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1705: PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1706: PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1707: PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1708: PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1709: }
1710: return(0);
1711: }