Actual source code: sfpackcuda.cu

petsc-master 2020-08-25
  1:  #include <../src/vec/is/sf/impls/basic/sfpack.h>
  2: #include <cuda_runtime.h>

  4: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
  5: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
  6: {
  7:   PetscInt        i,j,k,m,n,r;
  8:   const PetscInt  *offset,*start,*dx,*dy,*X,*Y;

 10:   n      = opt[0];
 11:   offset = opt + 1;
 12:   start  = opt + n + 2;
 13:   dx     = opt + 2*n + 2;
 14:   dy     = opt + 3*n + 2;
 15:   X      = opt + 5*n + 2;
 16:   Y      = opt + 6*n + 2;
 17:   for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
 18:   m = (tid - offset[r]);
 19:   k = m/(dx[r]*dy[r]);
 20:   j = (m - k*dx[r]*dy[r])/dx[r];
 21:   i = m - k*dx[r]*dy[r] - j*dx[r];

 23:   return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
 24: }
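
/* Worked example of the mapping above (hypothetical values): take a single subdomain, n = 1, with
   offset = {0,24}, start = {7}, dx = {2}, dy = {3}, X = {4}, Y = {5}. A thread with tid = 10 falls
   in subdomain r = 0 (since 10 < offset[1] = 24) and gets m = 10, k = 10/(2*3) = 1, j = (10-6)/2 = 2,
   i = 10 - 6 - 4 = 0, so it is mapped to index 7 + 1*4*5 + 2*4 + 0 = 35 in root/leaf space. */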

 26: /*====================================================================================*/
 27: /*  Templated CUDA kernels for pack/unpack. The Op can be regular or atomic           */
 28: /*====================================================================================*/

 30: /* Suppose the user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI datatype made of 16 PetscReals; then
 31:    <Type> is PetscReal, which is the primitive type we operate on.
 32:    <bs>   is 16, which says <unit> contains 16 primitive types.
 33:    <BS>   is 8, which is the maximal SIMD width with which we will try to vectorize operations on <unit>.
 34:    <EQ>   is 0, which is (bs == BS ? 1 : 0).

 36:  If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
 37:  For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are totally unrolled.
 38: */
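/* For instance, bs=16, BS=8, EQ=0 gives M = bs/BS = 2 and MBS = 16, so each thread below moves 16
   primitive values per unit, with bs known only at run time; bs=8, BS=8, EQ=1 gives M = 1 and
   MBS = BS = 8 as compile-time constants, letting the compiler fully unroll the inner loops. */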
 39: template<class Type,PetscInt BS,PetscInt EQ>
 40: __global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
 41: {
 42:   PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 43:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 44:   const PetscInt  M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
 45:   const PetscInt  MBS = M*BS;  /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */

 47:   for (; tid<count; tid += grid_size) {
 48:     /* opt != NULL ==> idx == NULL, i.e., the indices follow a pattern but are not contiguous;
 49:        opt == NULL && idx == NULL ==> the indices are contiguous;
 50:      */
 51:     t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
 52:     s = tid*MBS;
 53:     for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
 54:   }
 55: }

 57: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 58: __global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
 59: {
 60:   PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 61:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 62:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 63:   Op              op;

 65:   for (; tid<count; tid += grid_size) {
 66:     t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
 67:     s = tid*MBS;
 68:     for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
 69:   }
 70: }

 72: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 73: __global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
 74: {
 75:   PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
 76:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 77:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 78:   Op              op;

 80:   for (; tid<count; tid += grid_size) {
 81:     r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
 82:     l = tid*MBS;
 83:     for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
 84:   }
 85: }

 87: template<class Type,class Op,PetscInt BS,PetscInt EQ>
 88: __global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt* srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
 89: {
 90:   PetscInt        i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
 91:   const PetscInt  grid_size = gridDim.x * blockDim.x;
 92:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
 93:   Op              op;

 95:   for (; tid<count; tid += grid_size) {
 96:     if (!srcIdx) { /* src is either contiguous or 3D */
 97:       k = tid/(srcx*srcy);
 98:       j = (tid - k*srcx*srcy)/srcx;
 99:       i = tid - k*srcx*srcy - j*srcx;
100:       s = srcStart + k*srcX*srcY + j*srcX + i;
101:     } else {
102:       s = srcIdx[tid];
103:     }

105:     if (!dstIdx) { /* dst is either contiguous or 3D */
106:       k = tid/(dstx*dsty);
107:       j = (tid - k*dstx*dsty)/dstx;
108:       i = tid - k*dstx*dsty - j*dstx;
109:       t = dstStart + k*dstX*dstY + j*dstX + i;
110:     } else {
111:       t = dstIdx[tid];
112:     }

114:     s *= MBS;
115:     t *= MBS;
116:     for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
117:   }
118: }

120: template<class Type,class Op,PetscInt BS,PetscInt EQ>
121: __global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
122: {
123:   PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
124:   const PetscInt  grid_size = gridDim.x * blockDim.x;
125:   const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
126:   Op              op;

128:   for (; tid<count; tid += grid_size) {
129:     r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
130:     l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
131:     for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
132:   }
133: }

135: /*====================================================================================*/
136: /*                             Regular operations on device                           */
137: /*====================================================================================*/
138: template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = y;             return old;}};
139: template<typename Type> struct Add    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y;             return old;}};
140: template<typename Type> struct Mult   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y;             return old;}};
141: template<typename Type> struct Min    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
142: template<typename Type> struct Max    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
143: template<typename Type> struct LAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
144: template<typename Type> struct LOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
145: template<typename Type> struct LXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
146: template<typename Type> struct BAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
147: template<typename Type> struct BOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
148: template<typename Type> struct BXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
149: template<typename Type> struct Minloc {
150:   __device__ Type operator() (Type& x,Type y) const {
151:     Type old = x;
152:     if (y.a < x.a) x = y;
153:     else if (y.a == x.a) x.b = min(x.b,y.b);
154:     return old;
155:   }
156: };
157: template<typename Type> struct Maxloc {
158:   __device__ Type operator() (Type& x,Type y) const {
159:     Type old = x;
160:     if (y.a > x.a) x = y;
161:     else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC: on ties, the smaller index wins */
162:     return old;
163:   }
164: };
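
/* These functors modify x in place and return its old value; d_FetchAndOp and d_FetchAndOpLocal above
   rely on that return value. For instance, applying Insert<double> to x = 1.0 with y = 2.0 leaves
   x = 2.0 and returns 1.0. */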

166: /*====================================================================================*/
167: /*                             Atomic operations on device                            */
168: /*====================================================================================*/

170: /*
171:   Atomic Insert (exchange) operations

173:   CUDA C Programming Guide V10.1 Chapter B.12.1.3:

175:   int atomicExch(int* address, int val);
176:   unsigned int atomicExch(unsigned int* address, unsigned int val);
177:   unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
178:   float atomicExch(float* address, float val);

180:   reads the 32-bit or 64-bit word old located at the address address in global or shared
181:   memory and stores val back to memory at the same address. These two operations are
182:   performed in one atomic transaction. The function returns old.

184:   PETSc notes:

186:   It may be useful in PetscSFFetchAndOp with op = MPIU_REPLACE.

188:   VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
189:   atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
190:   itself be atomic.

192:   With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee
193:   that the whole insertion is atomic. We hope no user code relies on that.
194: */
195: __device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((unsigned long long int*)address,__double_as_longlong(val)));}

197: #if defined(PETSC_USE_64BIT_INDICES)
198: __device__ static PetscInt atomicExch(PetscInt* address,PetscInt val) {return (PetscInt)(atomicExch((unsigned long long int*)address,(unsigned long long int)val));}
199: #endif

201: template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};

203: #if defined(PETSC_HAVE_COMPLEX)
204: #if defined(PETSC_USE_REAL_DOUBLE)
205: /* CUDA does not support 128-bit atomics. Users should not insert different 128-bit PetscComplex values to the same location */
206: template<> struct AtomicInsert<PetscComplex> {
207:   __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
208:     PetscComplex         old, *z = &old;
209:     double               *xp = (double*)&x,*yp = (double*)&y;
210:     AtomicInsert<double> op;
211:     z[0] = op(xp[0],yp[0]);
212:     z[1] = op(xp[1],yp[1]);
213:     return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
214:   }
215: };
216: #elif defined(PETSC_USE_REAL_SINGLE)
217: template<> struct AtomicInsert<PetscComplex> {
218:   __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
219:     double               *xp = (double*)&x,*yp = (double*)&y;
220:     AtomicInsert<double> op;
221:     return op(xp[0],yp[0]);
222:   }
223: };
224: #endif
225: #endif

227: /*
228:   Atomic add operations

230:   CUDA C Programming Guide V10.1 Chapter B.12.1.1:

232:   int atomicAdd(int* address, int val);
233:   unsigned int atomicAdd(unsigned int* address,unsigned int val);
234:   unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
235:   float atomicAdd(float* address, float val);
236:   double atomicAdd(double* address, double val);
237:   __half2 atomicAdd(__half2 *address, __half2 val);
238:   __half atomicAdd(__half *address, __half val);

240:   reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
241:   and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
242:   function returns old.

244:   The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
245:   The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
246:   The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
247:   higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
248:   the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
249:   The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
250: */

252: #if defined(PETSC_USE_64BIT_INDICES)
253: __device__ static PetscInt atomicAdd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAdd((unsigned long long int*)address,(unsigned long long int)val);}
254: #endif

256: template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};

258: template<> struct AtomicAdd<double> {
259:   __device__ double operator() (double& x,double y) const {
260: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
261:     return atomicAdd(&x,y);
262: #else
263:     double                 *address = &x, val = y;
264:     unsigned long long int *address_as_ull = (unsigned long long int*)address;
265:     unsigned long long int old = *address_as_ull, assumed;
266:     do {
267:       assumed = old;
268:       old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
269:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
270:     } while (assumed != old);
271:     return __longlong_as_double(old);
272: #endif
273:   }
274: };

276: template<> struct AtomicAdd<float> {
277:   __device__ float operator() (float& x,float y) const {
278: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
279:     return atomicAdd(&x,y);
280: #else
281:     float *address = &x, val = y;
282:     int   *address_as_int = (int*)address;
283:     int   old = *address_as_int, assumed;
284:     do {
285:       assumed = old;
286:       old     = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
287:       /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
288:     } while (assumed != old);
289:     return __int_as_float(old);
290: #endif
291:   }
292: };

294: #if defined(PETSC_HAVE_COMPLEX)
295: template<> struct AtomicAdd<PetscComplex> {
296:  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
297:   PetscComplex         old, *z = &old;
298:   PetscReal            *xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
299:   AtomicAdd<PetscReal> op;
300:   z[0] = op(xp[0],yp[0]);
301:   z[1] = op(xp[1],yp[1]);
302:   return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
303:  }
304: };
305: #endif

307: /*
308:   Atomic Mult operations:

310:   CUDA has no atomicMult at all, so we build our own with atomicCAS
311:  */
312: #if defined(PETSC_USE_REAL_DOUBLE)
313: __device__ static double atomicMult(double* address, double val)
314: {
315:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
316:   unsigned long long int old = *address_as_ull, assumed;
317:   do {
318:     assumed = old;
319:     /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
320:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
321:   } while (assumed != old);
322:   return __longlong_as_double(old);
323: }
324: #elif defined(PETSC_USE_REAL_SINGLE)
325: __device__ static float atomicMult(float* address,float val)
326: {
327:   int *address_as_int = (int*)(address);
328:   int old = *address_as_int, assumed;
329:   do {
330:     assumed  = old;
331:     old      = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
332:   } while (assumed != old);
333:   return __int_as_float(old);
334: }
335: #endif

337: __device__ static int atomicMult(int* address,int val)
338: {
339:   int *address_as_int = (int*)(address);
340:   int old = *address_as_int, assumed;
341:   do {
342:     assumed = old;
343:     old     = atomicCAS(address_as_int, assumed, val*assumed);
344:   } while (assumed != old);
345:   return (int)old;
346: }

348: #if defined(PETSC_USE_64BIT_INDICES)
349: __device__ static PetscInt atomicMult(PetscInt* address,PetscInt val)
350: {
351:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
352:   unsigned long long int old = *address_as_ull, assumed;
353:   do {
354:     assumed = old;
355:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val*(PetscInt)assumed));
356:   } while (assumed != old);
357:   return (PetscInt)old;
358: }
359: #endif

361: template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};

363: /*
364:   Atomic Min/Max operations

366:   CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:

368:   int atomicMin(int* address, int val);
369:   unsigned int atomicMin(unsigned int* address,unsigned int val);
370:   unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);

372:   reads the 32-bit or 64-bit word old located at the address address in global or shared
373:   memory, computes the minimum of old and val, and stores the result back to memory
374:   at the same address. These three operations are performed in one atomic transaction.
375:   The function returns old.
376:   The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.

378:   atomicMax() is similar.
379:  */

381: #if defined(PETSC_USE_REAL_DOUBLE)
382: __device__ static double atomicMin(double* address, double val)
383: {
384:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
385:   unsigned long long int old = *address_as_ull, assumed;
386:   do {
387:     assumed = old;
388:     old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
389:   } while (assumed != old);
390:   return __longlong_as_double(old);
391: }

393: __device__ static double atomicMax(double* address, double val)
394: {
395:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
396:   unsigned long long int old = *address_as_ull, assumed;
397:   do {
398:     assumed  = old;
399:     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
400:   } while (assumed != old);
401:   return __longlong_as_double(old);
402: }
403: #elif defined(PETSC_USE_REAL_SINGLE)
404: __device__ static float atomicMin(float* address,float val)
405: {
406:   int *address_as_int = (int*)(address);
407:   int old = *address_as_int, assumed;
408:   do {
409:     assumed = old;
410:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
411:   } while (assumed != old);
412:   return __int_as_float(old);
413: }

415: __device__ static float atomicMax(float* address,float val)
416: {
417:   int *address_as_int = (int*)(address);
418:   int old = *address_as_int, assumed;
419:   do {
420:     assumed = old;
421:     old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
422:   } while (assumed != old);
423:   return __int_as_float(old);
424: }
425: #endif

427: /*
428:   atomicMin/Max(long long *, long long) are not in Nvidia's documentation. But on OLCF Summit we found
429:   atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
430:   This causes compilation errors with pgi compilers and 64-bit indices:
431:       error: function "atomicMin(long long *, long long)" has already been defined

433:   So we add extra conditions defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
434: */
435: #if defined(PETSC_USE_64BIT_INDICES) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
436: __device__ static PetscInt atomicMin(PetscInt* address,PetscInt val)
437: {
438:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
439:   unsigned long long int old = *address_as_ull, assumed;
440:   do {
441:     assumed = old;
442:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMin(val,(PetscInt)assumed)));
443:   } while (assumed != old);
444:   return (PetscInt)old;
445: }

447: __device__ static PetscInt atomicMax(PetscInt* address,PetscInt val)
448: {
449:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
450:   unsigned long long int old = *address_as_ull, assumed;
451:   do {
452:     assumed = old;
453:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(PetscMax(val,(PetscInt)assumed)));
454:   } while (assumed != old);
455:   return (PetscInt)old;
456: }
457: #endif

459: template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
460: template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};

462: /*
463:   Atomic bitwise operations

465:   CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:

467:   int atomicAnd(int* address, int val);
468:   unsigned int atomicAnd(unsigned int* address,unsigned int val);
469:   unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);

471:   reads the 32-bit or 64-bit word old located at the address address in global or shared
472:   memory, computes (old & val), and stores the result back to memory at the same
473:   address. These three operations are performed in one atomic transaction.
474:   The function returns old.

476:   The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.

478:   atomicOr() and atomicXor are similar.
479: */

481: #if defined(PETSC_USE_64BIT_INDICES)
482: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin(PetscInt* address,PetscInt val) */
483: __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val)
484: {
485:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
486:   unsigned long long int old = *address_as_ull, assumed;
487:   do {
488:     assumed = old;
489:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val & (PetscInt)assumed));
490:   } while (assumed != old);
491:   return (PetscInt)old;
492: }
493: __device__ static PetscInt atomicOr(PetscInt* address,PetscInt val)
494: {
495:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
496:   unsigned long long int old = *address_as_ull, assumed;
497:   do {
498:     assumed = old;
499:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val | (PetscInt)assumed));
500:   } while (assumed != old);
501:   return (PetscInt)old;
502: }

504: __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val)
505: {
506:   unsigned long long int *address_as_ull = (unsigned long long int*)(address);
507:   unsigned long long int old = *address_as_ull, assumed;
508:   do {
509:     assumed = old;
510:     old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(val ^ (PetscInt)assumed));
511:   } while (assumed != old);
512:   return (PetscInt)old;
513: }
514: #else
515: /*
516:  See also comments at atomicMin(PetscInt* address,PetscInt val)
517: __device__ static PetscInt atomicAnd(PetscInt* address,PetscInt val) {return (PetscInt)atomicAnd((unsigned long long int*)address,(unsigned long long int)val);}
518: __device__ static PetscInt atomicOr (PetscInt* address,PetscInt val) {return (PetscInt)atomicOr ((unsigned long long int*)address,(unsigned long long int)val);}
519: __device__ static PetscInt atomicXor(PetscInt* address,PetscInt val) {return (PetscInt)atomicXor((unsigned long long int*)address,(unsigned long long int)val);}
520: */
521: #endif
522: #endif

524: template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
525: template<typename Type> struct AtomicBOR  {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
526: template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};

528: /*
529:   Atomic logical operations:

531:   CUDA has no atomic logical operations at all. We support them on integer types.
532: */

534: /* A template without a definition makes any instantiation that does not match one of the given specializations
535:    erroneous at compile time, which is what we want, since we only support 32-bit and 64-bit integers.
536:  */
537: template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;
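
/* For example, a hypothetical instantiation with a 2-byte integer type, such as AtomicLogical<short,land<short>,2>,
   matches none of the specializations below, so any attempt to use it fails at compile time because the primary
   template has no definition. */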

539: template<typename Type,class Op>
540: struct AtomicLogical<Type,Op,4> {
541:   __device__ Type operator()(Type& x,Type y) const {
542:     int *address_as_int = (int*)(&x);
543:     int old = *address_as_int, assumed;
544:     Op op;
545:     do {
546:       assumed = old;
547:       old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
548:     } while (assumed != old);
549:     return (Type)old;
550:   }
551: };

553: template<typename Type,class Op>
554: struct AtomicLogical<Type,Op,8> {
555:   __device__ Type operator()(Type& x,Type y) const {
556:     unsigned long long int *address_as_ull = (unsigned long long int*)(&x);
557:     unsigned long long int old = *address_as_ull, assumed;
558:     Op op;
559:     do {
560:       assumed = old;
561:       old     = atomicCAS(address_as_ull, assumed, (unsigned long long int)(op((Type)assumed,y)));
562:     } while (assumed != old);
563:     return (Type)old;
564:   }
565: };

567: /* Note land/lor/lxor below are different from LAND etc. above: here we pass arguments by value and return the result of the op (not the old value) */
568: template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
569: template<typename Type> struct lor  {__device__ Type operator()(Type x, Type y) {return x || y;}};
570: template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};

572: template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
573: template<typename Type> struct AtomicLOR  {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
574: template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};

576: /*====================================================================================*/
577: /*  Wrapper functions of CUDA kernels. Function pointers are stored in 'link'         */
578: /*====================================================================================*/
579: template<typename Type,PetscInt BS,PetscInt EQ>
580: static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
581: {
582:   cudaError_t        cerr;
583:   PetscInt           nthreads=256;
584:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
585:   const PetscInt     *iarray=opt ? opt->array : NULL;

588:   if (!count) return(0);
589:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
590:   d_Pack<Type,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
591:   cerr = cudaGetLastError();CHKERRCUDA(cerr);
592:   return(0);
593: }

595: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
596: static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
597: {
598:   cudaError_t        cerr;
599:   PetscInt           nthreads=256;
600:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
601:   const PetscInt     *iarray=opt ? opt->array : NULL;

604:   if (!count) return(0);
605:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
606:   d_UnpackAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
607:   cerr = cudaGetLastError();CHKERRCUDA(cerr);
608:   return(0);
609: }

611: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
612: static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
613: {
614:   cudaError_t        cerr;
615:   PetscInt           nthreads=256;
616:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
617:   const PetscInt     *iarray=opt ? opt->array : NULL;

620:   if (!count) return(0);
621:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
622:   d_FetchAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
623:   cerr = cudaGetLastError();CHKERRCUDA(cerr);
624:   return(0);
625: }

627: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
628: static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
629: {
630:   cudaError_t        cerr;
631:   PetscInt           nthreads=256;
632:   PetscInt           nblocks=(count+nthreads-1)/nthreads;
633:   PetscInt           srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;

636:   if (!count) return(0);
637:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);

639:   /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use a CUDA 3D grid and block */
640:   if (srcOpt)       {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
641:   else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}

643:   if (dstOpt)       {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
644:   else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}

646:   d_ScatterAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
647:   cerr = cudaGetLastError();CHKERRCUDA(cerr);
648:   return(0);
649: }

651: /* Specialization for Insert since we may use cudaMemcpyAsync */
652: template<typename Type,PetscInt BS,PetscInt EQ>
653: static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
654: {
655:   PetscErrorCode    ierr;
656:   cudaError_t       cerr;

659:   if (!count) return(0);
660:   /* src and dst are contiguous */
661:   if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
662:     cerr = cudaMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(cerr);
663:   } else {
664:     ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);
665:   }
666:   return(0);
667: }

669: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
670: static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
671: {
672:   cudaError_t       cerr;
673:   PetscInt          nthreads=256;
674:   PetscInt          nblocks=(count+nthreads-1)/nthreads;
675:   const PetscInt    *rarray = rootopt ? rootopt->array : NULL;
676:   const PetscInt    *larray = leafopt ? leafopt->array : NULL;

679:   if (!count) return(0);
680:   nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
681:   d_FetchAndOpLocal<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
682:   cerr = cudaGetLastError();CHKERRCUDA(cerr);
683:   return(0);
684: }

686: /*====================================================================================*/
687: /*  Init various types and instantiate pack/unpack function pointers                  */
688: /*====================================================================================*/
689: template<typename Type,PetscInt BS,PetscInt EQ>
690: static void PackInit_RealType(PetscSFLink link)
691: {
692:   /* Pack/unpack for remote communication */
693:   link->d_Pack              = Pack<Type,BS,EQ>;
694:   link->d_UnpackAndInsert   = UnpackAndOp     <Type,Insert<Type>      ,BS,EQ>;
695:   link->d_UnpackAndAdd      = UnpackAndOp     <Type,Add<Type>         ,BS,EQ>;
696:   link->d_UnpackAndMult     = UnpackAndOp     <Type,Mult<Type>        ,BS,EQ>;
697:   link->d_UnpackAndMin      = UnpackAndOp     <Type,Min<Type>         ,BS,EQ>;
698:   link->d_UnpackAndMax      = UnpackAndOp     <Type,Max<Type>         ,BS,EQ>;
699:   link->d_FetchAndAdd       = FetchAndOp      <Type,Add<Type>         ,BS,EQ>;

701:   /* Scatter for local communication */
702:   link->d_ScatterAndInsert  = ScatterAndInsert<Type                   ,BS,EQ>; /* Has special optimizations */
703:   link->d_ScatterAndAdd     = ScatterAndOp    <Type,Add<Type>         ,BS,EQ>;
704:   link->d_ScatterAndMult    = ScatterAndOp    <Type,Mult<Type>        ,BS,EQ>;
705:   link->d_ScatterAndMin     = ScatterAndOp    <Type,Min<Type>         ,BS,EQ>;
706:   link->d_ScatterAndMax     = ScatterAndOp    <Type,Max<Type>         ,BS,EQ>;
707:   link->d_FetchAndAddLocal  = FetchAndOpLocal <Type,Add <Type>        ,BS,EQ>;

709:   /* Atomic versions when there are data-race possibilities */
710:   link->da_UnpackAndInsert  = UnpackAndOp     <Type,AtomicInsert<Type>,BS,EQ>;
711:   link->da_UnpackAndAdd     = UnpackAndOp     <Type,AtomicAdd<Type>   ,BS,EQ>;
712:   link->da_UnpackAndMult    = UnpackAndOp     <Type,AtomicMult<Type>  ,BS,EQ>;
713:   link->da_UnpackAndMin     = UnpackAndOp     <Type,AtomicMin<Type>   ,BS,EQ>;
714:   link->da_UnpackAndMax     = UnpackAndOp     <Type,AtomicMax<Type>   ,BS,EQ>;
715:   link->da_FetchAndAdd      = FetchAndOp      <Type,AtomicAdd<Type>   ,BS,EQ>;

717:   link->da_ScatterAndInsert = ScatterAndOp    <Type,AtomicInsert<Type>,BS,EQ>;
718:   link->da_ScatterAndAdd    = ScatterAndOp    <Type,AtomicAdd<Type>   ,BS,EQ>;
719:   link->da_ScatterAndMult   = ScatterAndOp    <Type,AtomicMult<Type>  ,BS,EQ>;
720:   link->da_ScatterAndMin    = ScatterAndOp    <Type,AtomicMin<Type>   ,BS,EQ>;
721:   link->da_ScatterAndMax    = ScatterAndOp    <Type,AtomicMax<Type>   ,BS,EQ>;
722:   link->da_FetchAndAddLocal = FetchAndOpLocal <Type,AtomicAdd<Type>   ,BS,EQ>;
723: }

725: /* This templated class exists so that it can be specialized for char-sized integers */
726: template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
727: struct PackInit_IntegerType_Atomic {
728:   static void Init(PetscSFLink link) {
729:     link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
730:     link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
731:     link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
732:     link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
733:     link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
734:     link->da_UnpackAndLAND    = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
735:     link->da_UnpackAndLOR     = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
736:     link->da_UnpackAndLXOR    = UnpackAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
737:     link->da_UnpackAndBAND    = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
738:     link->da_UnpackAndBOR     = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
739:     link->da_UnpackAndBXOR    = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
740:     link->da_FetchAndAdd      = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;

742:     link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
743:     link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
744:     link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
745:     link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
746:     link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
747:     link->da_ScatterAndLAND   = ScatterAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
748:     link->da_ScatterAndLOR    = ScatterAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
749:     link->da_ScatterAndLXOR   = ScatterAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
750:     link->da_ScatterAndBAND   = ScatterAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
751:     link->da_ScatterAndBOR    = ScatterAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
752:     link->da_ScatterAndBXOR   = ScatterAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
753:     link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
754:   }
755: };

757: /* CUDA does not support atomics on chars. It is TBD in PETSc. */
758: template<typename Type,PetscInt BS,PetscInt EQ>
759: struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
760:   static void Init(PetscSFLink link) {/* Nothing to do; leave the function pointers NULL */}
761: };

763: template<typename Type,PetscInt BS,PetscInt EQ>
764: static void PackInit_IntegerType(PetscSFLink link)
765: {
766:   link->d_Pack            = Pack<Type,BS,EQ>;
767:   link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
768:   link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
769:   link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
770:   link->d_UnpackAndMin    = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
771:   link->d_UnpackAndMax    = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
772:   link->d_UnpackAndLAND   = UnpackAndOp<Type,LAND<Type>  ,BS,EQ>;
773:   link->d_UnpackAndLOR    = UnpackAndOp<Type,LOR<Type>   ,BS,EQ>;
774:   link->d_UnpackAndLXOR   = UnpackAndOp<Type,LXOR<Type>  ,BS,EQ>;
775:   link->d_UnpackAndBAND   = UnpackAndOp<Type,BAND<Type>  ,BS,EQ>;
776:   link->d_UnpackAndBOR    = UnpackAndOp<Type,BOR<Type>   ,BS,EQ>;
777:   link->d_UnpackAndBXOR   = UnpackAndOp<Type,BXOR<Type>  ,BS,EQ>;
778:   link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

780:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
781:   link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
782:   link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
783:   link->d_ScatterAndMin    = ScatterAndOp<Type,Min<Type>   ,BS,EQ>;
784:   link->d_ScatterAndMax    = ScatterAndOp<Type,Max<Type>   ,BS,EQ>;
785:   link->d_ScatterAndLAND   = ScatterAndOp<Type,LAND<Type>  ,BS,EQ>;
786:   link->d_ScatterAndLOR    = ScatterAndOp<Type,LOR<Type>   ,BS,EQ>;
787:   link->d_ScatterAndLXOR   = ScatterAndOp<Type,LXOR<Type>  ,BS,EQ>;
788:   link->d_ScatterAndBAND   = ScatterAndOp<Type,BAND<Type>  ,BS,EQ>;
789:   link->d_ScatterAndBOR    = ScatterAndOp<Type,BOR<Type>   ,BS,EQ>;
790:   link->d_ScatterAndBXOR   = ScatterAndOp<Type,BXOR<Type>  ,BS,EQ>;
791:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
792:   PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
793: }

795: #if defined(PETSC_HAVE_COMPLEX)
796: template<typename Type,PetscInt BS,PetscInt EQ>
797: static void PackInit_ComplexType(PetscSFLink link)
798: {
799:   link->d_Pack             = Pack<Type,BS,EQ>;
800:   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
801:   link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
802:   link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
803:   link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

805:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
806:   link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
807:   link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
808:   link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;

810:   link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
811:   link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
812:   link->da_UnpackAndMult   = NULL; /* Not implemented yet */
813:   link->da_FetchAndAdd     = NULL; /* Return value of atomicAdd on complex is not atomic */

815:   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
816:   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
817: }
818: #endif

820: typedef signed char                      SignedChar;
821: typedef unsigned char                    UnsignedChar;
822: typedef struct {int a;      int b;     } PairInt;
823: typedef struct {PetscInt a; PetscInt b;} PairPetscInt;

825: template<typename Type>
826: static void PackInit_PairType(PetscSFLink link)
827: {
828:   link->d_Pack            = Pack<Type,1,1>;
829:   link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
830:   link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
831:   link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;

833:   link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
834:   link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
835:   link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
836:   /* Atomics for pair types are not implemented yet */
837: }

839: template<typename Type,PetscInt BS,PetscInt EQ>
840: static void PackInit_DumbType(PetscSFLink link)
841: {
842:   link->d_Pack             = Pack<Type,BS,EQ>;
843:   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
844:   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
845:   /* Atomics for dumb types are not implemented yet */
846: }

848: /*====================================================================================*/
849: /*                Main driver to init MPI datatype on device                          */
850: /*====================================================================================*/

852: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
853: PetscErrorCode PetscSFLinkSetUp_Device(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
854: {
856:   cudaError_t    err;
857:   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
858:   PetscBool      is2Int,is2PetscInt;
859: #if defined(PETSC_HAVE_COMPLEX)
860:   PetscInt       nPetscComplex=0;
861: #endif

864:   if (link->deviceinited) return(0);
865:   MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);
866:   MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
867:   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
868:   MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);
869:   MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
870:   MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
871: #if defined(PETSC_HAVE_COMPLEX)
872:   MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
873: #endif
874:   MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
875:   MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);

877:   if (is2Int) {
878:     PackInit_PairType<PairInt>(link);
879:   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
880:     PackInit_PairType<PairPetscInt>(link);
881:   } else if (nPetscReal) {
882:     if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
883:     else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
884:     else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
885:     else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
886:   } else if (nPetscInt) {
887:     if      (nPetscInt == 8) PackInit_IntegerType<PetscInt,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<PetscInt,8,0>(link);
888:     else if (nPetscInt == 4) PackInit_IntegerType<PetscInt,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<PetscInt,4,0>(link);
889:     else if (nPetscInt == 2) PackInit_IntegerType<PetscInt,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<PetscInt,2,0>(link);
890:     else if (nPetscInt == 1) PackInit_IntegerType<PetscInt,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<PetscInt,1,0>(link);
891: #if defined(PETSC_USE_64BIT_INDICES)
892:   } else if (nInt) {
893:     if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
894:     else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
895:     else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
896:     else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
897: #endif
898:   } else if (nSignedChar) {
899:     if      (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
900:     else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
901:     else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
902:     else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
903:   }  else if (nUnsignedChar) {
904:     if      (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
905:     else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
906:     else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
907:     else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
908: #if defined(PETSC_HAVE_COMPLEX)
909:   } else if (nPetscComplex) {
910:     if      (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
911:     else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
912:     else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
913:     else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
914: #endif
915:   } else {
916:     MPI_Aint lb,nbyte;
917:     MPI_Type_get_extent(unit,&lb,&nbyte);
918:     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
919:     if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
920:       if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
921:       else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
922:       else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
923:     } else {
924:       nInt = nbyte / sizeof(int);
925:       if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
926:       else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
927:       else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
928:       else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
929:     }
930:   }

932:   if (!sf->use_default_stream) {err = cudaStreamCreate(&link->stream);CHKERRCUDA(err);}
933:   if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
934:     int                   device;
935:     struct cudaDeviceProp props;
936:     err = cudaGetDevice(&device);CHKERRCUDA(err);
937:     err = cudaGetDeviceProperties(&props,device);CHKERRCUDA(err);
938:     sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
939:   }
940:   link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;
941:   link->deviceinited             = PETSC_TRUE;
942:   return(0);
943: }
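
/* Dispatch example for the ladder above (hypothetical unit): a unit made of 3 MPIU_REALs gives nPetscReal = 3,
   which matches none of the ==8/==4/==2 or %8/%4/%2 cases, so it falls through to
   PackInit_RealType<PetscReal,1,0>(link), i.e. BS=1, EQ=0, with bs=3 known only at run time. */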