/*
   Actual source code: vecscattercusp.cu
   petsc-3.5.4 2015-05-23
   Report Typos and Errors
*/
  1: /*
  2:    Implements the various scatter operations on cusp vectors
  3: */

  5: #include <petscconf.h>
  6: PETSC_CUDA_EXTERN_C_BEGIN
  7: #include <petsc-private/vecimpl.h>          /*I "petscvec.h" I*/
  8: #include <../src/vec/vec/impls/dvecimpl.h>
  9: PETSC_CUDA_EXTERN_C_END
 10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

 12: #include <cuda_runtime.h>

 16: PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep, PetscInt fromStep,PetscInt *tslots, PetscInt *fslots,PetscCUSPIndices *ci) {

 18:   PetscCUSPIndices           cci;
 19:   VecScatterCUSPIndices_StoS stos_scatter;
 20:   cudaError_t                err = cudaSuccess;
 21:   cudaStream_t               stream;
 22:   PetscInt                   *intVecGPU;
 23:   int                        device;
 24:   cudaDeviceProp             props;

 27:   cci = new struct _p_PetscCUSPIndices;
 28:   stos_scatter = new struct _p_VecScatterCUSPIndices_StoS;

 30:   /* create the "from" indices */
 31:   stos_scatter->fslots = 0;
 32:   stos_scatter->fromFirst = 0;
 33:   stos_scatter->fromStep = 0;
 34:   if (n) {
 35:     if (fslots) {
 36:       /* allocate GPU memory for the to-slots */
 37:       err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
 38:       err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

 40:       /* assign the pointer to the struct */
 41:       stos_scatter->fslots = intVecGPU;
 42:       stos_scatter->fromMode = VEC_SCATTER_CUSP_GENERAL;
 43:     } else if (fromStep) {
 44:       stos_scatter->fromFirst = fromFirst;
 45:       stos_scatter->fromStep = fromStep;
 46:       stos_scatter->fromMode = VEC_SCATTER_CUSP_STRIDED;
 47:     }
 48:   }

 50:   /* create the "to" indices */
 51:   stos_scatter->tslots = 0;
 52:   stos_scatter->toFirst = 0;
 53:   stos_scatter->toStep = 0;
 54:   if (n) {
 55:     if (tslots) {
 56:       /* allocate GPU memory for the to-slots */
 57:       err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
 58:       err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

 60:       /* assign the pointer to the struct */
 61:       stos_scatter->tslots = intVecGPU;
 62:       stos_scatter->toMode = VEC_SCATTER_CUSP_GENERAL;
 63:     } else if (toStep) {
 64:       stos_scatter->toFirst = toFirst;
 65:       stos_scatter->toStep = toStep;
 66:       stos_scatter->toMode = VEC_SCATTER_CUSP_STRIDED;
 67:     }
 68:   }

 70:   /* allocate the stream variable */
 71:   err = cudaStreamCreate(&stream);CHKERRCUSP((int)err);
 72:   stos_scatter->stream = stream;

 74:   /* the number of indices */
 75:   stos_scatter->n = n;

 77:   /* get the maximum number of coresident thread blocks */
 78:   cudaGetDevice(&device);
 79:   cudaGetDeviceProperties(&props, device);
 80:   stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
 81:   if (props.major>=3) {
 82:     stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
 83:   } else {
 84:     stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
 85:   }

 87:   /* assign the indices */
 88:   cci->scatter = (VecScatterCUSPIndices_StoS)stos_scatter;
 89:   cci->scatterType = VEC_SCATTER_CUSP_STOS;
 90:   *ci = cci;
 91:   return(0);
 92: }

 96: PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
 97: {
 98:   PetscCUSPIndices           cci;
 99:   VecScatterCUSPIndices_PtoP ptop_scatter;

102:   cci = new struct _p_PetscCUSPIndices;
103:   ptop_scatter = new struct _p_VecScatterCUSPIndices_PtoP;

105:   /* this calculation assumes that the input indices are sorted */
106:   ptop_scatter->ns = sendIndices[ns-1]-sendIndices[0]+1;
107:   ptop_scatter->sendLowestIndex = sendIndices[0];
108:   ptop_scatter->nr = recvIndices[nr-1]-recvIndices[0]+1;
109:   ptop_scatter->recvLowestIndex = recvIndices[0];

111:   /* assign indices */
112:   cci->scatter = (VecScatterCUSPIndices_PtoP)ptop_scatter;
113:   cci->scatterType = VEC_SCATTER_CUSP_PTOP;

115:   *ci = cci;
116:   return(0);
117: }

/*
   VecScatterCUSPIndicesDestroy - free a PetscCUSPIndices object created by
   VecScatterCUSPIndicesCreate_StoS() or VecScatterCUSPIndicesCreate_PtoP(),
   releasing any GPU index arrays and the private stream.  Safe to call with
   a NULL pointer or a pointer to NULL; *ci is zeroed on success.
*/
PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  /* previously *ci was dereferenced before ci itself was null-checked */
  if (!ci || !(*ci)) return(0);
  try {
    if ((*ci)->scatterType == VEC_SCATTER_CUSP_PTOP) {
      delete (VecScatterCUSPIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err = cudaSuccess;
      VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)(*ci)->scatter;

      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUSP((int)err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUSP((int)err);
        stos_scatter->tslots = 0;
      }

      /* free the stream variable */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUSP((int)err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  return(0);
}

/* Insert operator: the incoming value overwrites the destination entry */
class Insert {
 public:
  __device__ PetscScalar operator() (PetscScalar incoming,PetscScalar current) const {
    (void)current; /* INSERT_VALUES ignores the existing destination value */
    return incoming;
  }
};

/* Add operator: the incoming value is accumulated into the destination entry */
class Add {
 public:
  __device__ PetscScalar operator() (PetscScalar incoming,PetscScalar current) const {
    const PetscScalar sum = incoming + current;
    return sum;
  }
};

/* Max operator: keep the larger of the incoming and destination values.
   PetscMax is not meaningful for complex scalars; in complex builds we fall
   back to insert semantics so the operator stays well-defined (previously
   the function fell off the end with no return value — undefined behavior).
   The host code never dispatches MAX_VALUES in complex builds (see
   VecScatterCUSP_StoS), so the fallback is unreachable in practice. */
class Max {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
#if !defined(PETSC_USE_COMPLEX)
    return PetscMax(a,b);
#else
    return a;
#endif
  }
};

/* Sequential general to sequential general GPU kernel:
   y[yind[i]] = OP(x[xind[i]], y[yind[i]]) for i in [0,n).
   Grid-stride loop, so any 1D launch configuration covers all n entries. */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSG_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int first  = blockIdx.x*blockDim.x + threadIdx.x;
  const int stride = gridDim.x*blockDim.x;
  for (int j = first; j < n; j += stride) {
    const PetscInt src = xind[j];
    const PetscInt dst = yind[j];
    y[dst] = OP(x[src],y[dst]);
  }
}

/* Sequential general to sequential strided GPU kernel:
   y[toFirst+i*toStep] = OP(x[xind[i]], y[toFirst+i*toStep]) for i in [0,n).
   Grid-stride loop, so any 1D launch configuration covers all n entries. */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSS_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int first  = blockIdx.x*blockDim.x + threadIdx.x;
  const int stride = gridDim.x*blockDim.x;
  for (int j = first; j < n; j += stride) {
    const PetscInt dst = toFirst + j*toStep;
    y[dst] = OP(x[xind[j]],y[dst]);
  }
}

/* Sequential strided to sequential strided GPU kernel:
   y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep], y[toFirst+i*toStep]).
   Grid-stride loop, so any 1D launch configuration covers all n entries. */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int first  = blockIdx.x*blockDim.x + threadIdx.x;
  const int stride = gridDim.x*blockDim.x;
  for (int j = first; j < n; j += stride) {
    const PetscInt src = fromFirst + j*fromStep;
    const PetscInt dst = toFirst + j*toStep;
    y[dst] = OP(x[src],y[dst]);
  }
}

/* Sequential strided to sequential general GPU kernel:
   y[yind[i]] = OP(x[fromFirst+i*fromStep], y[yind[i]]) for i in [0,n).
   Grid-stride loop, so any 1D launch configuration covers all n entries. */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int first  = blockIdx.x*blockDim.x + threadIdx.x;
  const int stride = gridDim.x*blockDim.x;
  for (int j = first; j < n; j += stride) {
    const PetscInt src = fromFirst + j*fromStep;
    const PetscInt dst = yind[j];
    y[dst] = OP(x[src],y[dst]);
  }
}

/* Select and launch the StoS scatter kernel matching the scatter direction
   and the (general vs strided) mode of each side.  In SCATTER_FORWARD the
   "from" description indexes xarray and the "to" description indexes yarray;
   in reverse mode the roles are swapped.  Kernels are enqueued on the
   scatter's private stream; the caller synchronizes (see VecScatterCUSP_StoS). */
template<class OPERATOR>
void VecScatterCUSP_StoS_Dispatcher(CUSPARRAY *xarray,CUSPARRAY *yarray,PetscCUSPIndices ci,ScatterMode mode,OPERATOR OP) {

  PetscInt                   nblocks=0,nthreads=128;
  VecScatterCUSPIndices_StoS scat = (VecScatterCUSPIndices_StoS)ci->scatter;

  /* enough blocks to cover n entries, clamped so they can be coresident;
     the grid-stride kernels tolerate any grid size */
  nblocks=(int)ceil(((float) scat->n)/((float) nthreads))+1;
  if (nblocks>scat->MAX_CORESIDENT_THREADS/nthreads) {
    nblocks = scat->MAX_CORESIDENT_THREADS/nthreads;
  }
  dim3 block(nthreads,1,1);
  dim3 grid(nblocks,1,1);

  PetscScalar *xptr = xarray->data().get();
  PetscScalar *yptr = yarray->data().get();

  if (mode == SCATTER_FORWARD) {
    if (scat->fromMode == VEC_SCATTER_CUSP_GENERAL && scat->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,scat->stream>>>(scat->n,scat->fslots,xptr,scat->tslots,yptr,OP);
    } else if (scat->fromMode == VEC_SCATTER_CUSP_GENERAL && scat->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,scat->stream>>>(scat->n,scat->fslots,xptr,scat->toFirst,scat->toStep,yptr,OP);
    } else if (scat->fromMode == VEC_SCATTER_CUSP_STRIDED && scat->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,scat->stream>>>(scat->n,scat->fromFirst,scat->fromStep,xptr,scat->toFirst,scat->toStep,yptr,OP);
    } else if (scat->fromMode == VEC_SCATTER_CUSP_STRIDED && scat->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,scat->stream>>>(scat->n,scat->fromFirst,scat->fromStep,xptr,scat->tslots,yptr,OP);
    }
  } else {
    /* reverse: read through the "to" description, write through "from" */
    if (scat->toMode == VEC_SCATTER_CUSP_GENERAL && scat->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,scat->stream>>>(scat->n,scat->tslots,xptr,scat->fslots,yptr,OP);
    } else if (scat->toMode == VEC_SCATTER_CUSP_GENERAL && scat->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,scat->stream>>>(scat->n,scat->tslots,xptr,scat->fromFirst,scat->fromStep,yptr,OP);
    } else if (scat->toMode == VEC_SCATTER_CUSP_STRIDED && scat->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,scat->stream>>>(scat->n,scat->toFirst,scat->toStep,xptr,scat->fromFirst,scat->fromStep,yptr,OP);
    } else if (scat->toMode == VEC_SCATTER_CUSP_STRIDED && scat->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,scat->stream>>>(scat->n,scat->toFirst,scat->toStep,xptr,scat->fslots,yptr,OP);
    }
  }
}

/*
   VecScatterCUSP_StoS - perform a sequential-to-sequential scatter between
   two CUSP vectors on the GPU.

   Input Parameters:
+  x    - source vector
.  y    - destination vector
.  ci   - indices created by VecScatterCUSPIndicesCreate_StoS()
.  addv - INSERT_VALUES, ADD_VALUES, or (real builds only) MAX_VALUES
-  mode - SCATTER_FORWARD or SCATTER_REVERSE

   Kernels run on the scatter's private stream, which is synchronized before
   returning so the result is visible to subsequent work.
*/
PetscErrorCode VecScatterCUSP_StoS(Vec x,Vec y,PetscCUSPIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *xarray,*yarray;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
  cudaError_t                err = cudaSuccess;

  /* previously ierr was declared but every PETSc return code was discarded */
  ierr = VecCUSPAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUSPAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
#if !defined(PETSC_USE_COMPLEX)
    else if (addv == MAX_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
#endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    /* cudaGetLastError catches launch-configuration failures; the stream
       synchronize surfaces asynchronous execution errors */
    err = cudaGetLastError();CHKERRCUSP((int)err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUSP((int)err);
  }
  ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}