Actual source code: vecscattercusp.cu
petsc-3.5.4 2015-05-23
1: /*
2: Implements the various scatter operations on cusp vectors
3: */
5: #include <petscconf.h>
6: PETSC_CUDA_EXTERN_C_BEGIN
7: #include <petsc-private/vecimpl.h> /*I "petscvec.h" I*/
8: #include <../src/vec/vec/impls/dvecimpl.h>
9: PETSC_CUDA_EXTERN_C_END
10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
12: #include <cuda_runtime.h>
/*
   VecScatterCUSPIndicesCreate_StoS - build the GPU-side index description for a
   sequential-to-sequential scatter.

   Input:
     n                   - number of entries to scatter
     toFirst,toStep      - first index and stride of the destination, when strided
     fromFirst,fromStep  - first index and stride of the source, when strided
     tslots,fslots       - explicit destination/source index lists (host memory),
                           copied to the GPU when non-NULL; an index list takes
                           precedence over the stride description for that side
   Output:
     ci                  - the created PetscCUSPIndices object

   Also creates a private CUDA stream for the scatter kernels and records the
   device's occupancy limits used to size kernel launches.
*/
PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep, PetscInt fromStep,PetscInt *tslots, PetscInt *fslots,PetscCUSPIndices *ci) {
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_StoS stos_scatter;
  cudaError_t                err = cudaSuccess;
  cudaStream_t               stream;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;

  cci = new struct _p_PetscCUSPIndices;
  stos_scatter = new struct _p_VecScatterCUSPIndices_StoS;

  /* create the "from" indices; default to a zero stride so the mode fields
     are never read uninitialized (the dispatcher only runs when n > 0, but
     initialize defensively for the degenerate cases) */
  stos_scatter->fslots    = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep  = 0;
  stos_scatter->fromMode  = VEC_SCATTER_CUSP_STRIDED;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots and upload them */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->fslots = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep  = fromStep;
      stos_scatter->fromMode  = VEC_SCATTER_CUSP_STRIDED;
    }
  }

  /* create the "to" indices, mirroring the "from" side */
  stos_scatter->tslots  = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep  = 0;
  stos_scatter->toMode  = VEC_SCATTER_CUSP_STRIDED;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots and upload them */
      err = cudaMalloc((void **)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep  = toStep;
      stos_scatter->toMode  = VEC_SCATTER_CUSP_STRIDED;
    }
  }

  /* allocate the stream variable; the scatter owns this stream and it is
     destroyed in VecScatterCUSPIndicesDestroy() */
  err = cudaStreamCreate(&stream);CHKERRCUSP((int)err);
  stos_scatter->stream = stream;

  /* the number of indices */
  stos_scatter->n = n;

  /* get the maximum number of co-resident thread blocks; these calls were
     previously unchecked -- check them like every other CUDA call here */
  err = cudaGetDevice(&device);CHKERRCUSP((int)err);
  err = cudaGetDeviceProperties(&props, device);CHKERRCUSP((int)err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  /* Kepler (SM 3.x) and newer allow 16 resident blocks per SM, older parts 8 */
  if (props.major>=3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter = (VecScatterCUSPIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_STOS;
  *ci = cci;
  return(0);
}
/*
   VecScatterCUSPIndicesCreate_PtoP - build the index description for a
   parallel (point-to-point) scatter.

   Input:
     ns,sendIndices - number of send indices and the (sorted) index list
     nr,recvIndices - number of receive indices and the (sorted) index list
   Output:
     ci             - the created PetscCUSPIndices object

   Only the contiguous span covered by each sorted list is recorded
   (lowest index and span length), not the individual indices.
*/
PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_PtoP ptop_scatter;

  cci = new struct _p_PetscCUSPIndices;
  ptop_scatter = new struct _p_VecScatterCUSPIndices_PtoP;

  /* this calculation assumes that the input indices are sorted;
     guard the empty case so we never read sendIndices[-1]/recvIndices[-1] */
  if (ns) {
    ptop_scatter->ns              = sendIndices[ns-1]-sendIndices[0]+1;
    ptop_scatter->sendLowestIndex = sendIndices[0];
  } else {
    ptop_scatter->ns              = 0;
    ptop_scatter->sendLowestIndex = 0;
  }
  if (nr) {
    ptop_scatter->nr              = recvIndices[nr-1]-recvIndices[0]+1;
    ptop_scatter->recvLowestIndex = recvIndices[0];
  } else {
    ptop_scatter->nr              = 0;
    ptop_scatter->recvLowestIndex = 0;
  }

  /* assign indices */
  cci->scatter = (VecScatterCUSPIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_PTOP;

  *ci = cci;
  return(0);
}
/*
   VecScatterCUSPIndicesDestroy - free a PetscCUSPIndices object created by one
   of the VecScatterCUSPIndicesCreate_* routines, releasing any GPU index
   arrays and the CUDA stream it owns.

   A NULL pointer, or a pointer to a NULL object, is a no-op.

   Fix: the original tested *ci before testing ci itself, so the ci NULL check
   further down was dead code and a NULL handle pointer would crash.
*/
PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  if (!ci || !(*ci)) return(0);
  try {
    if ((*ci)->scatterType == VEC_SCATTER_CUSP_PTOP) {
      /* PtoP scatters hold no GPU resources, only the struct itself */
      delete (VecScatterCUSPIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err = cudaSuccess;
      VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)(*ci)->scatter;
      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUSP((int)err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUSP((int)err);
        stos_scatter->tslots = 0;
      }

      /* destroy the stream owned by this scatter */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUSP((int)err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  return(0);
}
/* Insert operator: implements INSERT_VALUES -- the scattered value simply
   overwrites the destination, whose previous value is ignored */
class Insert {
  public:
  __device__ PetscScalar operator() (PetscScalar src,PetscScalar dst) const {
    (void)dst; /* unused by design */
    return src;
  }
};
/* Add operator: implements ADD_VALUES -- the scattered value is accumulated
   into the destination */
class Add {
  public:
  __device__ PetscScalar operator() (PetscScalar src,PetscScalar dst) const {
    return dst + src;
  }
};
/* Max operator: implements MAX_VALUES -- the destination keeps the larger of
   the two values. MAX_VALUES is rejected before dispatch for complex builds
   (see VecScatterCUSP_StoS), so the complex branch is never executed; it only
   needs a defined return value so the function does not fall off the end
   (the original returned nothing there, which is undefined behavior). */
class Max {
  public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
#if !defined(PETSC_USE_COMPLEX)
    return PetscMax(a,b);
#else
    return a; /* unreachable: MAX_VALUES is unsupported for complex scalars */
#endif
  }
};
/* Sequential general to sequential general GPU kernel: y[yind[i]] gets
   OP(x[xind[i]], y[yind[i]]). A grid-stride loop makes any launch
   configuration cover all n entries. */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSG_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int stride = blockDim.x*gridDim.x;
  for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < n; i += stride) {
    const PetscInt src = xind[i];
    const PetscInt dst = yind[i];
    y[dst] = OP(x[src],y[dst]);
  }
}
/* Sequential general to sequential strided GPU kernel: the source is gathered
   through the index list xind, the destination is addressed by a stride.
   Grid-stride loop covers all n entries for any launch configuration. */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSS_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int stride = blockDim.x*gridDim.x;
  for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < n; i += stride) {
    const PetscInt dst = toFirst + i*toStep;
    y[dst] = OP(x[xind[i]],y[dst]);
  }
}
/* Sequential strided to sequential strided GPU kernel: both source and
   destination are addressed by (first, step) stride descriptions.
   Grid-stride loop covers all n entries for any launch configuration. */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int stride = blockDim.x*gridDim.x;
  for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < n; i += stride) {
    const PetscInt src = fromFirst + i*fromStep;
    const PetscInt dst = toFirst   + i*toStep;
    y[dst] = OP(x[src],y[dst]);
  }
}
/* Sequential strided to sequential general GPU kernel: the source is
   addressed by a stride, the destination is scattered through the index
   list yind. Grid-stride loop covers all n entries for any launch
   configuration. */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int stride = blockDim.x*gridDim.x;
  for (int i = blockDim.x*blockIdx.x + threadIdx.x; i < n; i += stride) {
    const PetscInt dst = yind[i];
    y[dst] = OP(x[fromFirst + i*fromStep],y[dst]);
  }
}
/* Dispatch a sequential scatter to the GPU kernel matching the (from,to)
   index representations (general index list vs. stride description).

   xarray/yarray - source and destination CUSP arrays (device data)
   ci            - indices built by VecScatterCUSPIndicesCreate_StoS
   mode          - SCATTER_FORWARD applies the from->to mapping as stored;
                   any other mode runs the reverse direction, which is
                   realized below by swapping the roles of the to/from
                   descriptions (the "to" indices then address x and the
                   "from" indices address y)
   OP            - device functor combining source and destination values
                   (Insert/Add/Max)

   Kernels are launched on the scatter's private stream; the caller is
   responsible for synchronizing (see VecScatterCUSP_StoS). */
template<class OPERATOR>
void VecScatterCUSP_StoS_Dispatcher(CUSPARRAY *xarray,CUSPARRAY *yarray,PetscCUSPIndices ci,ScatterMode mode,OPERATOR OP) {
  PetscInt nBlocks=0,nThreads=128;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;

  /* one block beyond ceil(n/nThreads); the kernels use grid-stride loops so
     any positive block count is correct -- this only affects performance */
  nBlocks=(int)ceil(((float) stos_scatter->n)/((float) nThreads))+1;
  /* cap the grid so all launched threads can be co-resident on the device */
  if (nBlocks>stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    /* forward: read x through the "from" description, write y through "to" */
    if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    }
  } else {
    /* reverse: the "to" description now addresses x and "from" addresses y */
    if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    }
  }
}
/*
   VecScatterCUSP_StoS - perform a sequential-to-sequential scatter between two
   CUSP vectors on the GPU.

   Input:
     x    - source vector
     y    - destination vector
     ci   - indices built by VecScatterCUSPIndicesCreate_StoS
     addv - INSERT_VALUES, ADD_VALUES, or (real scalars only) MAX_VALUES
     mode - SCATTER_FORWARD or the reverse direction

   Fix: the original declared ierr but never used it, silently dropping the
   PetscErrorCode returned by every VecCUSP* call; each is now checked with
   CHKERRQ per PETSc convention.
*/
PetscErrorCode VecScatterCUSP_StoS(Vec x,Vec y,PetscCUSPIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *xarray,*yarray;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
  cudaError_t                err = cudaSuccess;

  ierr = VecCUSPAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUSPAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
#if !defined(PETSC_USE_COMPLEX)
    else if (addv == MAX_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
#endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    /* catch launch-configuration errors, then wait for the async kernel on
       the scatter's private stream to finish before releasing the arrays */
    err = cudaGetLastError();CHKERRCUSP((int)err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUSP((int)err);
  }
  ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}