Actual source code: veccuda2.cu
petsc-master 2020-08-25
1: /*
2: Implements the sequential CUDA vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
8: #include <petscconf.h>
9: #include <petsc/private/vecimpl.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/cudavecimpl.h>
13: #include <cuda_runtime.h>
14: #include <thrust/device_ptr.h>
15: #include <thrust/transform.h>
16: #include <thrust/functional.h>
18: /*
19: Allocates space for the vector array on the GPU if it does not exist.
20: Does NOT change the PetscCUDAFlag for the vector
21: Does NOT zero the CUDA array
23: */
24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
25: {
27: cudaError_t err;
28: Vec_CUDA *veccuda;
29: PetscBool option_set;
32: if (!v->spptr) {
33: PetscReal pinned_memory_min;
34: PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
35: veccuda = (Vec_CUDA*)v->spptr;
36: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
37: veccuda->GPUarray = veccuda->GPUarray_allocated;
38: veccuda->stream = 0; /* using default stream */
39: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
40: if (v->data && ((Vec_Seq*)v->data)->array) {
41: v->offloadmask = PETSC_OFFLOAD_CPU;
42: } else {
43: v->offloadmask = PETSC_OFFLOAD_GPU;
44: }
45: }
46: pinned_memory_min = 0;
48: /* Parse the command line here for the minimum size at which host allocations should use pinned memory.
49: Note: This same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
50: PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
51: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
52: if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
53: PetscOptionsEnd();
54: }
55: return(0);
56: }
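/* A minimal usage sketch for the pinned-memory threshold handled above (the program name,
   the vector v, and the value 100000 are illustrative, not from this file):

     ./app -vec_type seqcuda -vec_pinned_memory_min 100000

   or, equivalently in code:

     VecSetPinnedMemoryMin(v,100000);
*/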
58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
59: PetscErrorCode VecCUDACopyToGPU(Vec v)
60: {
62: cudaError_t err;
63: Vec_CUDA *veccuda;
64: PetscScalar *varray;
68: VecCUDAAllocateCheck(v);
69: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
70: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
71: veccuda = (Vec_CUDA*)v->spptr;
72: varray = veccuda->GPUarray;
73: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
74: PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
75: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
76: v->offloadmask = PETSC_OFFLOAD_BOTH;
77: }
78: return(0);
79: }
81: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
82: {
83: PetscScalar *varray;
85: cudaError_t err;
86: PetscScalar *cpuPtr, *gpuPtr;
87: Vec_Seq *s;
88: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
89: PetscInt lowestIndex,n;
93: VecCUDAAllocateCheck(v);
94: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
95: s = (Vec_Seq*)v->data;
96: if (mode & SCATTER_REVERSE) {
97: lowestIndex = ptop_scatter->sendLowestIndex;
98: n = ptop_scatter->ns;
99: } else {
100: lowestIndex = ptop_scatter->recvLowestIndex;
101: n = ptop_scatter->nr;
102: }
104: PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
105: varray = ((Vec_CUDA*)v->spptr)->GPUarray;
106: gpuPtr = varray + lowestIndex;
107: cpuPtr = s->array + lowestIndex;
109: /* Note : this code copies the smallest contiguous chunk of data
110: containing ALL of the indices */
111: err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
112: PetscLogCpuToGpu(n*sizeof(PetscScalar));
114: /* Set the buffer states */
115: v->offloadmask = PETSC_OFFLOAD_BOTH;
116: PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
117: }
118: return(0);
119: }
122: /*
123: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
124: */
125: PetscErrorCode VecCUDACopyFromGPU(Vec v)
126: {
128: cudaError_t err;
129: Vec_CUDA *veccuda;
130: PetscScalar *varray;
134: VecCUDAAllocateCheckHost(v);
135: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
136: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
137: veccuda = (Vec_CUDA*)v->spptr;
138: varray = veccuda->GPUarray;
139: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
140: PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
141: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
142: v->offloadmask = PETSC_OFFLOAD_BOTH;
143: }
144: return(0);
145: }
147: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
148: which means that we need to recombine the data at some point before using any of the standard functions.
149: We could add another few flag types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
150: where the calls must always come in pairs.
151: */
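/* For reference, the paired pattern mentioned above looks like this (a sketch with a
   generic vector v, not specific to the scatter machinery below):

     PetscScalar *a;
     VecGetArray(v,&a);
     ... read or modify a[] on the host ...
     VecRestoreArray(v,&a);

   Every VecGetArray() must be matched by a VecRestoreArray() before the vector is used
   by other operations. */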
152: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
153: {
154: const PetscScalar *varray, *gpuPtr;
155: PetscErrorCode ierr;
156: cudaError_t err;
157: PetscScalar *cpuPtr;
158: Vec_Seq *s;
159: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
160: PetscInt lowestIndex,n;
164: VecCUDAAllocateCheckHost(v);
165: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
166: PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
167: if (mode & SCATTER_REVERSE) {
168: lowestIndex = ptop_scatter->recvLowestIndex;
169: n = ptop_scatter->nr;
170: } else {
171: lowestIndex = ptop_scatter->sendLowestIndex;
172: n = ptop_scatter->ns;
173: }
175: varray=((Vec_CUDA*)v->spptr)->GPUarray;
176: s = (Vec_Seq*)v->data;
177: gpuPtr = varray + lowestIndex;
178: cpuPtr = s->array + lowestIndex;
180: /* Note : this code copies the smallest contiguous chunk of data
181: containing ALL of the indices */
182: err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
183: PetscLogGpuToCpu(n*sizeof(PetscScalar));
185: VecCUDARestoreArrayRead(v,&varray);
186: PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
187: }
188: return(0);
189: }
191: /*MC
192: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
194: Options Database Keys:
195: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
197: Level: beginner
199: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
200: M*/
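/* A minimal creation sketch for this vector type (error checking omitted; the names v and n
   are illustrative):

     Vec v;
     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,n,n);
     VecSetType(v,VECSEQCUDA);   // or VecSetFromOptions(v) combined with -vec_type seqcuda
     VecSet(v,1.0);
     VecDestroy(&v);
*/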
202: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
203: {
204: const PetscScalar *xarray;
205: PetscScalar *yarray;
206: PetscErrorCode ierr;
207: PetscBLASInt one=1,bn;
208: PetscScalar sone=1.0;
209: cublasHandle_t cublasv2handle;
210: cublasStatus_t cberr;
211: cudaError_t err;
214: PetscCUBLASGetHandle(&cublasv2handle);
215: PetscBLASIntCast(yin->map->n,&bn);
216: VecCUDAGetArrayRead(xin,&xarray);
217: VecCUDAGetArray(yin,&yarray);
218: PetscLogGpuTimeBegin();
219: if (alpha == (PetscScalar)0.0) {
220: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
221: } else if (alpha == (PetscScalar)1.0) {
222: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
223: PetscLogGpuFlops(1.0*yin->map->n);
224: } else {
225: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
226: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
227: PetscLogGpuFlops(2.0*yin->map->n);
228: }
229: err = WaitForCUDA();CHKERRCUDA(err);
230: PetscLogGpuTimeEnd();
231: VecCUDARestoreArrayRead(xin,&xarray);
232: VecCUDARestoreArray(yin,&yarray);
233: return(0);
234: }
236: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
237: {
238: const PetscScalar *xarray;
239: PetscScalar *yarray;
240: PetscErrorCode ierr;
241: PetscBLASInt one=1,bn;
242: cublasHandle_t cublasv2handle;
243: cublasStatus_t cberr;
244: PetscBool yiscuda,xiscuda;
245: cudaError_t err;
248: if (alpha == (PetscScalar)0.0) return(0);
249: PetscCUBLASGetHandle(&cublasv2handle);
250: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
251: PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
252: if (xiscuda && yiscuda) {
253: PetscBLASIntCast(yin->map->n,&bn);
254: VecCUDAGetArrayRead(xin,&xarray);
255: VecCUDAGetArray(yin,&yarray);
256: PetscLogGpuTimeBegin();
257: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
258: err = WaitForCUDA();CHKERRCUDA(err);
259: PetscLogGpuTimeEnd();
260: VecCUDARestoreArrayRead(xin,&xarray);
261: VecCUDARestoreArray(yin,&yarray);
262: PetscLogGpuFlops(2.0*yin->map->n);
263: } else {
264: VecAXPY_Seq(yin,alpha,xin);
265: }
266: return(0);
267: }
269: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
270: {
271: PetscInt n = xin->map->n;
272: const PetscScalar *xarray=NULL,*yarray=NULL;
273: PetscScalar *warray=NULL;
274: thrust::device_ptr<const PetscScalar> xptr,yptr;
275: thrust::device_ptr<PetscScalar> wptr;
276: PetscErrorCode ierr;
277: cudaError_t err;
280: VecCUDAGetArrayWrite(win,&warray);
281: VecCUDAGetArrayRead(xin,&xarray);
282: VecCUDAGetArrayRead(yin,&yarray);
283: PetscLogGpuTimeBegin();
284: try {
285: wptr = thrust::device_pointer_cast(warray);
286: xptr = thrust::device_pointer_cast(xarray);
287: yptr = thrust::device_pointer_cast(yarray);
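/* elementwise division: w[i] = x[i] / y[i] for i = 0,...,n-1 */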
288: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
289: err = WaitForCUDA();CHKERRCUDA(err);
290: } catch (char *ex) {
291: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
292: }
293: PetscLogGpuTimeEnd();
294: PetscLogGpuFlops(n);
295: VecCUDARestoreArrayRead(xin,&xarray);
296: VecCUDARestoreArrayRead(yin,&yarray);
297: VecCUDARestoreArrayWrite(win,&warray);
298: return(0);
299: }
301: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
302: {
303: const PetscScalar *xarray=NULL,*yarray=NULL;
304: PetscScalar *warray=NULL;
305: PetscErrorCode ierr;
306: PetscBLASInt one=1,bn;
307: cublasHandle_t cublasv2handle;
308: cublasStatus_t cberr;
309: cudaError_t err;
312: PetscCUBLASGetHandle(&cublasv2handle);
313: PetscBLASIntCast(win->map->n,&bn);
314: if (alpha == (PetscScalar)0.0) {
315: VecCopy_SeqCUDA(yin,win);
316: } else {
317: VecCUDAGetArrayRead(xin,&xarray);
318: VecCUDAGetArrayRead(yin,&yarray);
319: VecCUDAGetArrayWrite(win,&warray);
320: PetscLogGpuTimeBegin();
321: err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
322: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
323: err = WaitForCUDA();CHKERRCUDA(err);
324: PetscLogGpuTimeEnd();
325: PetscLogGpuFlops(2*win->map->n);
326: VecCUDARestoreArrayRead(xin,&xarray);
327: VecCUDARestoreArrayRead(yin,&yarray);
328: VecCUDARestoreArrayWrite(win,&warray);
329: }
330: return(0);
331: }
333: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
334: {
336: cudaError_t err;
337: PetscInt n = xin->map->n,j,j_rem;
338: PetscScalar alpha0,alpha1,alpha2,alpha3;
341: PetscLogGpuFlops(nv*2.0*n);
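/* nv & 0x3 is nv modulo 4: peel off the remainder first (e.g. nv = 7 gives j_rem = 3, so three AXPYs here), then the loop below processes the remaining vectors four at a time */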
342: switch (j_rem=nv&0x3) {
343: case 3:
344: alpha0 = alpha[0];
345: alpha1 = alpha[1];
346: alpha2 = alpha[2];
347: alpha += 3;
348: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
349: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
350: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
351: y += 3;
352: break;
353: case 2:
354: alpha0 = alpha[0];
355: alpha1 = alpha[1];
356: alpha +=2;
357: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
358: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
359: y +=2;
360: break;
361: case 1:
362: alpha0 = *alpha++;
363: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
364: y +=1;
365: break;
366: }
367: for (j=j_rem; j<nv; j+=4) {
368: alpha0 = alpha[0];
369: alpha1 = alpha[1];
370: alpha2 = alpha[2];
371: alpha3 = alpha[3];
372: alpha += 4;
373: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
374: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
375: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
376: VecAXPY_SeqCUDA(xin,alpha3,y[3]);
377: y += 4;
378: }
379: err = WaitForCUDA();CHKERRCUDA(err);
380: return(0);
381: }
383: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
384: {
385: const PetscScalar *xarray,*yarray;
386: PetscErrorCode ierr;
387: PetscBLASInt one=1,bn;
388: cublasHandle_t cublasv2handle;
389: cublasStatus_t cerr;
392: PetscCUBLASGetHandle(&cublasv2handle);
393: PetscBLASIntCast(yin->map->n,&bn);
394: VecCUDAGetArrayRead(xin,&xarray);
395: VecCUDAGetArrayRead(yin,&yarray);
396: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
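/* i.e. PETSc's z = sum_i x[i]*PetscConj(y[i]); in the complex case cublasXdot(handle,n,a,1,b,1,z) returns sum_i PetscConj(a[i])*b[i], so passing (yarray,xarray) reproduces the PETSc convention */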
397: PetscLogGpuTimeBegin();
398: cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
399: PetscLogGpuTimeEnd();
400: if (xin->map->n >0) {
401: PetscLogGpuFlops(2.0*xin->map->n-1);
402: }
403: VecCUDARestoreArrayRead(xin,&xarray);
404: VecCUDARestoreArrayRead(yin,&yarray);
405: return(0);
406: }
408: //
409: // CUDA kernels for MDot to follow
410: //
412: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
413: #define MDOT_WORKGROUP_SIZE 128
414: #define MDOT_WORKGROUP_NUM 128
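// with these settings the largest kernel below (eight dot products at once) uses a shared-memory buffer of
// 8*MDOT_WORKGROUP_SIZE scalars per block (8 KB with 8-byte real PetscScalar) and the host-side driver keeps a
// scratch array of 8*MDOT_WORKGROUP_NUM scalars for the per-block partial results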
416: #if !defined(PETSC_USE_COMPLEX)
417: // M = 2:
418: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
419: PetscInt size, PetscScalar *group_results)
420: {
421: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
422: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
423: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
424: PetscInt vec_start_index = blockIdx.x * entries_per_group;
425: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
427: PetscScalar entry_x = 0;
428: PetscScalar group_sum0 = 0;
429: PetscScalar group_sum1 = 0;
430: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
431: entry_x = x[i]; // load only once from global memory!
432: group_sum0 += entry_x * y0[i];
433: group_sum1 += entry_x * y1[i];
434: }
435: tmp_buffer[threadIdx.x] = group_sum0;
436: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
438: // parallel reduction
439: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
440: __syncthreads();
441: if (threadIdx.x < stride) {
442: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
443: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
444: }
445: }
447: // write result of group to group_results
448: if (threadIdx.x == 0) {
449: group_results[blockIdx.x] = tmp_buffer[0];
450: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
451: }
452: }
454: // M = 3:
455: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
456: PetscInt size, PetscScalar *group_results)
457: {
458: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
459: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
460: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
461: PetscInt vec_start_index = blockIdx.x * entries_per_group;
462: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
464: PetscScalar entry_x = 0;
465: PetscScalar group_sum0 = 0;
466: PetscScalar group_sum1 = 0;
467: PetscScalar group_sum2 = 0;
468: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
469: entry_x = x[i]; // load only once from global memory!
470: group_sum0 += entry_x * y0[i];
471: group_sum1 += entry_x * y1[i];
472: group_sum2 += entry_x * y2[i];
473: }
474: tmp_buffer[threadIdx.x] = group_sum0;
475: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
476: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
478: // parallel reduction
479: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
480: __syncthreads();
481: if (threadIdx.x < stride) {
482: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
483: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
484: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
485: }
486: }
488: // write result of group to group_results
489: if (threadIdx.x == 0) {
490: group_results[blockIdx.x ] = tmp_buffer[0];
491: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
492: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
493: }
494: }
496: // M = 4:
497: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
498: PetscInt size, PetscScalar *group_results)
499: {
500: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
501: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
502: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
503: PetscInt vec_start_index = blockIdx.x * entries_per_group;
504: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
506: PetscScalar entry_x = 0;
507: PetscScalar group_sum0 = 0;
508: PetscScalar group_sum1 = 0;
509: PetscScalar group_sum2 = 0;
510: PetscScalar group_sum3 = 0;
511: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
512: entry_x = x[i]; // load only once from global memory!
513: group_sum0 += entry_x * y0[i];
514: group_sum1 += entry_x * y1[i];
515: group_sum2 += entry_x * y2[i];
516: group_sum3 += entry_x * y3[i];
517: }
518: tmp_buffer[threadIdx.x] = group_sum0;
519: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
520: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
521: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
523: // parallel reduction
524: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
525: __syncthreads();
526: if (threadIdx.x < stride) {
527: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
528: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
529: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
530: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
531: }
532: }
534: // write result of group to group_results
535: if (threadIdx.x == 0) {
536: group_results[blockIdx.x ] = tmp_buffer[0];
537: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
538: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
539: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
540: }
541: }
543: // M = 8:
544: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
545: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
546: PetscInt size, PetscScalar *group_results)
547: {
548: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
549: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
550: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
551: PetscInt vec_start_index = blockIdx.x * entries_per_group;
552: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
554: PetscScalar entry_x = 0;
555: PetscScalar group_sum0 = 0;
556: PetscScalar group_sum1 = 0;
557: PetscScalar group_sum2 = 0;
558: PetscScalar group_sum3 = 0;
559: PetscScalar group_sum4 = 0;
560: PetscScalar group_sum5 = 0;
561: PetscScalar group_sum6 = 0;
562: PetscScalar group_sum7 = 0;
563: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
564: entry_x = x[i]; // load only once from global memory!
565: group_sum0 += entry_x * y0[i];
566: group_sum1 += entry_x * y1[i];
567: group_sum2 += entry_x * y2[i];
568: group_sum3 += entry_x * y3[i];
569: group_sum4 += entry_x * y4[i];
570: group_sum5 += entry_x * y5[i];
571: group_sum6 += entry_x * y6[i];
572: group_sum7 += entry_x * y7[i];
573: }
574: tmp_buffer[threadIdx.x] = group_sum0;
575: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
576: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
577: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
578: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
579: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
580: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
581: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
583: // parallel reduction
584: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
585: __syncthreads();
586: if (threadIdx.x < stride) {
587: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
588: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
589: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
590: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
591: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
592: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
593: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
594: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
595: }
596: }
598: // write result of group to group_results
599: if (threadIdx.x == 0) {
600: group_results[blockIdx.x ] = tmp_buffer[0];
601: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
602: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
603: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
604: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
605: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
606: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
607: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
608: }
609: }
610: #endif /* !defined(PETSC_USE_COMPLEX) */
612: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
613: {
614: PetscErrorCode ierr;
615: PetscInt i,n = xin->map->n,current_y_index = 0;
616: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
617: PetscScalar *group_results_gpu;
618: #if !defined(PETSC_USE_COMPLEX)
619: PetscInt j;
620: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
621: #endif
622: cudaError_t cuda_ierr;
623: PetscBLASInt one=1,bn;
624: cublasHandle_t cublasv2handle;
625: cublasStatus_t cberr;
628: PetscCUBLASGetHandle(&cublasv2handle);
629: PetscBLASIntCast(xin->map->n,&bn);
630: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
631: /* Handle the case of local size zero first */
632: if (!xin->map->n) {
633: for (i=0; i<nv; ++i) z[i] = 0;
634: return(0);
635: }
637: // allocate scratchpad memory for the results of individual work groups:
638: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
640: VecCUDAGetArrayRead(xin,&xptr);
642: while (current_y_index < nv)
643: {
644: switch (nv - current_y_index) {
646: case 7:
647: case 6:
648: case 5:
649: case 4:
650: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
651: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
652: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
653: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
654: PetscLogGpuTimeBegin();
655: #if defined(PETSC_USE_COMPLEX)
656: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
657: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
658: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
659: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
660: PetscLogGpuTimeEnd();
661: #else
662: // run kernel:
663: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
664: PetscLogGpuTimeEnd();
666: // copy results back to the host
667: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
669: // sum group results into z:
670: for (j=0; j<4; ++j) {
671: z[current_y_index + j] = 0;
672: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
673: }
674: #endif
675: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
676: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
677: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
678: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
679: current_y_index += 4;
680: break;
682: case 3:
683: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
684: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
685: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
687: PetscLogGpuTimeBegin();
688: #if defined(PETSC_USE_COMPLEX)
689: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
690: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
691: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
692: PetscLogGpuTimeEnd();
693: #else
694: // run kernel:
695: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
696: PetscLogGpuTimeEnd();
698: // copy results back to the host
699: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
701: // sum group results into z:
702: for (j=0; j<3; ++j) {
703: z[current_y_index + j] = 0;
704: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
705: }
706: #endif
707: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
708: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
709: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
710: current_y_index += 3;
711: break;
713: case 2:
714: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
715: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
716: PetscLogGpuTimeBegin();
717: #if defined(PETSC_USE_COMPLEX)
718: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
719: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
720: PetscLogGpuTimeEnd();
721: #else
722: // run kernel:
723: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
724: PetscLogGpuTimeEnd();
726: // copy results back to the host
727: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
729: // sum group results into z:
730: for (j=0; j<2; ++j) {
731: z[current_y_index + j] = 0;
732: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
733: }
734: #endif
735: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
736: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
737: current_y_index += 2;
738: break;
740: case 1:
741: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
742: PetscLogGpuTimeBegin();
743: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
744: PetscLogGpuTimeEnd();
745: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
746: current_y_index += 1;
747: break;
749: default: // 8 or more vectors left
750: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
751: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
752: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
753: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
754: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
755: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
756: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
757: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
758: PetscLogGpuTimeBegin();
759: #if defined(PETSC_USE_COMPLEX)
760: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
761: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
762: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
763: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
764: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
765: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
766: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
767: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
768: PetscLogGpuTimeEnd();
769: #else
770: // run kernel:
771: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
772: PetscLogGpuTimeEnd();
774: // copy results back to the host
775: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
777: // sum group results into z:
778: for (j=0; j<8; ++j) {
779: z[current_y_index + j] = 0;
780: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
781: }
782: #endif
783: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
784: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
785: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
786: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
787: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
788: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
789: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
790: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
791: current_y_index += 8;
792: break;
793: }
794: }
795: VecCUDARestoreArrayRead(xin,&xptr);
797: cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
798: PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
799: return(0);
800: }
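/* Usage sketch (illustrative names; x and the entries of y are CUDA vectors of the same length):

     PetscScalar z[3];
     VecMDot(x,3,y,z);   // z[i] = (x, y[i]) for i = 0,1,2

   compared with three separate VecDot() calls, the grouped kernels above read x from global
   memory only once per group of vectors. */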
802: #undef MDOT_WORKGROUP_SIZE
803: #undef MDOT_WORKGROUP_NUM
805: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
806: {
807: PetscInt n = xin->map->n;
808: PetscScalar *xarray=NULL;
809: thrust::device_ptr<PetscScalar> xptr;
810: PetscErrorCode ierr;
811: cudaError_t err;
814: VecCUDAGetArrayWrite(xin,&xarray);
815: PetscLogGpuTimeBegin();
816: if (alpha == (PetscScalar)0.0) {
817: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
818: } else {
819: try {
820: xptr = thrust::device_pointer_cast(xarray);
821: thrust::fill(xptr,xptr+n,alpha);
822: } catch (char *ex) {
823: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
824: }
825: }
826: err = WaitForCUDA();CHKERRCUDA(err);
827: PetscLogGpuTimeEnd();
828: VecCUDARestoreArrayWrite(xin,&xarray);
829: return(0);
830: }
832: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
833: {
834: PetscScalar *xarray;
836: PetscBLASInt one=1,bn;
837: cublasHandle_t cublasv2handle;
838: cublasStatus_t cberr;
839: cudaError_t err;
842: if (alpha == (PetscScalar)0.0) {
843: VecSet_SeqCUDA(xin,alpha);
844: err = WaitForCUDA();CHKERRCUDA(err);
845: } else if (alpha != (PetscScalar)1.0) {
846: PetscCUBLASGetHandle(&cublasv2handle);
847: PetscBLASIntCast(xin->map->n,&bn);
848: VecCUDAGetArray(xin,&xarray);
849: PetscLogGpuTimeBegin();
850: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
851: VecCUDARestoreArray(xin,&xarray);
852: err = WaitForCUDA();CHKERRCUDA(err);
853: PetscLogGpuTimeEnd();
854: }
855: PetscLogGpuFlops(xin->map->n);
856: return(0);
857: }
859: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
860: {
861: const PetscScalar *xarray,*yarray;
862: PetscErrorCode ierr;
863: PetscBLASInt one=1,bn;
864: cublasHandle_t cublasv2handle;
865: cublasStatus_t cerr;
868: PetscCUBLASGetHandle(&cublasv2handle);
869: PetscBLASIntCast(xin->map->n,&bn);
870: VecCUDAGetArrayRead(xin,&xarray);
871: VecCUDAGetArrayRead(yin,&yarray);
872: PetscLogGpuTimeBegin();
873: cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
874: PetscLogGpuTimeEnd();
875: if (xin->map->n > 0) {
876: PetscLogGpuFlops(2.0*xin->map->n-1);
877: }
878: VecCUDARestoreArrayRead(yin,&yarray);
879: VecCUDARestoreArrayRead(xin,&xarray);
880: return(0);
881: }
883: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
884: {
885: const PetscScalar *xarray;
886: PetscScalar *yarray;
887: PetscErrorCode ierr;
888: cudaError_t err;
891: if (xin != yin) {
892: if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
893: PetscBool yiscuda;
895: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
896: VecCUDAGetArrayRead(xin,&xarray);
897: if (yiscuda) {
898: VecCUDAGetArrayWrite(yin,&yarray);
899: } else {
900: VecGetArrayWrite(yin,&yarray);
901: }
902: PetscLogGpuTimeBegin();
903: if (yiscuda) {
904: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
905: } else {
906: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
907: }
908: err = WaitForCUDA();CHKERRCUDA(err);
909: PetscLogGpuTimeEnd();
910: VecCUDARestoreArrayRead(xin,&xarray);
911: if (yiscuda) {
912: VecCUDARestoreArrayWrite(yin,&yarray);
913: } else {
914: VecRestoreArrayWrite(yin,&yarray);
915: }
916: } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
917: /* copy in CPU if we are on the CPU */
918: VecCopy_SeqCUDA_Private(xin,yin);
919: } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
920: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
921: if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
922: /* copy in CPU */
923: VecCopy_SeqCUDA_Private(xin,yin);
924: } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
925: /* copy in GPU */
926: VecCUDAGetArrayRead(xin,&xarray);
927: VecCUDAGetArrayWrite(yin,&yarray);
928: PetscLogGpuTimeBegin();
929: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
930: PetscLogGpuTimeEnd();
931: VecCUDARestoreArrayRead(xin,&xarray);
932: VecCUDARestoreArrayWrite(yin,&yarray);
933: } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
934: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
935: default to copy in GPU (this is an arbitrary choice) */
936: VecCUDAGetArrayRead(xin,&xarray);
937: VecCUDAGetArrayWrite(yin,&yarray);
938: PetscLogGpuTimeBegin();
939: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
940: PetscLogGpuTimeEnd();
941: VecCUDARestoreArrayRead(xin,&xarray);
942: VecCUDARestoreArrayWrite(yin,&yarray);
943: } else {
944: VecCopy_SeqCUDA_Private(xin,yin);
945: }
946: }
947: }
948: return(0);
949: }
951: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
952: {
954: PetscBLASInt one = 1,bn;
955: PetscScalar *xarray,*yarray;
956: cublasHandle_t cublasv2handle;
957: cublasStatus_t cberr;
958: cudaError_t err;
961: PetscCUBLASGetHandle(&cublasv2handle);
962: PetscBLASIntCast(xin->map->n,&bn);
963: if (xin != yin) {
964: VecCUDAGetArray(xin,&xarray);
965: VecCUDAGetArray(yin,&yarray);
966: PetscLogGpuTimeBegin();
967: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
968: err = WaitForCUDA();CHKERRCUDA(err);
969: PetscLogGpuTimeEnd();
970: VecCUDARestoreArray(xin,&xarray);
971: VecCUDARestoreArray(yin,&yarray);
972: }
973: return(0);
974: }
976: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
977: {
978: PetscErrorCode ierr;
979: PetscScalar a = alpha,b = beta;
980: const PetscScalar *xarray;
981: PetscScalar *yarray;
982: PetscBLASInt one = 1, bn;
983: cublasHandle_t cublasv2handle;
984: cublasStatus_t cberr;
985: cudaError_t err;
988: PetscCUBLASGetHandle(&cublasv2handle);
989: PetscBLASIntCast(yin->map->n,&bn);
990: if (a == (PetscScalar)0.0) {
991: VecScale_SeqCUDA(yin,beta);
992: } else if (b == (PetscScalar)1.0) {
993: VecAXPY_SeqCUDA(yin,alpha,xin);
994: } else if (a == (PetscScalar)1.0) {
995: VecAYPX_SeqCUDA(yin,beta,xin);
996: } else if (b == (PetscScalar)0.0) {
997: VecCUDAGetArrayRead(xin,&xarray);
998: VecCUDAGetArray(yin,&yarray);
999: PetscLogGpuTimeBegin();
1000: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
1001: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
1002: err = WaitForCUDA();CHKERRCUDA(err);
1003: PetscLogGpuTimeEnd();
1004: PetscLogGpuFlops(xin->map->n);
1005: VecCUDARestoreArrayRead(xin,&xarray);
1006: VecCUDARestoreArray(yin,&yarray);
1007: } else {
1008: VecCUDAGetArrayRead(xin,&xarray);
1009: VecCUDAGetArray(yin,&yarray);
1010: PetscLogGpuTimeBegin();
1011: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
1012: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
1013: VecCUDARestoreArrayRead(xin,&xarray);
1014: VecCUDARestoreArray(yin,&yarray);
1015: err = WaitForCUDA();CHKERRCUDA(err);
1016: PetscLogGpuTimeEnd();
1017: PetscLogGpuFlops(3.0*xin->map->n);
1018: }
1019: return(0);
1020: }
1022: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1023: {
1025: cudaError_t err;
1026: PetscInt n = zin->map->n;
1029: if (gamma == (PetscScalar)1.0) {
1030: /* z = a*x + b*y + z */
1031: VecAXPY_SeqCUDA(zin,alpha,xin);
1032: VecAXPY_SeqCUDA(zin,beta,yin);
1033: PetscLogGpuFlops(4.0*n);
1034: } else {
1035: /* z = a*x + b*y + c*z */
1036: VecScale_SeqCUDA(zin,gamma);
1037: VecAXPY_SeqCUDA(zin,alpha,xin);
1038: VecAXPY_SeqCUDA(zin,beta,yin);
1039: PetscLogGpuFlops(5.0*n);
1040: }
1041: err = WaitForCUDA();CHKERRCUDA(err);
1042: return(0);
1043: }
1045: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1046: {
1047: PetscInt n = win->map->n;
1048: const PetscScalar *xarray,*yarray;
1049: PetscScalar *warray;
1050: thrust::device_ptr<const PetscScalar> xptr,yptr;
1051: thrust::device_ptr<PetscScalar> wptr;
1052: PetscErrorCode ierr;
1053: cudaError_t err;
1056: VecCUDAGetArray(win,&warray);
1057: VecCUDAGetArrayRead(xin,&xarray);
1058: VecCUDAGetArrayRead(yin,&yarray);
1059: PetscLogGpuTimeBegin();
1060: try {
1061: wptr = thrust::device_pointer_cast(warray);
1062: xptr = thrust::device_pointer_cast(xarray);
1063: yptr = thrust::device_pointer_cast(yarray);
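/* elementwise multiplication: w[i] = x[i] * y[i] for i = 0,...,n-1 */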
1064: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1065: err = WaitForCUDA();CHKERRCUDA(err);
1066: } catch (char *ex) {
1067: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1068: }
1069: PetscLogGpuTimeEnd();
1070: VecCUDARestoreArrayRead(xin,&xarray);
1071: VecCUDARestoreArrayRead(yin,&yarray);
1072: VecCUDARestoreArray(win,&warray);
1073: PetscLogGpuFlops(n);
1074: return(0);
1075: }
1077: /* should do infinity norm in cuda */
1079: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1080: {
1081: PetscErrorCode ierr;
1082: PetscInt n = xin->map->n;
1083: PetscBLASInt one = 1, bn;
1084: const PetscScalar *xarray;
1085: cublasHandle_t cublasv2handle;
1086: cublasStatus_t cberr;
1087: cudaError_t err;
1090: PetscCUBLASGetHandle(&cublasv2handle);
1091: PetscBLASIntCast(n,&bn);
1092: if (type == NORM_2 || type == NORM_FROBENIUS) {
1093: VecCUDAGetArrayRead(xin,&xarray);
1094: PetscLogGpuTimeBegin();
1095: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1096: PetscLogGpuTimeEnd();
1097: VecCUDARestoreArrayRead(xin,&xarray);
1098: PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1099: } else if (type == NORM_INFINITY) {
1100: int i;
1101: VecCUDAGetArrayRead(xin,&xarray);
1102: PetscLogGpuTimeBegin();
1103: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1104: PetscLogGpuTimeEnd();
1105: if (bn) {
1106: PetscScalar zs;
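/* cublasIXamax reports a 1-based index (BLAS convention), hence the i-1 offset in the copy below */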
1107: err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1108: *z = PetscAbsScalar(zs);
1109: } else *z = 0.0;
1110: VecCUDARestoreArrayRead(xin,&xarray);
1111: } else if (type == NORM_1) {
1112: VecCUDAGetArrayRead(xin,&xarray);
1113: PetscLogGpuTimeBegin();
1114: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1115: VecCUDARestoreArrayRead(xin,&xarray);
1116: PetscLogGpuTimeEnd();
1117: PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1118: } else if (type == NORM_1_AND_2) {
1119: VecNorm_SeqCUDA(xin,NORM_1,z);
1120: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1121: }
1122: return(0);
1123: }
1125: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1126: {
1127: PetscErrorCode ierr;
1128: cudaError_t err;
1129: PetscReal n=s->map->n;
1130: const PetscScalar *sarray,*tarray;
1133: VecCUDAGetArrayRead(s,&sarray);
1134: VecCUDAGetArrayRead(t,&tarray);
1135: VecDot_SeqCUDA(s,t,dp);
1136: VecDot_SeqCUDA(t,t,nm);
1137: VecCUDARestoreArrayRead(s,&sarray);
1138: VecCUDARestoreArrayRead(t,&tarray);
1139: err = WaitForCUDA();CHKERRCUDA(err);
1140: PetscLogGpuFlops(4.0*n);
1141: return(0);
1142: }
1144: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1145: {
1147: cudaError_t err;
1150: if (v->spptr) {
1151: if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1152: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1153: ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1154: }
1155: if (((Vec_CUDA*)v->spptr)->stream) {
1156: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1157: }
1158: }
1159: VecDestroy_SeqCUDA_Private(v);
1160: PetscFree(v->spptr);
1161: return(0);
1162: }
1164: #if defined(PETSC_USE_COMPLEX)
1165: struct conjugate
1166: {
1167: __host__ __device__
1168: PetscScalar operator()(PetscScalar x)
1169: {
1170: return PetscConj(x);
1171: }
1172: };
1173: #endif
1175: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1176: {
1177: #if defined(PETSC_USE_COMPLEX)
1178: PetscScalar *xarray;
1179: PetscErrorCode ierr;
1180: PetscInt n = xin->map->n;
1181: thrust::device_ptr<PetscScalar> xptr;
1182: cudaError_t err;
1185: VecCUDAGetArray(xin,&xarray);
1186: PetscLogGpuTimeBegin();
1187: try {
1188: xptr = thrust::device_pointer_cast(xarray);
1189: thrust::transform(xptr,xptr+n,xptr,conjugate());
1190: err = WaitForCUDA();CHKERRCUDA(err);
1191: } catch (char *ex) {
1192: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1193: }
1194: PetscLogGpuTimeEnd();
1195: VecCUDARestoreArray(xin,&xarray);
1196: #else
1198: #endif
1199: return(0);
1200: }
1202: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1203: {
1205: cudaError_t err;
1212: if (w->data) {
1213: if (((Vec_Seq*)w->data)->array_allocated) {
1214: if (w->pinned_memory) {
1215: PetscMallocSetCUDAHost();
1216: }
1217: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1218: if (w->pinned_memory) {
1219: PetscMallocResetCUDAHost();
1220: w->pinned_memory = PETSC_FALSE;
1221: }
1222: }
1223: ((Vec_Seq*)w->data)->array = NULL;
1224: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1225: }
1226: if (w->spptr) {
1227: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1228: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1229: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1230: }
1231: if (((Vec_CUDA*)w->spptr)->stream) {
1232: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1233: }
1234: PetscFree(w->spptr);
1235: }
1237: if (v->petscnative) {
1238: PetscFree(w->data);
1239: w->data = v->data;
1240: w->offloadmask = v->offloadmask;
1241: w->pinned_memory = v->pinned_memory;
1242: w->spptr = v->spptr;
1243: PetscObjectStateIncrease((PetscObject)w);
1244: } else {
1245: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1246: w->offloadmask = PETSC_OFFLOAD_CPU;
1247: VecCUDAAllocateCheck(w);
1248: }
1249: return(0);
1250: }
1252: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1253: {
1255: cudaError_t err;
1262: if (v->petscnative) {
1263: v->data = w->data;
1264: v->offloadmask = w->offloadmask;
1265: v->pinned_memory = w->pinned_memory;
1266: v->spptr = w->spptr;
1267: VecCUDACopyFromGPU(v);
1268: PetscObjectStateIncrease((PetscObject)v);
1269: w->data = 0;
1270: w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1271: w->spptr = 0;
1272: } else {
1273: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1274: if ((Vec_CUDA*)w->spptr) {
1275: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1276: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1277: if (((Vec_CUDA*)w->spptr)->stream) {
1278: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1279: }
1280: PetscFree(w->spptr);
1281: }
1282: }
1283: return(0);
1284: }