Actual source code: veccusp.cu
petsc-3.5.4 2015-05-23
1: /*
2: Implements the sequential cusp vectors.
3: */
5: #include <petscconf.h>
6: PETSC_CUDA_EXTERN_C_BEGIN
7: #include <petsc-private/vecimpl.h> /*I "petscvec.h" I*/
8: #include <../src/vec/vec/impls/dvecimpl.h>
9: PETSC_CUDA_EXTERN_C_END
10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
12: #include <cuda_runtime.h>
16: /*
17: Allocates space for the vector array on the Host if it does not exist.
18: Does NOT change the PetscCUSPFlag for the vector
19: Does NOT zero the CUSP array
20: */
21: PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
22: {
24: cudaError_t err;
25: PetscScalar *array;
26: Vec_Seq *s;
27: PetscInt n = v->map->n;
30: s = (Vec_Seq*)v->data;
31: VecCUSPAllocateCheck(v);
32: if (s->array == 0) {
33: PetscMalloc1(n,&array);
34: PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
35: s->array = array;
36: s->array_allocated = array;
37: err = cudaHostRegister(s->array, n*sizeof(PetscScalar),cudaHostRegisterMapped);CHKERRCUSP(err);
38: ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_TRUE;
39: }
40: return(0);
41: }
46: /*
47: Allocates space for the vector array on the GPU if it does not exist.
48: Does NOT change the PetscCUSPFlag for the vector
49: Does NOT zero the CUSP array
51: */
52: PetscErrorCode VecCUSPAllocateCheck(Vec v)
53: {
54: cudaError_t err;
55: cudaStream_t stream;
56: Vec_Seq *s = (Vec_Seq*)v->data;
59: // First allocate memory on the GPU if needed
60: if (!v->spptr) {
61: try {
62: v->spptr = new Vec_CUSP;
63: ((Vec_CUSP*)v->spptr)->GPUarray = new CUSPARRAY;
64: ((Vec_CUSP*)v->spptr)->GPUarray->resize((PetscBLASInt)v->map->n);
65: err = cudaStreamCreate(&stream);CHKERRCUSP(err);
66: ((Vec_CUSP*)v->spptr)->stream = stream;
68: ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_FALSE;
69: /* If the array is already allocated, one can register it as (page-locked) mapped.
70: This can substantially accelerate data transfer across the PCI Express */
71: if (s->array) {
72: err = cudaHostRegister(s->array, v->map->n*sizeof(PetscScalar),cudaHostRegisterMapped);CHKERRCUSP(err);
73: ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_TRUE;
74: }
75: v->ops->destroy = VecDestroy_SeqCUSP;
76: } catch(char *ex) {
77: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
78: }
79: }
80: return(0);
81: }
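/* A self-contained sketch of the pinned-memory idiom used above: registering
   an existing host buffer as page-locked lets cudaMemcpyAsync() perform
   genuinely asynchronous DMA transfers across PCI Express.  Function name and
   sizes here are illustrative only, not PETSc API: */
#if 0
#include <cuda_runtime.h>
static int pinned_transfer_demo(PetscScalar *host, PetscScalar *device, size_t n)
{
  cudaStream_t stream;
  if (cudaHostRegister(host, n*sizeof(PetscScalar), cudaHostRegisterMapped) != cudaSuccess) return 1;
  if (cudaStreamCreate(&stream) != cudaSuccess) return 1;
  /* async copies only overlap with other work when the host memory is pinned */
  if (cudaMemcpyAsync(device, host, n*sizeof(PetscScalar), cudaMemcpyHostToDevice, stream) != cudaSuccess) return 1;
  if (cudaStreamSynchronize(stream) != cudaSuccess) return 1;
  if (cudaStreamDestroy(stream) != cudaSuccess) return 1;
  return (cudaHostUnregister(host) == cudaSuccess) ? 0 : 1;
}
#endif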
86: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
87: PetscErrorCode VecCUSPCopyToGPU(Vec v)
88: {
90: cudaError_t err;
91: Vec_CUSP *veccusp;
92: CUSPARRAY *varray;
93: cudaStream_t stream;
96: VecCUSPAllocateCheck(v);
97: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
98: PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
99: try {
100: veccusp=(Vec_CUSP*)v->spptr;
101: varray=veccusp->GPUarray;
102: stream=veccusp->stream;
103: err = cudaMemcpyAsync(varray->data().get(), *(PetscScalar**)v->data, v->map->n*sizeof(PetscScalar),
104: cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
105: err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
106: } catch(char *ex) {
107: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
108: }
109: PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
110: v->valid_GPU_array = PETSC_CUSP_BOTH;
111: }
112: return(0);
113: }
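/* The coherence flags driving the copy routines, sketched as a minimal state
   machine: the flag records which side currently holds valid data
   (PETSC_CUSP_CPU, PETSC_CUSP_GPU) or that both copies agree
   (PETSC_CUSP_BOTH), so a to-GPU copy is only issued from the CPU state.
   copy_host_to_device below is a hypothetical stand-in for the
   cudaMemcpyAsync call above: */
#if 0
switch (v->valid_GPU_array) {
case PETSC_CUSP_CPU:  copy_host_to_device(v); v->valid_GPU_array = PETSC_CUSP_BOTH; break;
case PETSC_CUSP_GPU:  /* device already current: nothing to do */ break;
case PETSC_CUSP_BOTH: /* coherent: nothing to do */ break;
default: break;
}
#endif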
117: static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
118: {
119: CUSPARRAY *varray;
121: cudaError_t err;
122: PetscScalar *cpuPtr, *gpuPtr;
123: cudaStream_t stream;
124: Vec_Seq *s;
125: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
128: VecCUSPAllocateCheck(v);
129: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
130: stream=((Vec_CUSP*)v->spptr)->stream;
131: s = (Vec_Seq*)v->data;
133: PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
134: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
135: gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
136: cpuPtr = s->array + ptop_scatter->recvLowestIndex;
138: /* Note : this code copies the smallest contiguous chunk of data
139: containing ALL of the indices */
140: err = cudaMemcpyAsync(gpuPtr, cpuPtr, ptop_scatter->nr*sizeof(PetscScalar),
141: cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
142: err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
144: #if 0
145: Vec_Seq *s;
146: s = (Vec_Seq*)v->data;
148: CUSPINTARRAYCPU *indicesCPU=&ci->recvIndicesCPU;
149: CUSPINTARRAYGPU *indicesGPU=&ci->recvIndicesGPU;
151: thrust::copy(thrust::make_permutation_iterator(s->array,indicesCPU->begin()),
152: thrust::make_permutation_iterator(s->array,indicesCPU->end()),
153: thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()));
154: #endif
155: // Set the buffer states
156: v->valid_GPU_array = PETSC_CUSP_BOTH;
157: PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
158: }
159: return(0);
160: }
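/* Worked example of the "smallest contiguous chunk" strategy above: for
   receive indices {3,5,9} the scatter records recvLowestIndex = 3 and
   nr = 9-3+1 = 7, so the single cudaMemcpyAsync() moves entries [3,10).
   Entries 4, 6, 7 and 8 travel as ballast, but one contiguous transfer
   usually beats many small ones.  (Index values illustrative; local names
   reused from the function above.) */
#if 0
gpuPtr = varray->data().get() + 3;                            /* recvLowestIndex */
cpuPtr = s->array + 3;
err = cudaMemcpyAsync(gpuPtr, cpuPtr, 7*sizeof(PetscScalar),  /* nr = 7 entries  */
                      cudaMemcpyHostToDevice, stream);
#endif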
165: /*
166: VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
167: */
168: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
169: {
171: cudaError_t err;
172: Vec_CUSP *veccusp;
173: CUSPARRAY *varray;
174: cudaStream_t stream;
177: VecCUSPAllocateCheckHost(v);
178: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
179: PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
180: try {
181: veccusp=(Vec_CUSP*)v->spptr;
182: varray=veccusp->GPUarray;
183: stream=veccusp->stream;
185: err = cudaMemcpyAsync(*(PetscScalar**)v->data, varray->data().get(), v->map->n*sizeof(PetscScalar),
186: cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
187: err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
188: } catch(char *ex) {
189: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
190: }
191: PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
192: v->valid_GPU_array = PETSC_CUSP_BOTH;
193: }
194: return(0);
195: }
199: /* Note that this function only copies *some* of the values up from the GPU to CPU,
200:    which means that we need to recombine the data at some point before using any of the standard functions.
201:    We could add another few flag-types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
202:    which must always be called in pairs
203: */
204: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
205: {
206: CUSPARRAY *varray;
208: cudaError_t err;
209: PetscScalar *cpuPtr, *gpuPtr;
210: cudaStream_t stream;
211: Vec_Seq *s;
212: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
215: VecCUSPAllocateCheckHost(v);
216: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
217: PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
219: stream=((Vec_CUSP*)v->spptr)->stream;
220: varray=((Vec_CUSP*)v->spptr)->GPUarray;
221: s = (Vec_Seq*)v->data;
222: gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
223: cpuPtr = s->array + ptop_scatter->sendLowestIndex;
225: /* Note : this code copies the smallest contiguous chunk of data
226: containing ALL of the indices */
227: err = cudaMemcpyAsync(cpuPtr, gpuPtr, ptop_scatter->ns*sizeof(PetscScalar),
228: cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
229: err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
231: #if 0
232: Vec_Seq *s;
233: s = (Vec_Seq*)v->data;
234: CUSPINTARRAYCPU *indicesCPU=&ci->sendIndicesCPU;
235: CUSPINTARRAYGPU *indicesGPU=&ci->sendIndicesGPU;
237: thrust::copy(thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
238: thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
239: thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
240: #endif
241: VecCUSPRestoreArrayRead(v,&varray);
242: PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
243: v->valid_GPU_array = PETSC_CUSP_BOTH;
244: }
245: return(0);
246: }
250: static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
251: {
252: PetscScalar *ya;
253: const PetscScalar *xa;
254: PetscErrorCode ierr;
257: VecCUSPAllocateCheckHost(xin);
258: VecCUSPAllocateCheckHost(yin);
259: if (xin != yin) {
260: VecGetArrayRead(xin,&xa);
261: VecGetArray(yin,&ya);
262: PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
263: VecRestoreArrayRead(xin,&xa);
264: VecRestoreArray(yin,&ya);
265: }
266: return(0);
267: }
271: static PetscErrorCode VecSetRandom_SeqCUSP_Private(Vec xin,PetscRandom r)
272: {
274: PetscInt n = xin->map->n,i;
275: PetscScalar *xx;
278: VecGetArray(xin,&xx);
279: for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
280: VecRestoreArray(xin,&xx);
281: return(0);
282: }
286: static PetscErrorCode VecDestroy_SeqCUSP_Private(Vec v)
287: {
288: Vec_Seq *vs = (Vec_Seq*)v->data;
292: PetscObjectSAWsViewOff(v);
293: #if defined(PETSC_USE_LOG)
294: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
295: #endif
296: if (vs->array_allocated) PetscFree(vs->array_allocated);
297: PetscFree(vs);
298: return(0);
299: }
303: static PetscErrorCode VecResetArray_SeqCUSP_Private(Vec vin)
304: {
305: Vec_Seq *v = (Vec_Seq*)vin->data;
308: v->array = v->unplacedarray;
309: v->unplacedarray = 0;
310: return(0);
311: }
313: /* the following four public versions are necessary because we use CUSP in the regular PETSc code and they need to be called from plain C code. */
316: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
317: {
321: VecCUSPAllocateCheck(v);
322: return(0);
323: }
327: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
328: {
332: VecCUSPCopyToGPU(v);
333: return(0);
334: }
340: /*
341:    VecCUSPCopyToGPUSome_Public - Copies selected entries of a vector from the CPU down to the GPU
343: Input Parameters:
344: + v - the vector
345:  - ci - the requested indices; these should be created with CUSPIndicesCreate()
347: */
348: PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
349: {
353: VecCUSPCopyToGPUSome(v,ci);
354: return(0);
355: }
359: /*
360:    VecCUSPCopyFromGPUSome_Public - Copies selected entries of a vector from the GPU up to the CPU
362: Input Parameters:
363: + v - the vector
364:  - ci - the requested indices; these should be created with CUSPIndicesCreate()
365: */
366: PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
367: {
371: VecCUSPCopyFromGPUSome(v,ci);
372: return(0);
373: }
375: /*MC
376: VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP
378: Options Database Keys:
379: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()
381: Level: beginner
383: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
384: M*/
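/* A minimal usage sketch for this vector type (assumes a CUSP-enabled PETSc
   build); error checking elided: */
#if 0
Vec x;
VecCreate(PETSC_COMM_SELF,&x);
VecSetSizes(x,100,100);
VecSetFromOptions(x);   /* run with -vec_type seqcusp to select VECSEQCUSP */
VecSet(x,1.0);
VecDestroy(&x);
#endif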
386: /* for VecAYPX_SeqCUSP*/
387: namespace cusp
388: {
389: namespace blas
390: {
391: namespace detail
392: {
393: template <typename T>
394: struct AYPX : public thrust::binary_function<T,T,T>
395: {
396: T alpha;
398: AYPX(T _alpha) : alpha(_alpha) {}
400: __host__ __device__
401: T operator()(T x, T y)
402: {
403: return alpha * y + x;
404: }
405: };
406: }
408: template <typename ForwardIterator1,
409: typename ForwardIterator2,
410: typename ScalarType>
411: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
412: {
413: thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
414: }
415: template <typename Array1, typename Array2, typename ScalarType>
416: void aypx(const Array1& x, Array2& y, ScalarType alpha)
417: {
418: detail::assert_same_dimensions(x,y);
419: aypx(x.begin(),x.end(),y.begin(),alpha);
420: }
421: }
422: }
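/* A minimal sketch of calling the aypx() helper defined above (assumes a
   real, double-precision PetscScalar; values illustrative): */
#if 0
CUSPARRAY x(4), y(4);
thrust::fill(x.begin(), x.end(), 1.0);
thrust::fill(y.begin(), y.end(), 2.0);
cusp::blas::aypx(x, y, 3.0);   /* y <- 3*y + x: every entry becomes 7 */
#endif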
426: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
427: {
428: CUSPARRAY *xarray,*yarray;
432: VecCUSPGetArrayRead(xin,&xarray);
433: VecCUSPGetArrayReadWrite(yin,&yarray);
434: try {
435: if (alpha != 0.0) {
436: cusp::blas::aypx(*xarray,*yarray,alpha);
437: PetscLogFlops(2.0*yin->map->n);
438: } else {
439: cusp::blas::copy(*xarray,*yarray);
440: }
441: WaitForGPU();CHKERRCUSP(ierr);
442: } catch(char *ex) {
443: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
444: }
445: VecCUSPRestoreArrayRead(xin,&xarray);
446: VecCUSPRestoreArrayReadWrite(yin,&yarray);
447: return(0);
448: }
453: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
454: {
455: CUSPARRAY *xarray,*yarray;
459: if (alpha != 0.0) {
460: VecCUSPGetArrayRead(xin,&xarray);
461: VecCUSPGetArrayReadWrite(yin,&yarray);
462: try {
463: cusp::blas::axpy(*xarray,*yarray,alpha);
464: WaitForGPU();CHKERRCUSP(ierr);
465: } catch(char *ex) {
466: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
467: }
468: VecCUSPRestoreArrayRead(xin,&xarray);
469: VecCUSPRestoreArrayReadWrite(yin,&yarray);
470: PetscLogFlops(2.0*yin->map->n);
471: }
472: return(0);
473: }
475: struct VecCUSPPointwiseDivide
476: {
477: template <typename Tuple>
478: __host__ __device__
479: void operator()(Tuple t)
480: {
481: thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
482: }
483: };
487: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
488: {
489: CUSPARRAY *warray=NULL,*xarray=NULL,*yarray=NULL;
493: VecCUSPGetArrayRead(xin,&xarray);
494: VecCUSPGetArrayRead(yin,&yarray);
495: VecCUSPGetArrayWrite(win,&warray);
496: try {
497: thrust::for_each(
498: thrust::make_zip_iterator(
499: thrust::make_tuple(
500: warray->begin(),
501: xarray->begin(),
502: yarray->begin())),
503: thrust::make_zip_iterator(
504: thrust::make_tuple(
505: warray->end(),
506: xarray->end(),
507: yarray->end())),
508: VecCUSPPointwiseDivide());
509: WaitForGPU();CHKERRCUSP(ierr);
510: } catch(char *ex) {
511: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
512: }
513: PetscLogFlops(win->map->n);
514: VecCUSPRestoreArrayRead(xin,&xarray);
515: VecCUSPRestoreArrayRead(yin,&yarray);
516: VecCUSPRestoreArrayWrite(win,&warray);
517: return(0);
518: }
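/* The zip_iterator construction above is the standard Thrust idiom for
   elementwise kernels over several arrays at once: zipping the begin()
   iterators yields one iterator over tuples, and the functor unpacks each
   tuple with thrust::get<i>().  A self-contained sketch (needs
   <thrust/device_vector.h>; values illustrative): */
#if 0
thrust::device_vector<double> w(3), x(3, 6.0), y(3, 2.0);
thrust::for_each(
  thrust::make_zip_iterator(thrust::make_tuple(w.begin(), x.begin(), y.begin())),
  thrust::make_zip_iterator(thrust::make_tuple(w.end(),   x.end(),   y.end())),
  VecCUSPPointwiseDivide());   /* every w[i] becomes 6.0/2.0 = 3.0 */
#endif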
521: struct VecCUSPWAXPY
522: {
523: template <typename Tuple>
524: __host__ __device__
525: void operator()(Tuple t)
526: {
527: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
528: }
529: };
531: struct VecCUSPSum
532: {
533: template <typename Tuple>
534: __host__ __device__
535: void operator()(Tuple t)
536: {
537: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
538: }
539: };
541: struct VecCUSPDiff
542: {
543: template <typename Tuple>
544: __host__ __device__
545: void operator()(Tuple t)
546: {
547: thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
548: }
549: };
553: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
554: {
555: CUSPARRAY *xarray=NULL,*yarray=NULL,*warray=NULL;
559: if (alpha == 0.0) {
560: VecCopy_SeqCUSP(yin,win);
561: } else {
562: VecCUSPGetArrayRead(xin,&xarray);
563: VecCUSPGetArrayRead(yin,&yarray);
564: VecCUSPGetArrayWrite(win,&warray);
565: if (alpha == 1.0) {
566: try {
567: thrust::for_each(
568: thrust::make_zip_iterator(
569: thrust::make_tuple(
570: warray->begin(),
571: yarray->begin(),
572: xarray->begin())),
573: thrust::make_zip_iterator(
574: thrust::make_tuple(
575: warray->end(),
576: yarray->end(),
577: xarray->end())),
578: VecCUSPSum());
579: } catch(char *ex) {
580: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
581: }
582: PetscLogFlops(win->map->n);
583: } else if (alpha == -1.0) {
584: try {
585: thrust::for_each(
586: thrust::make_zip_iterator(
587: thrust::make_tuple(
588: warray->begin(),
589: yarray->begin(),
590: xarray->begin())),
591: thrust::make_zip_iterator(
592: thrust::make_tuple(
593: warray->end(),
594: yarray->end(),
595: xarray->end())),
596: VecCUSPDiff());
597: } catch(char *ex) {
598: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
599: }
600: PetscLogFlops(win->map->n);
601: } else {
602: try {
603: thrust::for_each(
604: thrust::make_zip_iterator(
605: thrust::make_tuple(
606: warray->begin(),
607: yarray->begin(),
608: thrust::make_constant_iterator(alpha),
609: xarray->begin())),
610: thrust::make_zip_iterator(
611: thrust::make_tuple(
612: warray->end(),
613: yarray->end(),
614: thrust::make_constant_iterator(alpha),
615: xarray->end())),
616: VecCUSPWAXPY());
617: } catch(char *ex) {
618: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
619: }
620: PetscLogFlops(2*win->map->n);
621: }
622: WaitForGPU();CHKERRCUSP(ierr);
623: VecCUSPRestoreArrayRead(xin,&xarray);
624: VecCUSPRestoreArrayRead(yin,&yarray);
625: VecCUSPRestoreArrayWrite(win,&warray);
626: }
627: return(0);
628: }
630: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
631: struct VecCUSPMAXPY4
632: {
633: template <typename Tuple>
634: __host__ __device__
635: void operator()(Tuple t)
636: {
637:     /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
638: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
639: }
640: };
643: struct VecCUSPMAXPY3
644: {
645: template <typename Tuple>
646: __host__ __device__
647: void operator()(Tuple t)
648: {
649:     /* y += a1*x1 + a2*x2 + a3*x3 */
650: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
651: }
652: };
654: struct VecCUSPMAXPY2
655: {
656: template <typename Tuple>
657: __host__ __device__
658: void operator()(Tuple t)
659: {
660:     /* y += a1*x1 + a2*x2 */
661: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
662: }
663: };
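/* thrust::make_constant_iterator(), used heavily below, is the trick that
   threads a scalar through a zip_iterator: it dereferences to the same value
   at every offset, so a tuple slot can read alpha without allocating a device
   array of copies.  Sketch (value illustrative): */
#if 0
thrust::constant_iterator<PetscScalar> a = thrust::make_constant_iterator((PetscScalar)2.0);
/* a[0] == a[1000000] == 2.0 */
#endif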
666: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
667: {
669: CUSPARRAY *xarray,*yy0,*yy1,*yy2,*yy3;
670: PetscInt n = xin->map->n,j,j_rem;
671: PetscScalar alpha0,alpha1,alpha2,alpha3;
674: PetscLogFlops(nv*2.0*n);
675: VecCUSPGetArrayReadWrite(xin,&xarray);
676: switch (j_rem=nv&0x3) {
677: case 3:
678: alpha0 = alpha[0];
679: alpha1 = alpha[1];
680: alpha2 = alpha[2];
681: alpha += 3;
682: VecCUSPGetArrayRead(y[0],&yy0);
683: VecCUSPGetArrayRead(y[1],&yy1);
684: VecCUSPGetArrayRead(y[2],&yy2);
685: try {
686: thrust::for_each(
687: thrust::make_zip_iterator(
688: thrust::make_tuple(
689: xarray->begin(),
690: thrust::make_constant_iterator(alpha0),
691: yy0->begin(),
692: thrust::make_constant_iterator(alpha1),
693: yy1->begin(),
694: thrust::make_constant_iterator(alpha2),
695: yy2->begin())),
696: thrust::make_zip_iterator(
697: thrust::make_tuple(
698: xarray->end(),
699: thrust::make_constant_iterator(alpha0),
700: yy0->end(),
701: thrust::make_constant_iterator(alpha1),
702: yy1->end(),
703: thrust::make_constant_iterator(alpha2),
704: yy2->end())),
705: VecCUSPMAXPY3());
706: } catch(char *ex) {
707: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
708: }
709: VecCUSPRestoreArrayRead(y[0],&yy0);
710: VecCUSPRestoreArrayRead(y[1],&yy1);
711: VecCUSPRestoreArrayRead(y[2],&yy2);
712: y += 3;
713: break;
714: case 2:
715: alpha0 = alpha[0];
716: alpha1 = alpha[1];
717: alpha +=2;
718: VecCUSPGetArrayRead(y[0],&yy0);
719: VecCUSPGetArrayRead(y[1],&yy1);
720: try {
721: thrust::for_each(
722: thrust::make_zip_iterator(
723: thrust::make_tuple(
724: xarray->begin(),
725: thrust::make_constant_iterator(alpha0),
726: yy0->begin(),
727: thrust::make_constant_iterator(alpha1),
728: yy1->begin())),
729: thrust::make_zip_iterator(
730: thrust::make_tuple(
731: xarray->end(),
732: thrust::make_constant_iterator(alpha0),
733: yy0->end(),
734: thrust::make_constant_iterator(alpha1),
735: yy1->end())),
736: VecCUSPMAXPY2());
737: } catch(char *ex) {
738: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
739: }
740: y +=2;
741: break;
742: case 1:
743: alpha0 = *alpha++;
744: VecAXPY_SeqCUSP(xin,alpha0,y[0]);
745: y +=1;
746: break;
747: }
748: for (j=j_rem; j<nv; j+=4) {
749: alpha0 = alpha[0];
750: alpha1 = alpha[1];
751: alpha2 = alpha[2];
752: alpha3 = alpha[3];
753: alpha += 4;
754: VecCUSPGetArrayRead(y[0],&yy0);
755: VecCUSPGetArrayRead(y[1],&yy1);
756: VecCUSPGetArrayRead(y[2],&yy2);
757: VecCUSPGetArrayRead(y[3],&yy3);
758: try {
759: thrust::for_each(
760: thrust::make_zip_iterator(
761: thrust::make_tuple(
762: xarray->begin(),
763: thrust::make_constant_iterator(alpha0),
764: yy0->begin(),
765: thrust::make_constant_iterator(alpha1),
766: yy1->begin(),
767: thrust::make_constant_iterator(alpha2),
768: yy2->begin(),
769: thrust::make_constant_iterator(alpha3),
770: yy3->begin())),
771: thrust::make_zip_iterator(
772: thrust::make_tuple(
773: xarray->end(),
774: thrust::make_constant_iterator(alpha0),
775: yy0->end(),
776: thrust::make_constant_iterator(alpha1),
777: yy1->end(),
778: thrust::make_constant_iterator(alpha2),
779: yy2->end(),
780: thrust::make_constant_iterator(alpha3),
781: yy3->end())),
782: VecCUSPMAXPY4());
783: } catch(char *ex) {
784: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
785: }
786: VecCUSPRestoreArrayRead(y[0],&yy0);
787: VecCUSPRestoreArrayRead(y[1],&yy1);
788: VecCUSPRestoreArrayRead(y[2],&yy2);
789: VecCUSPRestoreArrayRead(y[3],&yy3);
790: y += 4;
791: }
792: VecCUSPRestoreArrayReadWrite(xin,&xarray);
793: WaitForGPU();CHKERRCUSP(ierr);
794: return(0);
795: }
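/* How the unrolling above partitions the input: nv & 0x3 peels off the
   remainder (0-3 vectors) in the switch, then the main loop consumes exactly
   four vectors per pass.  For example, with nv = 7: */
#if 0
j_rem = 7 & 0x3;                 /* = 3: y[0..2] handled by the case-3 block */
for (j=j_rem; j<7; j+=4) { }     /* one pass handling y[3..6]                */
#endif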
800: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
801: {
802: CUSPARRAY *xarray,*yarray;
804: // PetscScalar *xptr,*yptr,*zgpu;
805: //PetscReal tmp;
808: //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
809: //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
810: VecCUSPGetArrayRead(xin,&xarray);
811: VecCUSPGetArrayRead(yin,&yarray);
812: try {
813: #if defined(PETSC_USE_COMPLEX)
814: *z = cusp::blas::dotc(*yarray,*xarray);
815: #else
816: *z = cusp::blas::dot(*yarray,*xarray);
817: #endif
818: } catch(char *ex) {
819: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
820: }
821: WaitForGPU();CHKERRCUSP(ierr);
822: if (xin->map->n >0) {
823: PetscLogFlops(2.0*xin->map->n-1);
824: }
825: VecCUSPRestoreArrayRead(xin,&xarray);
826: VecCUSPRestoreArrayRead(yin,&yarray);
827: return(0);
828: }
830: //
831: // CUDA kernels for MDot to follow
832: //
834: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
835: #define MDOT_WORKGROUP_SIZE 128
836: #define MDOT_WORKGROUP_NUM 128
838: // M = 2:
839: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
840: PetscInt size, PetscScalar *group_results)
841: {
842: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
843: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
844: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
845: PetscInt vec_start_index = blockIdx.x * entries_per_group;
846: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
848: PetscScalar entry_x = 0;
849: PetscScalar group_sum0 = 0;
850: PetscScalar group_sum1 = 0;
851: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
852: entry_x = x[i]; // load only once from global memory!
853: group_sum0 += entry_x * y0[i];
854: group_sum1 += entry_x * y1[i];
855: }
856: tmp_buffer[threadIdx.x] = group_sum0;
857: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
859: // parallel reduction
860: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
861: __syncthreads();
862: if (threadIdx.x < stride) {
863: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
864: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
865: }
866: }
868: // write result of group to group_results
869: if (threadIdx.x == 0) {
870: group_results[blockIdx.x] = tmp_buffer[0];
871: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
872: }
873: }
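/* The reduction strategy shared by all the VecMDot kernels here, in
   miniature: each block accumulates per-thread partial sums in shared memory,
   a halving loop folds them together, and thread 0 publishes one value per
   block; the host then sums the gridDim.x block results.  A minimal sketch of
   the halving loop alone (device code; blockDim.x a power of two): */
#if 0
__shared__ PetscScalar s[MDOT_WORKGROUP_SIZE];
for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
  __syncthreads();               /* make the previous round's writes visible */
  if (threadIdx.x < stride) s[threadIdx.x] += s[threadIdx.x + stride];
}
/* after the loop, s[0] holds the block-wide sum */
#endif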
875: // M = 3:
876: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
877: PetscInt size, PetscScalar *group_results)
878: {
879: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
880: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
881: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
882: PetscInt vec_start_index = blockIdx.x * entries_per_group;
883: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
885: PetscScalar entry_x = 0;
886: PetscScalar group_sum0 = 0;
887: PetscScalar group_sum1 = 0;
888: PetscScalar group_sum2 = 0;
889: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
890: entry_x = x[i]; // load only once from global memory!
891: group_sum0 += entry_x * y0[i];
892: group_sum1 += entry_x * y1[i];
893: group_sum2 += entry_x * y2[i];
894: }
895: tmp_buffer[threadIdx.x] = group_sum0;
896: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
897: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
899: // parallel reduction
900: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
901: __syncthreads();
902: if (threadIdx.x < stride) {
903: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
904: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
905: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
906: }
907: }
909: // write result of group to group_results
910: if (threadIdx.x == 0) {
911: group_results[blockIdx.x ] = tmp_buffer[0];
912: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
913: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
914: }
915: }
917: // M = 4:
918: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
919: PetscInt size, PetscScalar *group_results)
920: {
921: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
922: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
923: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
924: PetscInt vec_start_index = blockIdx.x * entries_per_group;
925: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
927: PetscScalar entry_x = 0;
928: PetscScalar group_sum0 = 0;
929: PetscScalar group_sum1 = 0;
930: PetscScalar group_sum2 = 0;
931: PetscScalar group_sum3 = 0;
932: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
933: entry_x = x[i]; // load only once from global memory!
934: group_sum0 += entry_x * y0[i];
935: group_sum1 += entry_x * y1[i];
936: group_sum2 += entry_x * y2[i];
937: group_sum3 += entry_x * y3[i];
938: }
939: tmp_buffer[threadIdx.x] = group_sum0;
940: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
941: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
942: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
944: // parallel reduction
945: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
946: __syncthreads();
947: if (threadIdx.x < stride) {
948: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
949: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
950: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
951: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
952: }
953: }
955: // write result of group to group_results
956: if (threadIdx.x == 0) {
957: group_results[blockIdx.x ] = tmp_buffer[0];
958: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
959: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
960: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
961: }
962: }
964: // M = 8:
965: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
966: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
967: PetscInt size, PetscScalar *group_results)
968: {
969: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
970: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
971: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
972: PetscInt vec_start_index = blockIdx.x * entries_per_group;
973: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
975: PetscScalar entry_x = 0;
976: PetscScalar group_sum0 = 0;
977: PetscScalar group_sum1 = 0;
978: PetscScalar group_sum2 = 0;
979: PetscScalar group_sum3 = 0;
980: PetscScalar group_sum4 = 0;
981: PetscScalar group_sum5 = 0;
982: PetscScalar group_sum6 = 0;
983: PetscScalar group_sum7 = 0;
984: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
985: entry_x = x[i]; // load only once from global memory!
986: group_sum0 += entry_x * y0[i];
987: group_sum1 += entry_x * y1[i];
988: group_sum2 += entry_x * y2[i];
989: group_sum3 += entry_x * y3[i];
990: group_sum4 += entry_x * y4[i];
991: group_sum5 += entry_x * y5[i];
992: group_sum6 += entry_x * y6[i];
993: group_sum7 += entry_x * y7[i];
994: }
995: tmp_buffer[threadIdx.x] = group_sum0;
996: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
997: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
998: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
999: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
1000: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
1001: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
1002: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
1004: // parallel reduction
1005: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1006: __syncthreads();
1007: if (threadIdx.x < stride) {
1008: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1009: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
1010: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
1011: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
1012: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
1013: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
1014: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
1015: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
1016: }
1017: }
1019: // write result of group to group_results
1020: if (threadIdx.x == 0) {
1021: group_results[blockIdx.x ] = tmp_buffer[0];
1022: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
1023: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
1024: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
1025: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
1026: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
1027: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
1028: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
1029: }
1030: }
1035: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1036: {
1038: PetscInt i,j,n = xin->map->n,current_y_index = 0;
1039: CUSPARRAY *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
1040: PetscScalar *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
1041: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
1042: cudaError_t cuda_ierr;
1045: // allocate scratchpad memory for the results of individual work groups:
1046: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
1047:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
1048: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);
1050: VecCUSPGetArrayRead(xin,&xarray);
1051: xptr = thrust::raw_pointer_cast(xarray->data());
1053: while (current_y_index < nv)
1054: {
1055: switch (nv - current_y_index) {
1057: case 7:
1058: case 6:
1059: case 5:
1060: case 4:
1061: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1062: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1063: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1064: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1066: #if defined(PETSC_USE_COMPLEX)
1067:       z[current_y_index] = cusp::blas::dotc(*y0array,*xarray);
1068:       z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
1069:       z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
1070:       z[current_y_index+3] = cusp::blas::dotc(*y3array,*xarray);
1071: #else
1072: // extract raw device pointers:
1073: y0ptr = thrust::raw_pointer_cast(y0array->data());
1074: y1ptr = thrust::raw_pointer_cast(y1array->data());
1075: y2ptr = thrust::raw_pointer_cast(y2array->data());
1076: y3ptr = thrust::raw_pointer_cast(y3array->data());
1078: // run kernel:
1079: VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
1081:       // copy results back to the host:
1082:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
1083: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1085: // sum group results into z:
1086: for (j=0; j<4; ++j) {
1087: z[current_y_index + j] = 0;
1088: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1089: }
1090: #endif
1091: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1092: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1093: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1094: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1095: current_y_index += 4;
1096: break;
1098: case 3:
1099: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1100: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1101: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1103: #if defined(PETSC_USE_COMPLEX)
1104:       z[current_y_index] = cusp::blas::dotc(*y0array,*xarray);
1105:       z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
1106:       z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
1107: #else
1108: // extract raw device pointers:
1109: y0ptr = thrust::raw_pointer_cast(y0array->data());
1110: y1ptr = thrust::raw_pointer_cast(y1array->data());
1111: y2ptr = thrust::raw_pointer_cast(y2array->data());
1113: // run kernel:
1114: VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
1116:       // copy results back to the host:
1117:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
1118: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1120: // sum group results into z:
1121: for (j=0; j<3; ++j) {
1122: z[current_y_index + j] = 0;
1123: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1124: }
1125: #endif
1127: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1128: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1129: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1130: current_y_index += 3;
1131: break;
1133: case 2:
1134: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1135: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1137: #if defined(PETSC_USE_COMPLEX)
1138:       z[current_y_index] = cusp::blas::dotc(*y0array,*xarray);
1139:       z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
1140: #else
1141: // extract raw device pointers:
1142: y0ptr = thrust::raw_pointer_cast(y0array->data());
1143: y1ptr = thrust::raw_pointer_cast(y1array->data());
1145: // run kernel:
1146: VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
1148:       // copy results back to the host:
1149:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
1150: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1152: // sum group results into z:
1153: for (j=0; j<2; ++j) {
1154: z[current_y_index + j] = 0;
1155: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1156: }
1157: #endif
1158: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1159: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1160: current_y_index += 2;
1161: break;
1163: case 1:
1164: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1165: #if defined(PETSC_USE_COMPLEX)
1166: z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
1167: #else
1168: z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
1169: #endif
1170: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1171: current_y_index += 1;
1172: break;
1174: default: // 8 or more vectors left
1175: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1176: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1177: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1178: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1179: VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
1180: VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
1181: VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
1182: VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);
1184: #if defined(PETSC_USE_COMPLEX)
1185:       z[current_y_index] = cusp::blas::dotc(*y0array,*xarray);
1186:       z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
1187:       z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
1188:       z[current_y_index+3] = cusp::blas::dotc(*y3array,*xarray);
1189:       z[current_y_index+4] = cusp::blas::dotc(*y4array,*xarray);
1190:       z[current_y_index+5] = cusp::blas::dotc(*y5array,*xarray);
1191:       z[current_y_index+6] = cusp::blas::dotc(*y6array,*xarray);
1192:       z[current_y_index+7] = cusp::blas::dotc(*y7array,*xarray);
1193: #else
1194: // extract raw device pointers:
1195: y0ptr = thrust::raw_pointer_cast(y0array->data());
1196: y1ptr = thrust::raw_pointer_cast(y1array->data());
1197: y2ptr = thrust::raw_pointer_cast(y2array->data());
1198: y3ptr = thrust::raw_pointer_cast(y3array->data());
1199: y4ptr = thrust::raw_pointer_cast(y4array->data());
1200: y5ptr = thrust::raw_pointer_cast(y5array->data());
1201: y6ptr = thrust::raw_pointer_cast(y6array->data());
1202: y7ptr = thrust::raw_pointer_cast(y7array->data());
1204: // run kernel:
1205: VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
1207:       // copy results back to the host:
1208:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1209: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1211: // sum group results into z:
1212: for (j=0; j<8; ++j) {
1213: z[current_y_index + j] = 0;
1214: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1215: }
1216: #endif
1217: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1218: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1219: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1220: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1221: VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1222: VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1223: VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1224: VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1225: current_y_index += 8;
1226: break;
1227: }
1228: }
1229: VecCUSPRestoreArrayRead(xin,&xarray);
1231:   cuda_ierr = cudaFree(group_results_gpu);
1232:   if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
1233: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1234: return(0);
1235: }
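/* Memory layout tying the kernels to the host-side summation above: partial
   result j of block b is stored at group_results[b + j*gridDim.x], so with
   gridDim.x == MDOT_WORKGROUP_NUM the entries [0,128) belong to
   z[current_y_index], [128,256) to z[current_y_index+1], and so on -- exactly
   the ranges the (j,i) double loops accumulate.  Equivalent indexing, spelled
   out for the kernel4 case: */
#if 0
for (j=0; j<4; ++j) {
  z[current_y_index+j] = 0;
  for (i=0; i<MDOT_WORKGROUP_NUM; ++i)
    z[current_y_index+j] += group_results_cpu[j*MDOT_WORKGROUP_NUM + i];
}
#endif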
1237: #undef MDOT_WORKGROUP_SIZE
1238: #undef MDOT_WORKGROUP_NUM
1244: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1245: {
1246: CUSPARRAY *xarray=NULL;
1250: /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1251: VecCUSPGetArrayWrite(xin,&xarray);
1252: try {
1253: cusp::blas::fill(*xarray,alpha);
1254: } catch(char *ex) {
1255: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1256: }
1257: WaitForGPU();CHKERRCUSP(ierr);
1258: VecCUSPRestoreArrayWrite(xin,&xarray);
1259: return(0);
1260: }
1264: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1265: {
1266: CUSPARRAY *xarray;
1270: if (alpha == 0.0) {
1271: VecSet_SeqCUSP(xin,alpha);
1272: } else if (alpha != 1.0) {
1273: VecCUSPGetArrayReadWrite(xin,&xarray);
1274: try {
1275: cusp::blas::scal(*xarray,alpha);
1276: } catch(char *ex) {
1277: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1278: }
1279: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1280: }
1281: WaitForGPU();CHKERRCUSP(ierr);
1282: PetscLogFlops(xin->map->n);
1283: return(0);
1284: }
1289: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1290: {
1291: CUSPARRAY *xarray,*yarray;
1295: //#if defined(PETSC_USE_COMPLEX)
1296: /*Not working for complex*/
1297: //#else
1298: VecCUSPGetArrayRead(xin,&xarray);
1299: VecCUSPGetArrayRead(yin,&yarray);
1300: try {
1301: *z = cusp::blas::dot(*xarray,*yarray);
1302: } catch(char *ex) {
1303: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1304: }
1305: //#endif
1306: WaitForGPU();CHKERRCUSP(ierr);
1307: if (xin->map->n > 0) {
1308: PetscLogFlops(2.0*xin->map->n-1);
1309: }
1310: VecCUSPRestoreArrayRead(yin,&yarray);
1311: VecCUSPRestoreArrayRead(xin,&xarray);
1312: return(0);
1313: }
1316: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1317: {
1318: CUSPARRAY *xarray,*yarray;
1322: if (xin != yin) {
1323: if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1324: VecCUSPGetArrayRead(xin,&xarray);
1325: VecCUSPGetArrayWrite(yin,&yarray);
1326: try {
1327: cusp::blas::copy(*xarray,*yarray);
1328: } catch(char *ex) {
1329: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1330: }
1331: WaitForGPU();CHKERRCUSP(ierr);
1332: VecCUSPRestoreArrayRead(xin,&xarray);
1333: VecCUSPRestoreArrayWrite(yin,&yarray);
1335: } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1336: /* copy in CPU if we are on the CPU*/
1337: VecCopy_SeqCUSP_Private(xin,yin);
1338: } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1339: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
1340: if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1341: /* copy in CPU */
1342: VecCopy_SeqCUSP_Private(xin,yin);
1344: } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1345: /* copy in GPU */
1346: VecCUSPGetArrayRead(xin,&xarray);
1347: VecCUSPGetArrayWrite(yin,&yarray);
1348: try {
1349: cusp::blas::copy(*xarray,*yarray);
1350: WaitForGPU();CHKERRCUSP(ierr);
1351: } catch(char *ex) {
1352: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1353: }
1354: VecCUSPRestoreArrayRead(xin,&xarray);
1355: VecCUSPRestoreArrayWrite(yin,&yarray);
1356: } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1357:       /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
1358:          default to copying on the GPU (this is an arbitrary choice) */
1359: VecCUSPGetArrayRead(xin,&xarray);
1360: VecCUSPGetArrayWrite(yin,&yarray);
1361: try {
1362: cusp::blas::copy(*xarray,*yarray);
1363: WaitForGPU();CHKERRCUSP(ierr);
1364: } catch(char *ex) {
1365: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1366: }
1367: VecCUSPRestoreArrayRead(xin,&xarray);
1368: VecCUSPRestoreArrayWrite(yin,&yarray);
1369: } else {
1370: VecCopy_SeqCUSP_Private(xin,yin);
1371: }
1372: }
1373: }
1374: return(0);
1375: }
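/* Summary of the dispatch above -- where the copy is performed as a function
   of each vector's coherence flag:
     xin valid on | yin valid on | copy runs on
     -------------+--------------+--------------------------
     GPU          | (any)        | GPU
     CPU          | (any)        | CPU (private routine)
     BOTH         | CPU          | CPU (private routine)
     BOTH         | GPU or BOTH  | GPU (arbitrary choice)
     BOTH         | other        | CPU (private routine)   */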
1380: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1381: {
1383: PetscBLASInt one = 1,bn;
1384: CUSPARRAY *xarray,*yarray;
1387: PetscBLASIntCast(xin->map->n,&bn);
1388: if (xin != yin) {
1389: VecCUSPGetArrayReadWrite(xin,&xarray);
1390: VecCUSPGetArrayReadWrite(yin,&yarray);
1392: #if defined(PETSC_USE_COMPLEX)
1393: #if defined(PETSC_USE_REAL_SINGLE)
1394: cublasCswap(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);
1395: #else
1396: cublasZswap(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);
1397: #endif
1398: #else
1399: #if defined(PETSC_USE_REAL_SINGLE)
1400: cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1401: #else
1402: cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1403: #endif
1404: #endif
1405: cublasGetError();CHKERRCUSP(ierr);
1406: WaitForGPU();CHKERRCUSP(ierr);
1407: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1408: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1409: }
1410: return(0);
1411: }
1413: struct VecCUSPAX
1414: {
1415: template <typename Tuple>
1416: __host__ __device__
1417: void operator()(Tuple t)
1418: {
1419: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1420: }
1421: };
1424: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1425: {
1427: PetscScalar a = alpha,b = beta;
1428: CUSPARRAY *xarray,*yarray;
1431: if (a == 0.0) {
1432: VecScale_SeqCUSP(yin,beta);
1433: } else if (b == 1.0) {
1434: VecAXPY_SeqCUSP(yin,alpha,xin);
1435: } else if (a == 1.0) {
1436: VecAYPX_SeqCUSP(yin,beta,xin);
1437: } else if (b == 0.0) {
1438: VecCUSPGetArrayRead(xin,&xarray);
1439: VecCUSPGetArrayReadWrite(yin,&yarray);
1440: try {
1441: thrust::for_each(
1442: thrust::make_zip_iterator(
1443: thrust::make_tuple(
1444: yarray->begin(),
1445: thrust::make_constant_iterator(a),
1446: xarray->begin())),
1447: thrust::make_zip_iterator(
1448: thrust::make_tuple(
1449: yarray->end(),
1450: thrust::make_constant_iterator(a),
1451: xarray->end())),
1452: VecCUSPAX());
1453: } catch(char *ex) {
1454: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1455: }
1456: PetscLogFlops(xin->map->n);
1457: WaitForGPU();CHKERRCUSP(ierr);
1458: VecCUSPRestoreArrayRead(xin,&xarray);
1459: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1460: } else {
1461: VecCUSPGetArrayRead(xin,&xarray);
1462: VecCUSPGetArrayReadWrite(yin,&yarray);
1463: try {
1464: cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1465: } catch(char *ex) {
1466: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1467: }
1468: VecCUSPRestoreArrayRead(xin,&xarray);
1469: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1470: WaitForGPU();CHKERRCUSP(ierr);
1471: PetscLogFlops(3.0*xin->map->n);
1472: }
1473: return(0);
1474: }
1476: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1477: struct VecCUSPXPBYPCZ
1478: {
1479: /* z = x + b*y + c*z */
1480: template <typename Tuple>
1481: __host__ __device__
1482: void operator()(Tuple t)
1483: {
1484: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1485: }
1486: };
1487: struct VecCUSPAXPBYPZ
1488: {
1489: /* z = ax + b*y + z */
1490: template <typename Tuple>
1491: __host__ __device__
1492: void operator()(Tuple t)
1493: {
1494: thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1495: }
1496: };
1500: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1501: {
1503: PetscInt n = zin->map->n;
1504: CUSPARRAY *xarray,*yarray,*zarray;
1507: VecCUSPGetArrayRead(xin,&xarray);
1508: VecCUSPGetArrayRead(yin,&yarray);
1509: VecCUSPGetArrayReadWrite(zin,&zarray);
1510: if (alpha == 1.0) {
1511: try {
1512: thrust::for_each(
1513: thrust::make_zip_iterator(
1514: thrust::make_tuple(
1515: zarray->begin(),
1516: thrust::make_constant_iterator(gamma),
1517: xarray->begin(),
1518: yarray->begin(),
1519: thrust::make_constant_iterator(beta))),
1520: thrust::make_zip_iterator(
1521: thrust::make_tuple(
1522: zarray->end(),
1523: thrust::make_constant_iterator(gamma),
1524: xarray->end(),
1525: yarray->end(),
1526: thrust::make_constant_iterator(beta))),
1527: VecCUSPXPBYPCZ());
1528: } catch(char *ex) {
1529: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1530: }
1531: PetscLogFlops(4.0*n);
1532: } else if (gamma == 1.0) {
1533: try {
1534: thrust::for_each(
1535: thrust::make_zip_iterator(
1536: thrust::make_tuple(
1537: zarray->begin(),
1538: xarray->begin(),
1539: thrust::make_constant_iterator(alpha),
1540: yarray->begin(),
1541: thrust::make_constant_iterator(beta))),
1542: thrust::make_zip_iterator(
1543: thrust::make_tuple(
1544: zarray->end(),
1545: xarray->end(),
1546: thrust::make_constant_iterator(alpha),
1547: yarray->end(),
1548: thrust::make_constant_iterator(beta))),
1549: VecCUSPAXPBYPZ());
1550: } catch(char *ex) {
1551: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1552: }
1553: PetscLogFlops(4.0*n);
1554: } else {
1555: try {
1556: cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1557: } catch(char *ex) {
1558: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1559: }
1560: VecCUSPRestoreArrayReadWrite(zin,&zarray);
1561: VecCUSPRestoreArrayRead(xin,&xarray);
1562: VecCUSPRestoreArrayRead(yin,&yarray);
1563: PetscLogFlops(5.0*n);
1564: }
1565: WaitForGPU();CHKERRCUSP(ierr);
1566: return(0);
1567: }
1571: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1572: {
1574: PetscInt n = win->map->n;
1575: CUSPARRAY *xarray,*yarray,*warray;
1578: VecCUSPGetArrayRead(xin,&xarray);
1579: VecCUSPGetArrayRead(yin,&yarray);
1580: VecCUSPGetArrayReadWrite(win,&warray);
1581: try {
1582: cusp::blas::xmy(*xarray,*yarray,*warray);
1583: } catch(char *ex) {
1584: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1585: }
1586: VecCUSPRestoreArrayRead(xin,&xarray);
1587: VecCUSPRestoreArrayRead(yin,&yarray);
1588: VecCUSPRestoreArrayReadWrite(win,&warray);
1589: PetscLogFlops(n);
1590: WaitForGPU();CHKERRCUSP(ierr);
1591: return(0);
1592: }
1595: /* should do infinity norm in cusp */
1599: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1600: {
1601: const PetscScalar *xx;
1602: PetscErrorCode ierr;
1603: PetscInt n = xin->map->n;
1604: PetscBLASInt one = 1, bn;
1605: CUSPARRAY *xarray;
1608: PetscBLASIntCast(n,&bn);
1609: if (type == NORM_2 || type == NORM_FROBENIUS) {
1610: VecCUSPGetArrayRead(xin,&xarray);
1611: try {
1612: *z = cusp::blas::nrm2(*xarray);
1613: } catch(char *ex) {
1614: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1615: }
1616: WaitForGPU();CHKERRCUSP(ierr);
1617: VecCUSPRestoreArrayRead(xin,&xarray);
1618: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1619: } else if (type == NORM_INFINITY) {
1620: PetscInt i;
1621: PetscReal max = 0.0,tmp;
1623: VecGetArrayRead(xin,&xx);
1624: for (i=0; i<n; i++) {
1625: if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1626: /* check special case of tmp == NaN */
1627: if (tmp != tmp) {max = tmp; break;}
1628: xx++;
1629: }
1630: VecRestoreArrayRead(xin,&xx);
1631: *z = max;
1632: } else if (type == NORM_1) {
1633: VecCUSPGetArrayRead(xin,&xarray);
1634: #if defined(PETSC_USE_COMPLEX)
1635: #if defined(PETSC_USE_REAL_SINGLE)
1636: *z = cublasScasum(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one);
1637: #else
1638: *z = cublasDzasum(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one);
1639: #endif
1640: #else
1641: #if defined(PETSC_USE_REAL_SINGLE)
1642: *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1643: #else
1644: *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1645: #endif
1646: #endif
1647: cublasGetError();CHKERRCUSP(ierr);
1648: VecCUSPRestoreArrayRead(xin,&xarray);
1649: WaitForGPU();CHKERRCUSP(ierr);
1650: PetscLogFlops(PetscMax(n-1.0,0.0));
1651: } else if (type == NORM_1_AND_2) {
1652: VecNorm_SeqCUSP(xin,NORM_1,z);
1653: VecNorm_SeqCUSP(xin,NORM_2,z+1);
1654: }
1655: return(0);
1656: }
1659: /* the following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */
1663: PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
1664: {
1668: VecSetRandom_SeqCUSP_Private(xin,r);
1669: xin->valid_GPU_array = PETSC_CUSP_CPU;
1670: return(0);
1671: }
1675: PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
1676: {
1680: VecCUSPCopyFromGPU(vin);
1681: VecResetArray_SeqCUSP_Private(vin);
1682: vin->valid_GPU_array = PETSC_CUSP_CPU;
1683: return(0);
1684: }
1688: PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1689: {
1693: VecCUSPCopyFromGPU(vin);
1694: VecPlaceArray_Seq(vin,a);
1695: vin->valid_GPU_array = PETSC_CUSP_CPU;
1696: return(0);
1697: }
1702: PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1703: {
1707: VecCUSPCopyFromGPU(vin);
1708: VecReplaceArray_Seq(vin,a);
1709: vin->valid_GPU_array = PETSC_CUSP_CPU;
1710: return(0);
1711: }
1716: /*@
1717: VecCreateSeqCUSP - Creates a standard, sequential array-style vector.
1719: Collective on MPI_Comm
1721: Input Parameter:
1722: + comm - the communicator, should be PETSC_COMM_SELF
1723: - n - the vector length
1725: Output Parameter:
1726: . V - the vector
1728: Notes:
1729: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1730: same type as an existing vector.
1732: Level: intermediate
1734: Concepts: vectors^creating sequential
1736: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1737: @*/
1738: PetscErrorCode VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
1739: {
1743: VecCreate(comm,v);
1744: VecSetSizes(*v,n,n);
1745: VecSetType(*v,VECSEQCUSP);
1746: return(0);
1747: }
1749: /*The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written*/
1750: template <typename T>
1751: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1752: {
1753: __host__ __device__
1754: T operator()(T x)
1755: {
1756: #if defined(PETSC_USE_COMPLEX)
1757: //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1758: #else
1759: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1760: #endif
1761: }
1762: };
1764: template <typename T>
1765: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1766: {
1767: __host__ __device__
1768: T operator()(T x,T y)
1769: {
1770: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1771: }
1772: };
1776: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1777: {
1778: PetscErrorCode ierr;
1779: PetscScalar zero = 0.0;
1780: PetscReal n=s->map->n;
1781: thrust::tuple<PetscScalar,PetscScalar> result;
1782: CUSPARRAY *sarray,*tarray;
1785: /*VecCUSPCopyToGPU(s);
1786: VecCUSPCopyToGPU(t);*/
1787: VecCUSPGetArrayRead(s,&sarray);
1788: VecCUSPGetArrayRead(t,&tarray);
1789: try {
1790: #if defined(PETSC_USE_COMPLEX)
1791: VecDot_SeqCUSP(s,t,dp);
1792: VecDot_SeqCUSP(t,t,nm);
1793: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1794: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1795: #else
1796: result = thrust::transform_reduce(
1797: thrust::make_zip_iterator(
1798: thrust::make_tuple(
1799: sarray->begin(),
1800: tarray->begin())),
1801: thrust::make_zip_iterator(
1802: thrust::make_tuple(
1803: sarray->end(),
1804: tarray->end())),
1805: cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1806: thrust::make_tuple(zero,zero), /*init */
1807: cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1808: *dp = thrust::get<0>(result);
1809: *nm = thrust::get<1>(result);
1810: #endif
1811: } catch(char *ex) {
1812: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1813: }
1814: VecCUSPRestoreArrayRead(s,&sarray);
1815: VecCUSPRestoreArrayRead(t,&tarray);
1816: WaitForGPU();CHKERRCUSP(ierr);
1817: PetscLogFlops(4.0*n);
1818: return(0);
1819: }
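/* The transform_reduce above fuses two reductions into a single kernel: the
   unary functor maps each pair (s_i, t_i) to the tuple (s_i*t_i, t_i*t_i) and
   the binary functor adds tuples componentwise, yielding the dot product and
   ||t||^2 in one pass (real builds only).  Self-contained sketch with plain
   device vectors (values illustrative): */
#if 0
thrust::device_vector<PetscScalar> ss(4, 2.0), tt(4, 3.0);
thrust::tuple<PetscScalar,PetscScalar> r = thrust::transform_reduce(
  thrust::make_zip_iterator(thrust::make_tuple(ss.begin(), tt.begin())),
  thrust::make_zip_iterator(thrust::make_tuple(ss.end(),   tt.end())),
  cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
  thrust::make_tuple((PetscScalar)0.0,(PetscScalar)0.0),
  cuspdotnormreduce<thrust::tuple<PetscScalar,PetscScalar> >());
/* thrust::get<0>(r) == 24 (sum s_i*t_i); thrust::get<1>(r) == 36 (sum t_i^2) */
#endif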
1823: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1824: {
1828: VecCreateSeqCUSP(PetscObjectComm((PetscObject)win),win->map->n,V);
1829: PetscLayoutReference(win->map,&(*V)->map);
1830: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1831: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1832: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1833: return(0);
1834: }
1838: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1839: {
1841: Vec_Seq *s = (Vec_Seq*)v->data;
1842: cudaError_t err;
1844: try {
1845: if (v->spptr) {
1846: delete ((Vec_CUSP*)v->spptr)->GPUarray;
1847: err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);
1849: /* If the host array has been registered as (page-locked) mapped,
1850: one must unregister the buffer */
1851: if (((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked) {
1852: err = cudaHostUnregister(s->array);CHKERRCUSP(err);
1853: }
1854: delete (Vec_CUSP*) v->spptr;
1855: }
1856: } catch(char *ex) {
1857: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1858: }
1859: VecDestroy_SeqCUSP_Private(v);
1860: return(0);
1861: }
1864: #if defined(PETSC_USE_COMPLEX)
1865: struct conjugate
1866: {
1867: __host__ __device__
1868: PetscScalar operator()(PetscScalar x)
1869: {
1870: return cusp::conj(x);
1871: }
1872: };
1873: #endif
1878: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1879: {
1881: CUSPARRAY *xarray;
1884: VecCUSPGetArrayReadWrite(xin,&xarray);
1885: #if defined(PETSC_USE_COMPLEX)
1886: thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1887: #endif
1888: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1889: return(0);
1890: }
1894: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUSP(Vec V)
1895: {
1897: PetscMPIInt size;
1900: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
1901: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
1902: VecCreate_Seq_Private(V,0);
1903: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);
1905: V->ops->dot = VecDot_SeqCUSP;
1906: V->ops->norm = VecNorm_SeqCUSP;
1907: V->ops->tdot = VecTDot_SeqCUSP;
1908: V->ops->scale = VecScale_SeqCUSP;
1909: V->ops->copy = VecCopy_SeqCUSP;
1910: V->ops->set = VecSet_SeqCUSP;
1911: V->ops->swap = VecSwap_SeqCUSP;
1912: V->ops->axpy = VecAXPY_SeqCUSP;
1913: V->ops->axpby = VecAXPBY_SeqCUSP;
1914: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUSP;
1915: V->ops->pointwisemult = VecPointwiseMult_SeqCUSP;
1916: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
1917: V->ops->setrandom = VecSetRandom_SeqCUSP;
1918: V->ops->dot_local = VecDot_SeqCUSP;
1919: V->ops->tdot_local = VecTDot_SeqCUSP;
1920: V->ops->norm_local = VecNorm_SeqCUSP;
1921: V->ops->mdot_local = VecMDot_SeqCUSP;
1922: V->ops->maxpy = VecMAXPY_SeqCUSP;
1923: V->ops->mdot = VecMDot_SeqCUSP;
1924: V->ops->aypx = VecAYPX_SeqCUSP;
1925: V->ops->waxpy = VecWAXPY_SeqCUSP;
1926: V->ops->dotnorm2 = VecDotNorm2_SeqCUSP;
1927: V->ops->placearray = VecPlaceArray_SeqCUSP;
1928: V->ops->replacearray = VecReplaceArray_SeqCUSP;
1929: V->ops->resetarray = VecResetArray_SeqCUSP;
1930: V->ops->destroy = VecDestroy_SeqCUSP;
1931: V->ops->duplicate = VecDuplicate_SeqCUSP;
1932: V->ops->conjugate = VecConjugate_SeqCUSP;
1934: VecCUSPAllocateCheck(V);
1935: V->valid_GPU_array = PETSC_CUSP_GPU;
1936: VecSet(V,0.0);
1937: return(0);
1938: }
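/* The block above is the standard PETSc type-registration pattern: VECSEQCUSP
   keeps the Vec_Seq data layout and overrides the function-pointer table, so
   the public API dispatches to the GPU implementations.  A minimal sketch of
   what a caller sees (error checking elided): */
#if 0
Vec       x;
PetscReal nrm;
VecCreate(PETSC_COMM_SELF,&x);
VecSetSizes(x,100,100);
VecSetType(x,VECSEQCUSP);
VecNorm(x,NORM_2,&nrm);   /* resolves through x->ops->norm to VecNorm_SeqCUSP */
VecDestroy(&x);
#endif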
1942: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1943: {
1947: *a = 0;
1948: VecCUSPCopyToGPU(v);
1949: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1950: return(0);
1951: }
1955: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
1956: {
1960: v->valid_GPU_array = PETSC_CUSP_GPU;
1962: PetscObjectStateIncrease((PetscObject)v);
1963: return(0);
1964: }
1968: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
1969: {
1973: *a = 0;
1974: VecCUSPCopyToGPU(v);
1975: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1976: return(0);
1977: }
1981: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
1982: {
1984: return(0);
1985: }
1989: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
1990: {
1994: *a = 0;
1995: VecCUSPAllocateCheck(v);
1996: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1997: return(0);
1998: }
2002: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
2003: {
2007: v->valid_GPU_array = PETSC_CUSP_GPU;
2009: PetscObjectStateIncrease((PetscObject)v);
2010: return(0);
2011: }
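/* Access semantics of the accessors above, in brief: the Get variants hand
   back the raw CUSPARRAY (copying host data to the GPU first, except for the
   Write variant, which only allocates), the ReadWrite/Write Restore variants
   mark the GPU as holding the only valid copy and bump the object state, and
   the Read Restore variant is deliberately a no-op.  Typical paired usage
   (a sketch, not additional API): */
#if 0
CUSPARRAY *a;
VecCUSPGetArrayReadWrite(v,&a);      /* host->device copy if the CPU was ahead */
cusp::blas::scal(*a,2.0);            /* mutate directly on the GPU             */
VecCUSPRestoreArrayReadWrite(v,&a);  /* GPU now holds the sole valid copy      */
#endif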