Actual source code: veccusp.cu
petsc-dev 2014-02-02
1: /*
2: Implements the sequential cusp vectors.
3: */
5: #include <petscconf.h>
6: PETSC_CUDA_EXTERN_C_BEGIN
7: #include <petsc-private/vecimpl.h> /*I "petscvec.h" I*/
8: #include <../src/vec/vec/impls/dvecimpl.h>
9: PETSC_CUDA_EXTERN_C_END
10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
12: #include <cuda_runtime.h>
16: /*
17: Allocates space for the vector array on the Host if it does not exist.
18: Does NOT change the PetscCUSPFlag for the vector
19: Does NOT zero the CUSP array
20: */
21: PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
22: {
24: cudaError_t err;
25: PetscScalar *array;
26: Vec_Seq *s;
27: PetscInt n = v->map->n;
30: s = (Vec_Seq*)v->data;
31: VecCUSPAllocateCheck(v);
32: if (s->array == 0) {
33: PetscMalloc1(n,&array);
34: PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
35: s->array = array;
36: s->array_allocated = array;
37: err = cudaHostRegister(s->array, n*sizeof(PetscScalar),cudaHostRegisterMapped);CHKERRCUSP(err);
38: ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_TRUE;
39: }
40: return(0);
41: }
46: /*
47: Allocates space for the vector array on the GPU if it does not exist.
48: Does NOT change the PetscCUSPFlag for the vector
49: Does NOT zero the CUSP array
51: */
52: PetscErrorCode VecCUSPAllocateCheck(Vec v)
53: {
54: cudaError_t err;
55: cudaStream_t stream;
56: Vec_Seq *s = (Vec_Seq*)v->data;
59: // First allocate memory on the GPU if needed
60: if (!v->spptr) {
61: try {
62: v->spptr = new Vec_CUSP;
63: ((Vec_CUSP*)v->spptr)->GPUarray = new CUSPARRAY;
64: ((Vec_CUSP*)v->spptr)->GPUarray->resize((PetscBLASInt)v->map->n);
65: err = cudaStreamCreate(&stream);CHKERRCUSP(err);
66: ((Vec_CUSP*)v->spptr)->stream = stream;
68: ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_FALSE;
69: /* If the array is already allocated, one can register it as (page-locked) mapped.
70:    This can substantially accelerate data transfer across the PCI Express bus */
71: if (s->array) {
72: err = cudaHostRegister(s->array, v->map->n*sizeof(PetscScalar),cudaHostRegisterMapped);CHKERRCUSP(err);
73: ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_TRUE;
74: }
75: v->ops->destroy = VecDestroy_SeqCUSP;
76: } catch(char *ex) {
77: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
78: }
79: }
80: return(0);
81: }
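
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   A minimal standalone sketch of the pattern used above: register an existing
   malloc'd host buffer as page-locked with cudaHostRegister() so that
   cudaMemcpyAsync() can move it across PCI Express efficiently, then undo the
   registration before freeing, as VecDestroy_SeqCUSP() does. All names here
   are local to the example; error handling is reduced to early returns. */
#include <cstdlib>
#include <cuda_runtime.h>

static int ExampleRegisterAndCopy(size_t n)
{
  double       *host = (double*)malloc(n*sizeof(double));
  double       *dev  = 0;
  cudaStream_t stream;

  if (!host) return 1;
  if (cudaHostRegister(host,n*sizeof(double),cudaHostRegisterMapped) != cudaSuccess) return 1;
  if (cudaMalloc((void**)&dev,n*sizeof(double)) != cudaSuccess) return 1;
  if (cudaStreamCreate(&stream) != cudaSuccess) return 1;

  /* asynchronous H2D copies are only truly asynchronous from pinned memory */
  cudaMemcpyAsync(dev,host,n*sizeof(double),cudaMemcpyHostToDevice,stream);
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
  cudaFree(dev);
  cudaHostUnregister(host); /* must unregister before free() */
  free(host);
  return 0;
}
/* --- end of editorial example --- */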
86: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
87: PetscErrorCode VecCUSPCopyToGPU(Vec v)
88: {
90: cudaError_t err;
91: Vec_CUSP *veccusp;
92: CUSPARRAY *varray;
93: cudaStream_t stream;
96: VecCUSPAllocateCheck(v);
97: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
98: PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
99: try {
100: veccusp=(Vec_CUSP*)v->spptr;
101: varray=veccusp->GPUarray;
102: stream=veccusp->stream;
103: err = cudaMemcpyAsync(varray->data().get(), *(PetscScalar**)v->data, v->map->n*sizeof(PetscScalar),
104: cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
105: err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
106: } catch(char *ex) {
107: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
108: }
109: PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
110: v->valid_GPU_array = PETSC_CUSP_BOTH;
111: }
112: return(0);
113: }
117: static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
118: {
119: CUSPARRAY *varray;
121: cudaError_t err;
122: PetscScalar *cpuPtr, *gpuPtr;
123: cudaStream_t stream;
124: Vec_Seq *s;
125: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
128: VecCUSPAllocateCheck(v);
129: if (v->valid_GPU_array == PETSC_CUSP_CPU) {
130: stream=((Vec_CUSP*)v->spptr)->stream;
131: s = (Vec_Seq*)v->data;
133: PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
134: varray = ((Vec_CUSP*)v->spptr)->GPUarray;
135: gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
136: cpuPtr = s->array + ptop_scatter->recvLowestIndex;
138: /* Note: this code copies the smallest contiguous chunk of data
139: containing ALL of the indices */
140: err = cudaMemcpyAsync(gpuPtr, cpuPtr, ptop_scatter->nr*sizeof(PetscScalar),
141: cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
142: err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
144: #if 0
145: Vec_Seq *s;
146: s = (Vec_Seq*)v->data;
148: CUSPINTARRAYCPU *indicesCPU=&ci->recvIndicesCPU;
149: CUSPINTARRAYGPU *indicesGPU=&ci->recvIndicesGPU;
151: thrust::copy(thrust::make_permutation_iterator(s->array,indicesCPU->begin()),
152: thrust::make_permutation_iterator(s->array,indicesCPU->end()),
153: thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()));
154: #endif
155: // Set the buffer states
156: v->valid_GPU_array = PETSC_CUSP_BOTH;
157: PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
158: }
159: return(0);
160: }
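
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   The "smallest contiguous chunk" strategy above trades a few extra bytes on
   the wire for a single cudaMemcpyAsync() call. This hypothetical helper shows
   how an offset/length pair in the spirit of ptop_scatter->recvLowestIndex and
   ptop_scatter->nr could be derived from an index list. */
static void ExampleContiguousChunk(const int *idx,int nidx,int *lowest,int *count)
{
  int i,lo = idx[0],hi = idx[0];
  for (i=1; i<nidx; i++) {
    if (idx[i] < lo) lo = idx[i];
    if (idx[i] > hi) hi = idx[i];
  }
  *lowest = lo;          /* first entry of the chunk */
  *count  = hi - lo + 1; /* one copy of this many entries covers every index */
}
/* --- end of editorial example --- */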
165: /*
166: VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
167: */
168: PetscErrorCode VecCUSPCopyFromGPU(Vec v)
169: {
171: cudaError_t err;
172: Vec_CUSP *veccusp;
173: CUSPARRAY *varray;
174: cudaStream_t stream;
177: VecCUSPAllocateCheckHost(v);
178: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
179: PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
180: try {
181: veccusp=(Vec_CUSP*)v->spptr;
182: varray=veccusp->GPUarray;
183: stream=veccusp->stream;
185: err = cudaMemcpyAsync(*(PetscScalar**)v->data, varray->data().get(), v->map->n*sizeof(PetscScalar),
186: cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
187: err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
188: } catch(char *ex) {
189: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
190: }
191: PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
192: v->valid_GPU_array = PETSC_CUSP_BOTH;
193: }
194: return(0);
195: }
199: /* Note that this function only copies *some* of the values up from the GPU to the CPU,
200: which means that we need to recombine the data at some point before using any of the standard functions.
201: We could add another few flag-types to keep track of this, or treat these like VecGetArray()/VecRestoreArray(),
202: which must always be called in pairs
203: */
204: PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
205: {
206: CUSPARRAY *varray;
208: cudaError_t err;
209: PetscScalar *cpuPtr, *gpuPtr;
210: cudaStream_t stream;
211: Vec_Seq *s;
212: VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;
215: VecCUSPAllocateCheckHost(v);
216: if (v->valid_GPU_array == PETSC_CUSP_GPU) {
217: PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);
219: stream=((Vec_CUSP*)v->spptr)->stream;
220: varray=((Vec_CUSP*)v->spptr)->GPUarray;
221: s = (Vec_Seq*)v->data;
222: gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
223: cpuPtr = s->array + ptop_scatter->sendLowestIndex;
225: /* Note: this code copies the smallest contiguous chunk of data
226: containing ALL of the indices */
227: err = cudaMemcpyAsync(cpuPtr, gpuPtr, ptop_scatter->ns*sizeof(PetscScalar),
228: cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
229: err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
231: #if 0
232: Vec_Seq *s;
233: s = (Vec_Seq*)v->data;
234: CUSPINTARRAYCPU *indicesCPU=&ci->sendIndicesCPU;
235: CUSPINTARRAYGPU *indicesGPU=&ci->sendIndicesGPU;
237: thrust::copy(thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
238: thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
239: thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
240: #endif
241: VecCUSPRestoreArrayRead(v,&varray);
242: PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
243: v->valid_GPU_array = PETSC_CUSP_BOTH;
244: }
245: return(0);
246: }
250: static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
251: {
252: PetscScalar *ya;
253: const PetscScalar *xa;
254: PetscErrorCode ierr;
257: VecCUSPAllocateCheckHost(xin);
258: VecCUSPAllocateCheckHost(yin);
259: if (xin != yin) {
260: VecGetArrayRead(xin,&xa);
261: VecGetArray(yin,&ya);
262: PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
263: VecRestoreArrayRead(xin,&xa);
264: VecRestoreArray(yin,&ya);
265: }
266: return(0);
267: }
271: static PetscErrorCode VecSetRandom_SeqCUSP_Private(Vec xin,PetscRandom r)
272: {
274: PetscInt n = xin->map->n,i;
275: PetscScalar *xx;
278: VecGetArray(xin,&xx);
279: for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
280: VecRestoreArray(xin,&xx);
281: return(0);
282: }
286: static PetscErrorCode VecDestroy_SeqCUSP_Private(Vec v)
287: {
288: Vec_Seq *vs = (Vec_Seq*)v->data;
292: PetscObjectSAWsViewOff(v);
293: #if defined(PETSC_USE_LOG)
294: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
295: #endif
296: if (vs->array_allocated) PetscFree(vs->array_allocated);
297: PetscFree(vs);
298: return(0);
299: }
303: static PetscErrorCode VecResetArray_SeqCUSP_Private(Vec vin)
304: {
305: Vec_Seq *v = (Vec_Seq*)vin->data;
308: v->array = v->unplacedarray;
309: v->unplacedarray = 0;
310: return(0);
311: }
313: /* The following three public versions are necessary because we use CUSP in the regular PETSc code and these routines need to be callable from plain C code. */
316: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
317: {
321: VecCUSPAllocateCheck(v);
322: return(0);
323: }
327: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
328: {
332: VecCUSPCopyToGPU(v);
333: return(0);
334: }
340: /*
341: VecCUSPCopyToGPUSome_Public - Copies certain entries of a vector down from the CPU to the GPU
343: Input Parameters:
344: + v - the vector
345: - ci - the requested indices; these should be created with CUSPIndicesCreate()
347: */
348: PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
349: {
353: VecCUSPCopyToGPUSome(v,ci);
354: return(0);
355: }
359: /*
360: VecCUSPCopyFromGPUSome_Public - Copies certain entries of a vector up from the GPU to the CPU
362: Input Parameters:
363: + v - the vector
364: - ci - the requested indices; these should be created with CUSPIndicesCreate()
365: */
366: PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
367: {
371: VecCUSPCopyFromGPUSome(v,ci);
372: return(0);
373: }
375: /*MC
376: VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP
378: Options Database Keys:
379: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()
381: Level: beginner
383: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
384: M*/
386: /* for VecAYPX_SeqCUSP*/
387: namespace cusp
388: {
389: namespace blas
390: {
391: namespace detail
392: {
393: template <typename T>
394: struct AYPX : public thrust::binary_function<T,T,T>
395: {
396: T alpha;
398: AYPX(T _alpha) : alpha(_alpha) {}
400: __host__ __device__
401: T operator()(T x, T y)
402: {
403: return alpha * y + x;
404: }
405: };
406: }
408: template <typename ForwardIterator1,
409: typename ForwardIterator2,
410: typename ScalarType>
411: void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
412: {
413: thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
414: }
415: template <typename Array1, typename Array2, typename ScalarType>
416: void aypx(const Array1& x, Array2& y, ScalarType alpha)
417: {
418: detail::assert_same_dimensions(x,y);
419: aypx(x.begin(),x.end(),y.begin(),alpha);
420: }
421: }
422: }
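
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   The aypx() extension above boils down to a single thrust::transform() with a
   stateful binary functor. A self-contained sketch of the same idea, assuming
   only Thrust: */
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct ExampleAYPX
{
  double alpha;
  ExampleAYPX(double a) : alpha(a) {}
  __host__ __device__
  double operator()(double x,double y) const { return alpha*y + x; }
};

static void ExampleAypx(double alpha)
{
  thrust::device_vector<double> x(4,1.0),y(4,2.0);
  /* y <- alpha*y + x elementwise, writing the result back into y */
  thrust::transform(x.begin(),x.end(),y.begin(),y.begin(),ExampleAYPX(alpha));
}
/* --- end of editorial example --- */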
426: PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
427: {
428: CUSPARRAY *xarray,*yarray;
432: if (alpha != 0.0) {
433: VecCUSPGetArrayRead(xin,&xarray);
434: VecCUSPGetArrayReadWrite(yin,&yarray);
435: try {
436: cusp::blas::aypx(*xarray,*yarray,alpha);
437: WaitForGPU();CHKERRCUSP(ierr);
438: } catch(char *ex) {
439: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
440: }
441: VecCUSPRestoreArrayRead(xin,&xarray);
442: VecCUSPRestoreArrayReadWrite(yin,&yarray);
443: PetscLogFlops(2.0*yin->map->n);
444: }
445: return(0);
446: }
451: PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
452: {
453: CUSPARRAY *xarray,*yarray;
457: if (alpha != 0.0) {
458: VecCUSPGetArrayRead(xin,&xarray);
459: VecCUSPGetArrayReadWrite(yin,&yarray);
460: try {
461: cusp::blas::axpy(*xarray,*yarray,alpha);
462: WaitForGPU();CHKERRCUSP(ierr);
463: } catch(char *ex) {
464: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
465: }
466: VecCUSPRestoreArrayRead(xin,&xarray);
467: VecCUSPRestoreArrayReadWrite(yin,&yarray);
468: PetscLogFlops(2.0*yin->map->n);
469: }
470: return(0);
471: }
473: struct VecCUSPPointwiseDivide
474: {
475: template <typename Tuple>
476: __host__ __device__
477: void operator()(Tuple t)
478: {
479: thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
480: }
481: };
485: PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
486: {
487: CUSPARRAY *warray=NULL,*xarray=NULL,*yarray=NULL;
491: VecCUSPGetArrayRead(xin,&xarray);
492: VecCUSPGetArrayRead(yin,&yarray);
493: VecCUSPGetArrayWrite(win,&warray);
494: try {
495: thrust::for_each(
496: thrust::make_zip_iterator(
497: thrust::make_tuple(
498: warray->begin(),
499: xarray->begin(),
500: yarray->begin())),
501: thrust::make_zip_iterator(
502: thrust::make_tuple(
503: warray->end(),
504: xarray->end(),
505: yarray->end())),
506: VecCUSPPointwiseDivide());
507: WaitForGPU();CHKERRCUSP(ierr);
508: } catch(char *ex) {
509: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
510: }
511: PetscLogFlops(win->map->n);
512: VecCUSPRestoreArrayRead(xin,&xarray);
513: VecCUSPRestoreArrayRead(yin,&yarray);
514: VecCUSPRestoreArrayWrite(win,&warray);
515: return(0);
516: }
519: struct VecCUSPWAXPY
520: {
521: template <typename Tuple>
522: __host__ __device__
523: void operator()(Tuple t)
524: {
525: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
526: }
527: };
529: struct VecCUSPSum
530: {
531: template <typename Tuple>
532: __host__ __device__
533: void operator()(Tuple t)
534: {
535: thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
536: }
537: };
539: struct VecCUSPDiff
540: {
541: template <typename Tuple>
542: __host__ __device__
543: void operator()(Tuple t)
544: {
545: thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
546: }
547: };
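
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   All of the functors above follow one pattern: zip the output range together
   with the input ranges so each tuple holds one element from every vector,
   then apply the functor with thrust::for_each(). A minimal standalone version
   of the VecCUSPSum case (w = x + y): */
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

struct ExampleSum
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t) { thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t); }
};

static void ExampleZipSum(void)
{
  thrust::device_vector<double> w(4),x(4,1.0),y(4,2.0);
  thrust::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(w.begin(),x.begin(),y.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(w.end(),x.end(),y.end())),
    ExampleSum()); /* w[i] = x[i] + y[i] */
}
/* --- end of editorial example --- */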
551: PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
552: {
553: CUSPARRAY *xarray=NULL,*yarray=NULL,*warray=NULL;
557: if (alpha == 0.0) {
558: VecCopy_SeqCUSP(yin,win);
559: } else {
560: VecCUSPGetArrayRead(xin,&xarray);
561: VecCUSPGetArrayRead(yin,&yarray);
562: VecCUSPGetArrayWrite(win,&warray);
563: if (alpha == 1.0) {
564: try {
565: thrust::for_each(
566: thrust::make_zip_iterator(
567: thrust::make_tuple(
568: warray->begin(),
569: yarray->begin(),
570: xarray->begin())),
571: thrust::make_zip_iterator(
572: thrust::make_tuple(
573: warray->end(),
574: yarray->end(),
575: xarray->end())),
576: VecCUSPSum());
577: } catch(char *ex) {
578: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
579: }
580: PetscLogFlops(win->map->n);
581: } else if (alpha == -1.0) {
582: try {
583: thrust::for_each(
584: thrust::make_zip_iterator(
585: thrust::make_tuple(
586: warray->begin(),
587: yarray->begin(),
588: xarray->begin())),
589: thrust::make_zip_iterator(
590: thrust::make_tuple(
591: warray->end(),
592: yarray->end(),
593: xarray->end())),
594: VecCUSPDiff());
595: } catch(char *ex) {
596: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
597: }
598: PetscLogFlops(win->map->n);
599: } else {
600: try {
601: thrust::for_each(
602: thrust::make_zip_iterator(
603: thrust::make_tuple(
604: warray->begin(),
605: yarray->begin(),
606: thrust::make_constant_iterator(alpha),
607: xarray->begin())),
608: thrust::make_zip_iterator(
609: thrust::make_tuple(
610: warray->end(),
611: yarray->end(),
612: thrust::make_constant_iterator(alpha),
613: xarray->end())),
614: VecCUSPWAXPY());
615: } catch(char *ex) {
616: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
617: }
618: PetscLogFlops(2*win->map->n);
619: }
620: WaitForGPU();CHKERRCUSP(ierr);
621: VecCUSPRestoreArrayRead(xin,&xarray);
622: VecCUSPRestoreArrayRead(yin,&yarray);
623: VecCUSPRestoreArrayWrite(win,&warray);
624: }
625: return(0);
626: }
628: /* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
629: struct VecCUSPMAXPY4
630: {
631: template <typename Tuple>
632: __host__ __device__
633: void operator()(Tuple t)
634: {
635: /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
636: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
637: }
638: };
641: struct VecCUSPMAXPY3
642: {
643: template <typename Tuple>
644: __host__ __device__
645: void operator()(Tuple t)
646: {
647: /*y += a1*x1 +a2*x2 + a3*x3 */
648: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
649: }
650: };
652: struct VecCUSPMAXPY2
653: {
654: template <typename Tuple>
655: __host__ __device__
656: void operator()(Tuple t)
657: {
658: /*y += a1*x1 +a2*x2*/
659: thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
660: }
661: };
664: PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
665: {
667: CUSPARRAY *xarray,*yy0,*yy1,*yy2,*yy3;
668: PetscInt n = xin->map->n,j,j_rem;
669: PetscScalar alpha0,alpha1,alpha2,alpha3;
672: PetscLogFlops(nv*2.0*n);
673: VecCUSPGetArrayReadWrite(xin,&xarray);
674: switch (j_rem=nv&0x3) {
675: case 3:
676: alpha0 = alpha[0];
677: alpha1 = alpha[1];
678: alpha2 = alpha[2];
679: alpha += 3;
680: VecCUSPGetArrayRead(y[0],&yy0);
681: VecCUSPGetArrayRead(y[1],&yy1);
682: VecCUSPGetArrayRead(y[2],&yy2);
683: try {
684: thrust::for_each(
685: thrust::make_zip_iterator(
686: thrust::make_tuple(
687: xarray->begin(),
688: thrust::make_constant_iterator(alpha0),
689: yy0->begin(),
690: thrust::make_constant_iterator(alpha1),
691: yy1->begin(),
692: thrust::make_constant_iterator(alpha2),
693: yy2->begin())),
694: thrust::make_zip_iterator(
695: thrust::make_tuple(
696: xarray->end(),
697: thrust::make_constant_iterator(alpha0),
698: yy0->end(),
699: thrust::make_constant_iterator(alpha1),
700: yy1->end(),
701: thrust::make_constant_iterator(alpha2),
702: yy2->end())),
703: VecCUSPMAXPY3());
704: } catch(char *ex) {
705: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
706: }
707: VecCUSPRestoreArrayRead(y[0],&yy0);
708: VecCUSPRestoreArrayRead(y[1],&yy1);
709: VecCUSPRestoreArrayRead(y[2],&yy2);
710: y += 3;
711: break;
712: case 2:
713: alpha0 = alpha[0];
714: alpha1 = alpha[1];
715: alpha +=2;
716: VecCUSPGetArrayRead(y[0],&yy0);
717: VecCUSPGetArrayRead(y[1],&yy1);
718: try {
719: thrust::for_each(
720: thrust::make_zip_iterator(
721: thrust::make_tuple(
722: xarray->begin(),
723: thrust::make_constant_iterator(alpha0),
724: yy0->begin(),
725: thrust::make_constant_iterator(alpha1),
726: yy1->begin())),
727: thrust::make_zip_iterator(
728: thrust::make_tuple(
729: xarray->end(),
730: thrust::make_constant_iterator(alpha0),
731: yy0->end(),
732: thrust::make_constant_iterator(alpha1),
733: yy1->end())),
734: VecCUSPMAXPY2());
735: } catch(char *ex) {
736: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
737: }
738: y +=2;
739: break;
740: case 1:
741: alpha0 = *alpha++;
742: VecAXPY_SeqCUSP(xin,alpha0,y[0]);
743: y +=1;
744: break;
745: }
746: for (j=j_rem; j<nv; j+=4) {
747: alpha0 = alpha[0];
748: alpha1 = alpha[1];
749: alpha2 = alpha[2];
750: alpha3 = alpha[3];
751: alpha += 4;
752: VecCUSPGetArrayRead(y[0],&yy0);
753: VecCUSPGetArrayRead(y[1],&yy1);
754: VecCUSPGetArrayRead(y[2],&yy2);
755: VecCUSPGetArrayRead(y[3],&yy3);
756: try {
757: thrust::for_each(
758: thrust::make_zip_iterator(
759: thrust::make_tuple(
760: xarray->begin(),
761: thrust::make_constant_iterator(alpha0),
762: yy0->begin(),
763: thrust::make_constant_iterator(alpha1),
764: yy1->begin(),
765: thrust::make_constant_iterator(alpha2),
766: yy2->begin(),
767: thrust::make_constant_iterator(alpha3),
768: yy3->begin())),
769: thrust::make_zip_iterator(
770: thrust::make_tuple(
771: xarray->end(),
772: thrust::make_constant_iterator(alpha0),
773: yy0->end(),
774: thrust::make_constant_iterator(alpha1),
775: yy1->end(),
776: thrust::make_constant_iterator(alpha2),
777: yy2->end(),
778: thrust::make_constant_iterator(alpha3),
779: yy3->end())),
780: VecCUSPMAXPY4());
781: } catch(char *ex) {
782: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
783: }
784: VecCUSPRestoreArrayRead(y[0],&yy0);
785: VecCUSPRestoreArrayRead(y[1],&yy1);
786: VecCUSPRestoreArrayRead(y[2],&yy2);
787: VecCUSPRestoreArrayRead(y[3],&yy3);
788: y += 4;
789: }
790: VecCUSPRestoreArrayReadWrite(xin,&xarray);
791: WaitForGPU();CHKERRCUSP(ierr);
792: return(0);
793: }
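
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   VecMAXPY_SeqCUSP above uses the classic unrolling idiom: the switch disposes
   of the nv mod 4 leftover vectors first, then the main loop consumes the rest
   in groups of four. A scalar CPU sketch of the same control flow: */
static void ExampleMaxpyUnrolled(double *x,int n,int nv,const double *alpha,double **y)
{
  int i,j,j_rem;
  switch (j_rem = nv & 0x3) {          /* nv mod 4 */
  case 3:
    for (i=0; i<n; i++) x[i] += alpha[0]*y[0][i] + alpha[1]*y[1][i] + alpha[2]*y[2][i];
    alpha += 3; y += 3; break;
  case 2:
    for (i=0; i<n; i++) x[i] += alpha[0]*y[0][i] + alpha[1]*y[1][i];
    alpha += 2; y += 2; break;
  case 1:
    for (i=0; i<n; i++) x[i] += alpha[0]*y[0][i];
    alpha += 1; y += 1; break;
  }
  for (j=j_rem; j<nv; j+=4) {          /* remaining vectors, unrolled by four */
    for (i=0; i<n; i++) x[i] += alpha[0]*y[0][i] + alpha[1]*y[1][i] + alpha[2]*y[2][i] + alpha[3]*y[3][i];
    alpha += 4; y += 4;
  }
}
/* --- end of editorial example --- */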
798: PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
799: {
800: CUSPARRAY *xarray,*yarray;
802: // PetscScalar *xptr,*yptr,*zgpu;
803: //PetscReal tmp;
806: //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
807: //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
808: VecCUSPGetArrayRead(xin,&xarray);
809: VecCUSPGetArrayRead(yin,&yarray);
810: try {
811: #if defined(PETSC_USE_COMPLEX)
812: *z = cusp::blas::dotc(*yarray,*xarray);
813: #else
814: *z = cusp::blas::dot(*yarray,*xarray);
815: #endif
816: } catch(char *ex) {
817: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
818: }
819: WaitForGPU();CHKERRCUSP(ierr);
820: if (xin->map->n >0) {
821: PetscLogFlops(2.0*xin->map->n-1);
822: }
823: VecCUSPRestoreArrayRead(xin,&xarray);
824: VecCUSPRestoreArrayRead(yin,&yarray);
825: return(0);
826: }
828: //
829: // CUDA kernels for MDot to follow
830: //
832: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
833: #define MDOT_WORKGROUP_SIZE 128
834: #define MDOT_WORKGROUP_NUM 128
836: // M = 2:
837: __global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
838: PetscInt size, PetscScalar *group_results)
839: {
840: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
841: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
842: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
843: PetscInt vec_start_index = blockIdx.x * entries_per_group;
844: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
846: PetscScalar entry_x = 0;
847: PetscScalar group_sum0 = 0;
848: PetscScalar group_sum1 = 0;
849: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
850: entry_x = x[i]; // load only once from global memory!
851: group_sum0 += entry_x * y0[i];
852: group_sum1 += entry_x * y1[i];
853: }
854: tmp_buffer[threadIdx.x] = group_sum0;
855: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
857: // parallel reduction
858: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
859: __syncthreads();
860: if (threadIdx.x < stride) {
861: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
862: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
863: }
864: }
866: // write result of group to group_results
867: if (threadIdx.x == 0) {
868: group_results[blockIdx.x] = tmp_buffer[0];
869: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
870: }
871: }
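
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   A hypothetical host-side driver for the kernel above, showing the two-stage
   reduction that VecMDot_SeqCUSP uses below: the kernel leaves one partial sum
   per block and per dot product in group_results, and the final sums are
   accumulated on the CPU. x, y0 and y1 are device pointers; z is a host array
   of length 2. Error checking is omitted for brevity. */
static void ExampleMDot2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,PetscInt n,PetscScalar z[2])
{
  PetscScalar *group_results_gpu;
  PetscScalar group_results_cpu[2*MDOT_WORKGROUP_NUM];

  cudaMalloc((void**)&group_results_gpu,sizeof(PetscScalar)*2*MDOT_WORKGROUP_NUM);
  VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(x,y0,y1,n,group_results_gpu);
  cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar)*2*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);
  for (PetscInt j=0; j<2; ++j) {
    z[j] = 0;
    for (PetscInt i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
  }
  cudaFree(group_results_gpu);
}
/* --- end of editorial example --- */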
873: // M = 3:
874: __global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
875: PetscInt size, PetscScalar *group_results)
876: {
877: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
878: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
879: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
880: PetscInt vec_start_index = blockIdx.x * entries_per_group;
881: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
883: PetscScalar entry_x = 0;
884: PetscScalar group_sum0 = 0;
885: PetscScalar group_sum1 = 0;
886: PetscScalar group_sum2 = 0;
887: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
888: entry_x = x[i]; // load only once from global memory!
889: group_sum0 += entry_x * y0[i];
890: group_sum1 += entry_x * y1[i];
891: group_sum2 += entry_x * y2[i];
892: }
893: tmp_buffer[threadIdx.x] = group_sum0;
894: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
895: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
897: // parallel reduction
898: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
899: __syncthreads();
900: if (threadIdx.x < stride) {
901: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
902: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
903: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
904: }
905: }
907: // write result of group to group_results
908: if (threadIdx.x == 0) {
909: group_results[blockIdx.x ] = tmp_buffer[0];
910: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
911: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
912: }
913: }
915: // M = 4:
916: __global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
917: PetscInt size, PetscScalar *group_results)
918: {
919: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
920: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
921: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
922: PetscInt vec_start_index = blockIdx.x * entries_per_group;
923: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
925: PetscScalar entry_x = 0;
926: PetscScalar group_sum0 = 0;
927: PetscScalar group_sum1 = 0;
928: PetscScalar group_sum2 = 0;
929: PetscScalar group_sum3 = 0;
930: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
931: entry_x = x[i]; // load only once from global memory!
932: group_sum0 += entry_x * y0[i];
933: group_sum1 += entry_x * y1[i];
934: group_sum2 += entry_x * y2[i];
935: group_sum3 += entry_x * y3[i];
936: }
937: tmp_buffer[threadIdx.x] = group_sum0;
938: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
939: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
940: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
942: // parallel reduction
943: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
944: __syncthreads();
945: if (threadIdx.x < stride) {
946: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
947: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
948: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
949: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
950: }
951: }
953: // write result of group to group_results
954: if (threadIdx.x == 0) {
955: group_results[blockIdx.x ] = tmp_buffer[0];
956: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
957: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
958: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
959: }
960: }
962: // M = 8:
963: __global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
964: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
965: PetscInt size, PetscScalar *group_results)
966: {
967: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
968: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
969: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
970: PetscInt vec_start_index = blockIdx.x * entries_per_group;
971: PetscInt vec_stop_index = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
973: PetscScalar entry_x = 0;
974: PetscScalar group_sum0 = 0;
975: PetscScalar group_sum1 = 0;
976: PetscScalar group_sum2 = 0;
977: PetscScalar group_sum3 = 0;
978: PetscScalar group_sum4 = 0;
979: PetscScalar group_sum5 = 0;
980: PetscScalar group_sum6 = 0;
981: PetscScalar group_sum7 = 0;
982: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
983: entry_x = x[i]; // load only once from global memory!
984: group_sum0 += entry_x * y0[i];
985: group_sum1 += entry_x * y1[i];
986: group_sum2 += entry_x * y2[i];
987: group_sum3 += entry_x * y3[i];
988: group_sum4 += entry_x * y4[i];
989: group_sum5 += entry_x * y5[i];
990: group_sum6 += entry_x * y6[i];
991: group_sum7 += entry_x * y7[i];
992: }
993: tmp_buffer[threadIdx.x] = group_sum0;
994: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
995: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
996: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
997: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
998: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
999: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
1000: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
1002: // parallel reduction
1003: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
1004: __syncthreads();
1005: if (threadIdx.x < stride) {
1006: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1007: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
1008: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
1009: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
1010: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
1011: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
1012: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
1013: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
1014: }
1015: }
1017: // write result of group to group_results
1018: if (threadIdx.x == 0) {
1019: group_results[blockIdx.x ] = tmp_buffer[0];
1020: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
1021: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
1022: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
1023: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
1024: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
1025: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
1026: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
1027: }
1028: }
1033: PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
1034: {
1036: PetscInt i,j,n = xin->map->n,current_y_index = 0;
1037: CUSPARRAY *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
1038: PetscScalar *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
1039: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
1040: cudaError_t cuda_ierr;
1043: // allocate scratchpad memory for the results of individual work groups:
1044: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
1045: cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
1046: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);
1048: VecCUSPGetArrayRead(xin,&xarray);
1049: xptr = thrust::raw_pointer_cast(xarray->data());
1051: while (current_y_index < nv)
1052: {
1053: switch (nv - current_y_index) {
1055: case 7:
1056: case 6:
1057: case 5:
1058: case 4:
1059: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1060: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1061: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1062: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1064: #if defined(PETSC_USE_COMPLEX)
1065: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1066: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1067: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1068: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
1069: #else
1070: // extract raw device pointers:
1071: y0ptr = thrust::raw_pointer_cast(y0array->data());
1072: y1ptr = thrust::raw_pointer_cast(y1array->data());
1073: y2ptr = thrust::raw_pointer_cast(y2array->data());
1074: y3ptr = thrust::raw_pointer_cast(y3array->data());
1076: // run kernel:
1077: VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
1079: // copy results back to the CPU:
1080: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
1081: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1083: // sum group results into z:
1084: for (j=0; j<4; ++j) {
1085: z[current_y_index + j] = 0;
1086: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1087: }
1088: #endif
1089: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1090: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1091: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1092: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1093: current_y_index += 4;
1094: break;
1096: case 3:
1097: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1098: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1099: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1101: #if defined(PETSC_USE_COMPLEX)
1102: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1103: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1104: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1105: #else
1106: // extract raw device pointers:
1107: y0ptr = thrust::raw_pointer_cast(y0array->data());
1108: y1ptr = thrust::raw_pointer_cast(y1array->data());
1109: y2ptr = thrust::raw_pointer_cast(y2array->data());
1111: // run kernel:
1112: VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
1114: // copy results back to the CPU:
1115: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
1116: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1118: // sum group results into z:
1119: for (j=0; j<3; ++j) {
1120: z[current_y_index + j] = 0;
1121: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1122: }
1123: #endif
1125: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1126: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1127: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1128: current_y_index += 3;
1129: break;
1131: case 2:
1132: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1133: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1135: #if defined(PETSC_USE_COMPLEX)
1136: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1137: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1138: #else
1139: // extract raw device pointers:
1140: y0ptr = thrust::raw_pointer_cast(y0array->data());
1141: y1ptr = thrust::raw_pointer_cast(y1array->data());
1143: // run kernel:
1144: VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
1146: // copy results back to the CPU:
1147: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
1148: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1150: // sum group results into z:
1151: for (j=0; j<2; ++j) {
1152: z[current_y_index + j] = 0;
1153: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1154: }
1155: #endif
1156: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1157: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1158: current_y_index += 2;
1159: break;
1161: case 1:
1162: VecCUSPGetArrayRead(yin[current_y_index],&y0array);
1163: #if defined(PETSC_USE_COMPLEX)
1164: z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
1165: #else
1166: z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
1167: #endif
1168: VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
1169: current_y_index += 1;
1170: break;
1172: default: // 8 or more vectors left
1173: VecCUSPGetArrayRead(yin[current_y_index ],&y0array);
1174: VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
1175: VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
1176: VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
1177: VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
1178: VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
1179: VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
1180: VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);
1182: #if defined(PETSC_USE_COMPLEX)
1183: z[current_y_index] = cusp::blas::dot(*y0array,*xarray);
1184: z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
1185: z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
1186: z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
1187: z[current_y_index+4] = cusp::blas::dot(*y4array,*xarray);
1188: z[current_y_index+5] = cusp::blas::dot(*y5array,*xarray);
1189: z[current_y_index+6] = cusp::blas::dot(*y6array,*xarray);
1190: z[current_y_index+7] = cusp::blas::dot(*y7array,*xarray);
1191: #else
1192: // extract raw device pointers:
1193: y0ptr = thrust::raw_pointer_cast(y0array->data());
1194: y1ptr = thrust::raw_pointer_cast(y1array->data());
1195: y2ptr = thrust::raw_pointer_cast(y2array->data());
1196: y3ptr = thrust::raw_pointer_cast(y3array->data());
1197: y4ptr = thrust::raw_pointer_cast(y4array->data());
1198: y5ptr = thrust::raw_pointer_cast(y5array->data());
1199: y6ptr = thrust::raw_pointer_cast(y6array->data());
1200: y7ptr = thrust::raw_pointer_cast(y7array->data());
1202: // run kernel:
1203: VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
1205: // copy results back to the CPU:
1206: cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
1207: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);
1209: // sum group results into z:
1210: for (j=0; j<8; ++j) {
1211: z[current_y_index + j] = 0;
1212: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
1213: }
1214: #endif
1215: VecCUSPRestoreArrayRead(yin[current_y_index ],&y0array);
1216: VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
1217: VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
1218: VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
1219: VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
1220: VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
1221: VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
1222: VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
1223: current_y_index += 8;
1224: break;
1225: }
1226: }
1227: VecCUSPRestoreArrayRead(xin,&xarray);
1229: cuda_ierr = cudaFree(group_results_gpu);
1230: if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
1231: PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
1232: return(0);
1233: }
1235: #undef MDOT_WORKGROUP_SIZE
1236: #undef MDOT_WORKGROUP_NUM
1242: PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
1243: {
1244: CUSPARRAY *xarray=NULL;
1248: /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
1249: VecCUSPGetArrayWrite(xin,&xarray);
1250: try {
1251: cusp::blas::fill(*xarray,alpha);
1252: } catch(char *ex) {
1253: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1254: }
1255: WaitForGPU();CHKERRCUSP(ierr);
1256: VecCUSPRestoreArrayWrite(xin,&xarray);
1257: return(0);
1258: }
1262: PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
1263: {
1264: CUSPARRAY *xarray;
1268: if (alpha == 0.0) {
1269: VecSet_SeqCUSP(xin,alpha);
1270: } else if (alpha != 1.0) {
1271: VecCUSPGetArrayReadWrite(xin,&xarray);
1272: try {
1273: cusp::blas::scal(*xarray,alpha);
1274: } catch(char *ex) {
1275: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1276: }
1277: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1278: }
1279: WaitForGPU();CHKERRCUSP(ierr);
1280: PetscLogFlops(xin->map->n);
1281: return(0);
1282: }
1287: PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
1288: {
1289: CUSPARRAY *xarray,*yarray;
1293: //#if defined(PETSC_USE_COMPLEX)
1294: /*Not working for complex*/
1295: //#else
1296: VecCUSPGetArrayRead(xin,&xarray);
1297: VecCUSPGetArrayRead(yin,&yarray);
1298: try {
1299: *z = cusp::blas::dot(*xarray,*yarray);
1300: } catch(char *ex) {
1301: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1302: }
1303: //#endif
1304: WaitForGPU();CHKERRCUSP(ierr);
1305: if (xin->map->n > 0) {
1306: PetscLogFlops(2.0*xin->map->n-1);
1307: }
1308: VecCUSPRestoreArrayRead(yin,&yarray);
1309: VecCUSPRestoreArrayRead(xin,&xarray);
1310: return(0);
1311: }
1314: PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
1315: {
1316: CUSPARRAY *xarray,*yarray;
1320: if (xin != yin) {
1321: if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
1322: VecCUSPGetArrayRead(xin,&xarray);
1323: VecCUSPGetArrayWrite(yin,&yarray);
1324: try {
1325: cusp::blas::copy(*xarray,*yarray);
1326: } catch(char *ex) {
1327: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1328: }
1329: WaitForGPU();CHKERRCUSP(ierr);
1330: VecCUSPRestoreArrayRead(xin,&xarray);
1331: VecCUSPRestoreArrayWrite(yin,&yarray);
1333: } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
1334: /* copy in CPU if we are on the CPU*/
1335: VecCopy_SeqCUSP_Private(xin,yin);
1336: } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
1337: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to use it next) */
1338: if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
1339: /* copy in CPU */
1340: VecCopy_SeqCUSP_Private(xin,yin);
1342: } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
1343: /* copy in GPU */
1344: VecCUSPGetArrayRead(xin,&xarray);
1345: VecCUSPGetArrayWrite(yin,&yarray);
1346: try {
1347: cusp::blas::copy(*xarray,*yarray);
1348: WaitForGPU();CHKERRCUSP(ierr);
1349: } catch(char *ex) {
1350: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1351: }
1352: VecCUSPRestoreArrayRead(xin,&xarray);
1353: VecCUSPRestoreArrayWrite(yin,&yarray);
1354: } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
1355: /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
1356: default to copy on the GPU (this is an arbitrary choice) */
1357: VecCUSPGetArrayRead(xin,&xarray);
1358: VecCUSPGetArrayWrite(yin,&yarray);
1359: try {
1360: cusp::blas::copy(*xarray,*yarray);
1361: WaitForGPU();CHKERRCUSP(ierr);
1362: } catch(char *ex) {
1363: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1364: }
1365: VecCUSPRestoreArrayRead(xin,&xarray);
1366: VecCUSPRestoreArrayWrite(yin,&yarray);
1367: } else {
1368: VecCopy_SeqCUSP_Private(xin,yin);
1369: }
1370: }
1371: }
1372: return(0);
1373: }
1378: PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
1379: {
1381: PetscBLASInt one = 1,bn;
1382: CUSPARRAY *xarray,*yarray;
1385: PetscBLASIntCast(xin->map->n,&bn);
1386: if (xin != yin) {
1387: VecCUSPGetArrayReadWrite(xin,&xarray);
1388: VecCUSPGetArrayReadWrite(yin,&yarray);
1390: #if defined(PETSC_USE_COMPLEX)
1391: #if defined(PETSC_USE_REAL_SINGLE)
1392: cublasCswap(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);
1393: #else
1394: cublasZswap(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);
1395: #endif
1396: #else
1397: #if defined(PETSC_USE_REAL_SINGLE)
1398: cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1399: #else
1400: cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
1401: #endif
1402: #endif
1403: cublasGetError();CHKERRCUSP(ierr);
1404: WaitForGPU();CHKERRCUSP(ierr);
1405: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1406: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1407: }
1408: return(0);
1409: }
1411: struct VecCUSPAX
1412: {
1413: template <typename Tuple>
1414: __host__ __device__
1415: void operator()(Tuple t)
1416: {
1417: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
1418: }
1419: };
1422: PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
1423: {
1425: PetscScalar a = alpha,b = beta;
1426: CUSPARRAY *xarray,*yarray;
1429: if (a == 0.0) {
1430: VecScale_SeqCUSP(yin,beta);
1431: } else if (b == 1.0) {
1432: VecAXPY_SeqCUSP(yin,alpha,xin);
1433: } else if (a == 1.0) {
1434: VecAYPX_SeqCUSP(yin,beta,xin);
1435: } else if (b == 0.0) {
1436: VecCUSPGetArrayRead(xin,&xarray);
1437: VecCUSPGetArrayReadWrite(yin,&yarray);
1438: try {
1439: thrust::for_each(
1440: thrust::make_zip_iterator(
1441: thrust::make_tuple(
1442: yarray->begin(),
1443: thrust::make_constant_iterator(a),
1444: xarray->begin())),
1445: thrust::make_zip_iterator(
1446: thrust::make_tuple(
1447: yarray->end(),
1448: thrust::make_constant_iterator(a),
1449: xarray->end())),
1450: VecCUSPAX());
1451: } catch(char *ex) {
1452: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1453: }
1454: PetscLogFlops(xin->map->n);
1455: WaitForGPU();CHKERRCUSP(ierr);
1456: VecCUSPRestoreArrayRead(xin,&xarray);
1457: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1458: } else {
1459: VecCUSPGetArrayRead(xin,&xarray);
1460: VecCUSPGetArrayReadWrite(yin,&yarray);
1461: try {
1462: cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
1463: } catch(char *ex) {
1464: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1465: }
1466: VecCUSPRestoreArrayRead(xin,&xarray);
1467: VecCUSPRestoreArrayReadWrite(yin,&yarray);
1468: WaitForGPU();CHKERRCUSP(ierr);
1469: PetscLogFlops(3.0*xin->map->n);
1470: }
1471: return(0);
1472: }
1474: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
1475: struct VecCUSPXPBYPCZ
1476: {
1477: /* z = x + b*y + c*z */
1478: template <typename Tuple>
1479: __host__ __device__
1480: void operator()(Tuple t)
1481: {
1482: thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1483: }
1484: };
1485: struct VecCUSPAXPBYPZ
1486: {
1487: /* z = ax + b*y + z */
1488: template <typename Tuple>
1489: __host__ __device__
1490: void operator()(Tuple t)
1491: {
1492: thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
1493: }
1494: };
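
/* Editorial note (added for clarity): the tuple-slot mapping used by
   VecAXPBYPCZ_SeqCUSP below is
     VecCUSPXPBYPCZ applied to (z, gamma, x, y, beta):
       get<0> = get<1>*get<0> + get<2> + get<4>*get<3>  =>  z = gamma*z + x + beta*y
     VecCUSPAXPBYPZ applied to (z, x, alpha, y, beta):
       get<0> += get<2>*get<1> + get<4>*get<3>          =>  z += alpha*x + beta*y */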
1498: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1499: {
1501: PetscInt n = zin->map->n;
1502: CUSPARRAY *xarray,*yarray,*zarray;
1505: VecCUSPGetArrayRead(xin,&xarray);
1506: VecCUSPGetArrayRead(yin,&yarray);
1507: VecCUSPGetArrayReadWrite(zin,&zarray);
1508: if (alpha == 1.0) {
1509: try {
1510: thrust::for_each(
1511: thrust::make_zip_iterator(
1512: thrust::make_tuple(
1513: zarray->begin(),
1514: thrust::make_constant_iterator(gamma),
1515: xarray->begin(),
1516: yarray->begin(),
1517: thrust::make_constant_iterator(beta))),
1518: thrust::make_zip_iterator(
1519: thrust::make_tuple(
1520: zarray->end(),
1521: thrust::make_constant_iterator(gamma),
1522: xarray->end(),
1523: yarray->end(),
1524: thrust::make_constant_iterator(beta))),
1525: VecCUSPXPBYPCZ());
1526: } catch(char *ex) {
1527: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1528: }
1529: PetscLogFlops(4.0*n);
1530: } else if (gamma == 1.0) {
1531: try {
1532: thrust::for_each(
1533: thrust::make_zip_iterator(
1534: thrust::make_tuple(
1535: zarray->begin(),
1536: xarray->begin(),
1537: thrust::make_constant_iterator(alpha),
1538: yarray->begin(),
1539: thrust::make_constant_iterator(beta))),
1540: thrust::make_zip_iterator(
1541: thrust::make_tuple(
1542: zarray->end(),
1543: xarray->end(),
1544: thrust::make_constant_iterator(alpha),
1545: yarray->end(),
1546: thrust::make_constant_iterator(beta))),
1547: VecCUSPAXPBYPZ());
1548: } catch(char *ex) {
1549: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1550: }
1551: PetscLogFlops(4.0*n);
1552: } else {
1553: try {
1554: cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1555: } catch(char *ex) {
1556: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1557: }
1558: VecCUSPRestoreArrayReadWrite(zin,&zarray);
1559: VecCUSPRestoreArrayRead(xin,&xarray);
1560: VecCUSPRestoreArrayRead(yin,&yarray);
1561: PetscLogFlops(5.0*n);
1562: }
1563: WaitForGPU();CHKERRCUSP(ierr);
1564: return(0);
1565: }
1569: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1570: {
1572: PetscInt n = win->map->n;
1573: CUSPARRAY *xarray,*yarray,*warray;
1576: VecCUSPGetArrayRead(xin,&xarray);
1577: VecCUSPGetArrayRead(yin,&yarray);
1578: VecCUSPGetArrayReadWrite(win,&warray);
1579: try {
1580: cusp::blas::xmy(*xarray,*yarray,*warray);
1581: } catch(char *ex) {
1582: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1583: }
1584: VecCUSPRestoreArrayRead(xin,&xarray);
1585: VecCUSPRestoreArrayRead(yin,&yarray);
1586: VecCUSPRestoreArrayReadWrite(win,&warray);
1587: PetscLogFlops(n);
1588: WaitForGPU();CHKERRCUSP(ierr);
1589: return(0);
1590: }
1593: /* should do infinity norm in cusp */
1597: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1598: {
1599: const PetscScalar *xx;
1600: PetscErrorCode ierr;
1601: PetscInt n = xin->map->n;
1602: PetscBLASInt one = 1, bn;
1603: CUSPARRAY *xarray;
1606: PetscBLASIntCast(n,&bn);
1607: if (type == NORM_2 || type == NORM_FROBENIUS) {
1608: VecCUSPGetArrayRead(xin,&xarray);
1609: try {
1610: *z = cusp::blas::nrm2(*xarray);
1611: } catch(char *ex) {
1612: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1613: }
1614: WaitForGPU();CHKERRCUSP(ierr);
1615: VecCUSPRestoreArrayRead(xin,&xarray);
1616: PetscLogFlops(PetscMax(2.0*n-1,0.0));
1617: } else if (type == NORM_INFINITY) {
1618: PetscInt i;
1619: PetscReal max = 0.0,tmp;
1621: VecGetArrayRead(xin,&xx);
1622: for (i=0; i<n; i++) {
1623: if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1624: /* check special case of tmp == NaN */
1625: if (tmp != tmp) {max = tmp; break;}
1626: xx++;
1627: }
1628: VecRestoreArrayRead(xin,&xx);
1629: *z = max;
1630: } else if (type == NORM_1) {
1631: VecCUSPGetArrayRead(xin,&xarray);
1632: #if defined(PETSC_USE_COMPLEX)
1633: #if defined(PETSC_USE_REAL_SINGLE)
1634: *z = cublasScasum(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one);
1635: #else
1636: *z = cublasDzasum(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one);
1637: #endif
1638: #else
1639: #if defined(PETSC_USE_REAL_SINGLE)
1640: *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1641: #else
1642: *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1643: #endif
1644: #endif
1645: cublasGetError();CHKERRCUSP(ierr);
1646: VecCUSPRestoreArrayRead(xin,&xarray);
1647: WaitForGPU();CHKERRCUSP(ierr);
1648: PetscLogFlops(PetscMax(n-1.0,0.0));
1649: } else if (type == NORM_1_AND_2) {
1650: VecNorm_SeqCUSP(xin,NORM_1,z);
1651: VecNorm_SeqCUSP(xin,NORM_2,z+1);
1652: }
1653: return(0);
1654: }
1657: /* The following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */
1661: PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
1662: {
1666: VecSetRandom_SeqCUSP_Private(xin,r);
1667: xin->valid_GPU_array = PETSC_CUSP_CPU;
1668: return(0);
1669: }
1673: PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
1674: {
1678: VecCUSPCopyFromGPU(vin);
1679: VecResetArray_SeqCUSP_Private(vin);
1680: vin->valid_GPU_array = PETSC_CUSP_CPU;
1681: return(0);
1682: }
1686: PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1687: {
1691: VecCUSPCopyFromGPU(vin);
1692: VecPlaceArray_Seq(vin,a);
1693: vin->valid_GPU_array = PETSC_CUSP_CPU;
1694: return(0);
1695: }
1700: PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1701: {
1705: VecCUSPCopyFromGPU(vin);
1706: VecReplaceArray_Seq(vin,a);
1707: vin->valid_GPU_array = PETSC_CUSP_CPU;
1708: return(0);
1709: }
1714: /*@
1715: VecCreateSeqCUSP - Creates a standard, sequential array-style vector.
1717: Collective on MPI_Comm
1719: Input Parameter:
1720: + comm - the communicator, should be PETSC_COMM_SELF
1721: - n - the vector length
1723: Output Parameter:
1724: . V - the vector
1726: Notes:
1727: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1728: same type as an existing vector.
1730: Level: intermediate
1732: Concepts: vectors^creating sequential
1734: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1735: @*/
1736: PetscErrorCode VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
1737: {
1741: VecCreate(comm,v);
1742: VecSetSizes(*v,n,n);
1743: VecSetType(*v,VECSEQCUSP);
1744: return(0);
1745: }
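
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   A hypothetical usage sketch for VecCreateSeqCUSP() above; it assumes
   PetscInitialize() has been called and PETSc was configured with CUSP
   support. Error checking is omitted for brevity. */
static PetscErrorCode ExampleCreateAndUse(void)
{
  Vec       x,y;
  PetscReal nrm;

  VecCreateSeqCUSP(PETSC_COMM_SELF,100,&x);
  VecDuplicate(x,&y);     /* dispatches to VecDuplicate_SeqCUSP() */
  VecSet(x,1.0);
  VecSet(y,2.0);
  VecAXPY(y,3.0,x);       /* runs on the GPU via VecAXPY_SeqCUSP() */
  VecNorm(y,NORM_2,&nrm); /* cusp::blas::nrm2() under the hood */
  VecDestroy(&x);
  VecDestroy(&y);
  return(0);
}
/* --- end of editorial example --- */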
1747: /* The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written. */
1748: template <typename T>
1749: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1750: {
1751: __host__ __device__
1752: T operator()(T x)
1753: {
1754: #if defined(PETSC_USE_COMPLEX)
1755: //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1756: #else
1757: return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1758: #endif
1759: }
1760: };
1762: template <typename T>
1763: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1764: {
1765: __host__ __device__
1766: T operator()(T x,T y)
1767: {
1768: return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1769: }
1770: };
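
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   A self-contained sketch of the fused pattern used by VecDotNorm2_SeqCUSP
   below: a single thrust::transform_reduce() pass over (s,t) produces both
   sum(s_i*t_i) and sum(t_i*t_i), assuming real scalars. */
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>

typedef thrust::tuple<double,double> ExampleDotNorm;

struct ExampleDotNormMap
{
  template <typename Tuple>
  __host__ __device__
  ExampleDotNorm operator()(Tuple st) const
  {
    double s = thrust::get<0>(st),t = thrust::get<1>(st);
    return thrust::make_tuple(s*t,t*t);
  }
};

struct ExampleDotNormReduce
{
  __host__ __device__
  ExampleDotNorm operator()(ExampleDotNorm a,ExampleDotNorm b) const
  {
    return thrust::make_tuple(thrust::get<0>(a)+thrust::get<0>(b),thrust::get<1>(a)+thrust::get<1>(b));
  }
};

static void ExampleDotNorm2(double *dp,double *nm)
{
  thrust::device_vector<double> s(8,1.0),t(8,2.0);
  ExampleDotNorm r = thrust::transform_reduce(
    thrust::make_zip_iterator(thrust::make_tuple(s.begin(),t.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(s.end(),t.end())),
    ExampleDotNormMap(),thrust::make_tuple(0.0,0.0),ExampleDotNormReduce());
  *dp = thrust::get<0>(r); /* sum s_i*t_i */
  *nm = thrust::get<1>(r); /* sum t_i*t_i */
}
/* --- end of editorial example --- */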
1774: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1775: {
1776: PetscErrorCode ierr;
1777: PetscScalar zero = 0.0;
1778: PetscReal n=s->map->n;
1779: thrust::tuple<PetscScalar,PetscScalar> result;
1780: CUSPARRAY *sarray,*tarray;
1783: /*VecCUSPCopyToGPU(s);
1784: VecCUSPCopyToGPU(t);*/
1785: VecCUSPGetArrayRead(s,&sarray);
1786: VecCUSPGetArrayRead(t,&tarray);
1787: try {
1788: #if defined(PETSC_USE_COMPLEX)
1789: VecDot_SeqCUSP(s,t,dp);
1790: VecDot_SeqCUSP(t,t,nm);
1791: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1792: //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1793: #else
1794: result = thrust::transform_reduce(
1795: thrust::make_zip_iterator(
1796: thrust::make_tuple(
1797: sarray->begin(),
1798: tarray->begin())),
1799: thrust::make_zip_iterator(
1800: thrust::make_tuple(
1801: sarray->end(),
1802: tarray->end())),
1803: cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1804: thrust::make_tuple(zero,zero), /*init */
1805: cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
1806: *dp = thrust::get<0>(result);
1807: *nm = thrust::get<1>(result);
1808: #endif
1809: } catch(char *ex) {
1810: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1811: }
1812: VecCUSPRestoreArrayRead(s,&sarray);
1813: VecCUSPRestoreArrayRead(t,&tarray);
1814: WaitForGPU();CHKERRCUSP(ierr);
1815: PetscLogFlops(4.0*n);
1816: return(0);
1817: }
1821: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1822: {
1826: VecCreateSeqCUSP(PetscObjectComm((PetscObject)win),win->map->n,V);
1827: PetscLayoutReference(win->map,&(*V)->map);
1828: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1829: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1830: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1831: return(0);
1832: }
1836: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1837: {
1839: Vec_Seq *s = (Vec_Seq*)v->data;
1840: cudaError_t err;
1842: try {
1843: if (v->spptr) {
1844: delete ((Vec_CUSP*)v->spptr)->GPUarray;
1845: err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);
1847: /* If the host array has been registered as (page-locked) mapped,
1848: one must unregister the buffer */
1849: if (((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked) {
1850: err = cudaHostUnregister(s->array);CHKERRCUSP(err);
1851: }
1852: delete (Vec_CUSP*) v->spptr;
1853: }
1854: } catch(char *ex) {
1855: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1856: }
1857: VecDestroy_SeqCUSP_Private(v);
1858: return(0);
1859: }
1862: #if defined(PETSC_USE_COMPLEX)
1863: struct conjugate
1864: {
1865: __host__ __device__
1866: PetscScalar operator()(PetscScalar x)
1867: {
1868: return cusp::conj(x);
1869: }
1870: };
1871: #endif
1876: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1877: {
1879: CUSPARRAY *xarray;
1882: VecCUSPGetArrayReadWrite(xin,&xarray);
1883: #if defined(PETSC_USE_COMPLEX)
1884: thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1885: #endif
1886: VecCUSPRestoreArrayReadWrite(xin,&xarray);
1887: return(0);
1888: }
1892: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUSP(Vec V)
1893: {
1895: PetscMPIInt size;
1898: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
1899: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
1900: VecCreate_Seq_Private(V,0);
1901: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);
1903: V->ops->dot = VecDot_SeqCUSP;
1904: V->ops->norm = VecNorm_SeqCUSP;
1905: V->ops->tdot = VecTDot_SeqCUSP;
1906: V->ops->scale = VecScale_SeqCUSP;
1907: V->ops->copy = VecCopy_SeqCUSP;
1908: V->ops->set = VecSet_SeqCUSP;
1909: V->ops->swap = VecSwap_SeqCUSP;
1910: V->ops->axpy = VecAXPY_SeqCUSP;
1911: V->ops->axpby = VecAXPBY_SeqCUSP;
1912: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUSP;
1913: V->ops->pointwisemult = VecPointwiseMult_SeqCUSP;
1914: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
1915: V->ops->setrandom = VecSetRandom_SeqCUSP;
1916: V->ops->dot_local = VecDot_SeqCUSP;
1917: V->ops->tdot_local = VecTDot_SeqCUSP;
1918: V->ops->norm_local = VecNorm_SeqCUSP;
1919: V->ops->mdot_local = VecMDot_SeqCUSP;
1920: V->ops->maxpy = VecMAXPY_SeqCUSP;
1921: V->ops->mdot = VecMDot_SeqCUSP;
1922: V->ops->aypx = VecAYPX_SeqCUSP;
1923: V->ops->waxpy = VecWAXPY_SeqCUSP;
1924: V->ops->dotnorm2 = VecDotNorm2_SeqCUSP;
1925: V->ops->placearray = VecPlaceArray_SeqCUSP;
1926: V->ops->replacearray = VecReplaceArray_SeqCUSP;
1927: V->ops->resetarray = VecResetArray_SeqCUSP;
1928: V->ops->destroy = VecDestroy_SeqCUSP;
1929: V->ops->duplicate = VecDuplicate_SeqCUSP;
1930: V->ops->conjugate = VecConjugate_SeqCUSP;
1932: VecCUSPAllocateCheck(V);
1933: V->valid_GPU_array = PETSC_CUSP_GPU;
1934: VecSet(V,0.0);
1935: return(0);
1936: }
1940: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1941: {
1945: *a = 0;
1946: VecCUSPCopyToGPU(v);
1947: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1948: return(0);
1949: }
1953: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
1954: {
1958: v->valid_GPU_array = PETSC_CUSP_GPU;
1960: PetscObjectStateIncrease((PetscObject)v);
1961: return(0);
1962: }
1966: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
1967: {
1971: *a = 0;
1972: VecCUSPCopyToGPU(v);
1973: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1974: return(0);
1975: }
1979: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
1980: {
1982: return(0);
1983: }
1987: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
1988: {
1992: *a = 0;
1993: VecCUSPAllocateCheck(v);
1994: *a = ((Vec_CUSP*)v->spptr)->GPUarray;
1995: return(0);
1996: }
2000: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
2001: {
2005: v->valid_GPU_array = PETSC_CUSP_GPU;
2007: PetscObjectStateIncrease((PetscObject)v);
2008: return(0);
2009: }
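
/* --- Illustrative example (editorial addition; not part of veccusp.cu) ---
   The six accessors above encode the coherence protocol: the Read variants
   copy host data to the GPU if needed and leave valid_GPU_array untouched on
   restore; the Write variant skips the copy-in entirely; the ReadWrite and
   Write restores mark the GPU copy as the valid one. A hedged sketch of
   correct pairing: */
static PetscErrorCode ExampleScaleOnGPU(Vec v,PetscScalar alpha)
{
  CUSPARRAY *varray;

  VecCUSPGetArrayReadWrite(v,&varray);     /* copies CPU->GPU only if needed */
  try {
    cusp::blas::scal(*varray,alpha);       /* operate directly on the device array */
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  VecCUSPRestoreArrayReadWrite(v,&varray); /* GPU copy is now the valid one */
  return(0);
}
/* --- end of editorial example --- */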