Actual source code: veccusp.cu

petsc-3.5.4 2015-05-23
Report Typos and Errors
  1: /*
  2:    Implements the sequential cusp vectors.
  3: */

  5: #include <petscconf.h>
  6: PETSC_CUDA_EXTERN_C_BEGIN
  7: #include <petsc-private/vecimpl.h>          /*I "petscvec.h" I*/
  8: #include <../src/vec/vec/impls/dvecimpl.h>
  9: PETSC_CUDA_EXTERN_C_END
 10: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

 12: #include <cuda_runtime.h>

 /*
    VecCUSPAllocateCheckHost - Allocates space for the vector array on the host if it does not exist.

    Does NOT change the PetscCUSPFlag for the vector
    Does NOT zero the CUSP array

    The GPU-side record (v->spptr) is created first via VecCUSPAllocateCheck() because
    the page-locked flag stored in it is written below.  A freshly allocated host array
    is registered with cudaHostRegister() as mapped memory.

    Input Parameter:
.   v - the vector

    Returns 0 on success; any PETSc or CUDA error is propagated to the caller.
 */
PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *array;
  Vec_Seq        *s = (Vec_Seq*)v->data;
  PetscInt       n = v->map->n;

  ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
  if (s->array == 0) {
    ierr               = PetscMalloc1(n,&array);CHKERRQ(ierr);
    ierr               = PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));CHKERRQ(ierr);
    s->array           = array;
    s->array_allocated = array;
    /* register the new host buffer as mapped memory and remember that we did so */
    err = cudaHostRegister(s->array, n*sizeof(PetscScalar),cudaHostRegisterMapped);CHKERRCUSP(err);
    ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_TRUE;
  }
  return(0);
}


 /*
    VecCUSPAllocateCheck - Allocates space for the vector array on the GPU if it does not exist.

    Does NOT change the PetscCUSPFlag for the vector
    Does NOT zero the CUSP array

    When v->spptr is still empty this creates the Vec_CUSP record, sizes its CUSP array
    to the local length, creates a CUDA stream for the vector and installs
    VecDestroy_SeqCUSP as the destroy routine.
 */
PetscErrorCode VecCUSPAllocateCheck(Vec v)
{
  Vec_Seq      *hostvec = (Vec_Seq*)v->data;
  Vec_CUSP     *gpuvec;
  cudaStream_t newstream;
  cudaError_t  cerr;

  /* the GPU-side record already exists: nothing to allocate */
  if (v->spptr) return(0);

  try {
    v->spptr         = new Vec_CUSP;
    gpuvec           = (Vec_CUSP*)v->spptr;
    gpuvec->GPUarray = new CUSPARRAY;
    gpuvec->GPUarray->resize((PetscBLASInt)v->map->n);
    cerr = cudaStreamCreate(&newstream);CHKERRCUSP(cerr);
    gpuvec->stream = newstream;

    gpuvec->hostDataRegisteredAsPageLocked = PETSC_FALSE;
    /* An existing host array can be registered as (page-locked) mapped memory,
       which can substantially accelerate data transfer across the PCI Express */
    if (hostvec->array) {
      cerr = cudaHostRegister(hostvec->array, v->map->n*sizeof(PetscScalar),cudaHostRegisterMapped);CHKERRCUSP(cerr);
      gpuvec->hostDataRegisteredAsPageLocked = PETSC_TRUE;
    }
    v->ops->destroy = VecDestroy_SeqCUSP;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  return(0);
}


 /*
    VecCUSPCopyToGPU - Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU

    The copy is only performed when the vector state is PETSC_CUSP_CPU; afterwards the
    state is PETSC_CUSP_BOTH.  The transfer is issued on the vector's own stream and
    the stream is synchronized before returning.

    Input Parameter:
.   v - the vector

    Returns 0 on success; any PETSc or CUDA error is propagated to the caller.
 */
PetscErrorCode VecCUSPCopyToGPU(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUSP       *veccusp;
  CUSPARRAY      *varray;
  cudaStream_t   stream;

  ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);CHKERRQ(ierr);
    try {
      veccusp = (Vec_CUSP*)v->spptr;
      varray  = veccusp->GPUarray;
      stream  = veccusp->stream;
      /* read the host pointer through the Vec_Seq member, as the *Some variants do */
      err = cudaMemcpyAsync(varray->data().get(), ((Vec_Seq*)v->data)->array, v->map->n*sizeof(PetscScalar),
                             cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
      err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);CHKERRQ(ierr);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  return(0);
}

/*
    VecCUSPCopyToGPUSome - Copies a contiguous range of host entries to the GPU

    Only acts when the vector state is PETSC_CUSP_CPU.  The range starts at
    recvLowestIndex and spans nr entries of the PtoP scatter stored in ci; this is the
    smallest contiguous chunk of data containing ALL of the requested indices.
    Afterwards the state is set to PETSC_CUSP_BOTH.

    Input Parameters:
+   v  - the vector
-   ci - the index object whose scatter member describes the range

    Returns 0 on success; any PETSc or CUDA error is propagated to the caller.
*/
static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *varray;
  cudaError_t                err;
  PetscScalar                *cpuPtr, *gpuPtr;
  cudaStream_t               stream;
  Vec_Seq                    *s;
  VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

  ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_CUSP_CPU) {
    stream = ((Vec_CUSP*)v->spptr)->stream;
    s      = (Vec_Seq*)v->data;

    ierr   = PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
    varray = ((Vec_CUSP*)v->spptr)->GPUarray;
    gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
    cpuPtr = s->array + ptop_scatter->recvLowestIndex;

    /* Note : this code copies the smallest contiguous chunk of data
       containing ALL of the indices */
    err = cudaMemcpyAsync(gpuPtr, cpuPtr, ptop_scatter->nr*sizeof(PetscScalar),
                           cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
    err = cudaStreamSynchronize(stream);CHKERRCUSP(err);

    /* Set the buffer states */
    v->valid_GPU_array = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
  }
  return(0);
}


/*
     VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU

     The copy is only performed when the vector state is PETSC_CUSP_GPU; afterwards the
     state is PETSC_CUSP_BOTH.  The transfer is issued on the vector's own stream and
     the stream is synchronized before returning.

     Input Parameter:
.    v - the vector

     Returns 0 on success; any PETSc or CUDA error is propagated to the caller.
*/
PetscErrorCode VecCUSPCopyFromGPU(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUSP       *veccusp;
  CUSPARRAY      *varray;
  cudaStream_t   stream;

  ierr = VecCUSPAllocateCheckHost(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    ierr = PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    try {
      veccusp = (Vec_CUSP*)v->spptr;
      varray  = veccusp->GPUarray;
      stream  = veccusp->stream;

      /* write through the Vec_Seq member, as the *Some variants do */
      err = cudaMemcpyAsync(((Vec_Seq*)v->data)->array, varray->data().get(), v->map->n*sizeof(PetscScalar),
                             cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
      err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  return(0);
}

/*
    VecCUSPCopyFromGPUSome - Copies a contiguous range of GPU entries to the host

    Note that this function only copies *some* of the values up from the GPU to CPU,
    which means that we need to recombine the data at some point before using any of the
    standard functions.  We could add another few flag-types to keep track of this, or
    treat things like VecGetArray()/VecRestoreArray() where you always have to call in pairs.

    Only acts when the vector state is PETSC_CUSP_GPU.  The range starts at
    sendLowestIndex and spans ns entries of the PtoP scatter stored in ci.  Afterwards
    the state is set to PETSC_CUSP_BOTH.

    Input Parameters:
+   v  - the vector
-   ci - the index object whose scatter member describes the range

    Returns 0 on success; any PETSc or CUDA error is propagated to the caller.
*/
PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *varray;
  cudaError_t                err;
  PetscScalar                *cpuPtr, *gpuPtr;
  cudaStream_t               stream;
  Vec_Seq                    *s;
  VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

  ierr = VecCUSPAllocateCheckHost(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    ierr = PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);

    stream = ((Vec_CUSP*)v->spptr)->stream;
    varray = ((Vec_CUSP*)v->spptr)->GPUarray;
    s      = (Vec_Seq*)v->data;
    gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
    cpuPtr = s->array + ptop_scatter->sendLowestIndex;

    /* Note : this code copies the smallest contiguous chunk of data
       containing ALL of the indices */
    err = cudaMemcpyAsync(cpuPtr, gpuPtr, ptop_scatter->ns*sizeof(PetscScalar),
                           cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
    err = cudaStreamSynchronize(stream);CHKERRCUSP(err);

    /* NOTE(review): varray was read directly from v->spptr, not obtained with
       VecCUSPGetArrayRead(), so this restore has no matching get - confirm it is intended */
    ierr = VecCUSPRestoreArrayRead(v,&varray);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  return(0);
}

/*
    VecCopy_SeqCUSP_Private - Copies xin into yin through their host arrays

    Both vectors get a host array first (VecCUSPAllocateCheckHost()).  When xin and yin
    are the same vector nothing is copied.

    Returns 0 on success; any PETSc error is propagated to the caller.
*/
static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
{
  PetscScalar       *ya;
  const PetscScalar *xa;
  PetscErrorCode    ierr;

  ierr = VecCUSPAllocateCheckHost(xin);CHKERRQ(ierr);
  ierr = VecCUSPAllocateCheckHost(yin);CHKERRQ(ierr);
  if (xin != yin) {
    ierr = VecGetArrayRead(xin,&xa);CHKERRQ(ierr);
    ierr = VecGetArray(yin,&ya);CHKERRQ(ierr);
    ierr = PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = VecRestoreArrayRead(xin,&xa);CHKERRQ(ierr);
    ierr = VecRestoreArray(yin,&ya);CHKERRQ(ierr);
  }
  return(0);
}

/*
    VecSetRandom_SeqCUSP_Private - Fills every local entry of xin with a value drawn from r

    The entries are written one by one through the array returned by VecGetArray().

    Returns 0 on success; any PETSc error is propagated to the caller.
*/
static PetscErrorCode VecSetRandom_SeqCUSP_Private(Vec xin,PetscRandom r)
{
  PetscErrorCode ierr;
  PetscInt       n = xin->map->n,i;
  PetscScalar    *xx;

  ierr = VecGetArray(xin,&xx);CHKERRQ(ierr);
  for (i=0; i<n; i++) {ierr = PetscRandomGetValue(r,&xx[i]);CHKERRQ(ierr);}
  ierr = VecRestoreArray(xin,&xx);CHKERRQ(ierr);
  return(0);
}

/*
    VecDestroy_SeqCUSP_Private - Releases the host-side storage of a sequential CUSP vector

    Frees the host array when it was allocated by the vector itself (array_allocated)
    and then the Vec_Seq record.  The record is freed through v->data so that the
    vector does not keep a dangling pointer to it.

    Returns 0 on success; any PETSc error is propagated to the caller.
*/
static PetscErrorCode VecDestroy_SeqCUSP_Private(Vec v)
{
  Vec_Seq        *vs = (Vec_Seq*)v->data;
  PetscErrorCode ierr;

  ierr = PetscObjectSAWsViewOff(v);CHKERRQ(ierr);
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
#endif
  if (vs->array_allocated) {ierr = PetscFree(vs->array_allocated);CHKERRQ(ierr);}
  ierr = PetscFree(v->data);CHKERRQ(ierr);
  return(0);
}

/*
    VecResetArray_SeqCUSP_Private - Makes the saved (unplaced) array the active host
    array again and clears the saved slot.
*/
static PetscErrorCode VecResetArray_SeqCUSP_Private(Vec vin)
{
  Vec_Seq     *seq   = (Vec_Seq*)vin->data;
  PetscScalar *saved = seq->unplacedarray;

  seq->unplacedarray = 0;
  seq->array         = saved;
  return(0);
}

/* The following public versions are necessary because we use CUSP in the regular PETSc code and these need to be called from plain C code. */

/*
    VecCUSPAllocateCheck_Public - Wrapper around VecCUSPAllocateCheck()

    Returns whatever error VecCUSPAllocateCheck() reports instead of discarding it.
*/
PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
{
  PetscErrorCode ierr;

  ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
  return(0);
}

/*
    VecCUSPCopyToGPU_Public - Wrapper around VecCUSPCopyToGPU()

    Returns whatever error VecCUSPCopyToGPU() reports instead of discarding it.
*/
PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyToGPU(v);CHKERRQ(ierr);
  return(0);
}



/*
    VecCUSPCopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector

   Input Parameters:
+    v - the vector
-    ci - the requested indices, this should be created with CUSPIndicesCreate()

   Returns whatever error VecCUSPCopyToGPUSome() reports instead of discarding it.
*/
PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyToGPUSome(v,ci);CHKERRQ(ierr);
  return(0);
}

/*
  VecCUSPCopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector

  Input Parameters:
+    v - the vector
-    ci - the requested indices, this should be created with CUSPIndicesCreate()

  Returns whatever error VecCUSPCopyFromGPUSome() reports instead of discarding it.
*/
PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPUSome(v,ci);CHKERRQ(ierr);
  return(0);
}

375: /*MC
376:    VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

378:    Options Database Keys:
379: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

381:   Level: beginner

383: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
384: M*/

/* AYPX support for VecAYPX_SeqCUSP: y <- alpha*y + x */
namespace cusp
{
namespace blas
{
namespace detail
{
  /* Binary functor: given (x,y) returns alpha*y + x for the stored scalar alpha */
  template <typename T>
  struct AYPX : public thrust::binary_function<T,T,T>
  {
    T alpha;

    AYPX(T scale) : alpha(scale) {}

    __host__ __device__
    T operator()(T x, T y)
    {
      return alpha * y + x;
    }
  };
}

/* Iterator form: overwrites the range starting at ybegin with alpha*y + x,
   where x runs over [xbegin,xend) */
template <typename XIterator, typename YIterator, typename ScalarType>
void aypx(XIterator xbegin, XIterator xend, YIterator ybegin, ScalarType alpha)
{
  detail::AYPX<ScalarType> op(alpha);
  thrust::transform(xbegin, xend, ybegin, ybegin, op);
}

/* Array form: checks that x and y have the same dimensions, then forwards
   to the iterator form */
template <typename Array1, typename Array2, typename ScalarType>
void aypx(const Array1& x, Array2& y, ScalarType alpha)
{
  detail::assert_same_dimensions(x,y);
  aypx(x.begin(), x.end(), y.begin(), alpha);
}
}
}

/*
    VecAYPX_SeqCUSP - Computes yin <- alpha*yin + xin on the GPU

    For alpha == 0 this reduces to a copy of xin into yin and no flops are logged.

    Returns 0 on success; any PETSc, CUDA or CUSP error is propagated to the caller.
*/
PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  try {
    if (alpha != 0.0) {
      cusp::blas::aypx(*xarray,*yarray,alpha);
      ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
    } else {
      cusp::blas::copy(*xarray,*yarray);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  return(0);
}


/*
    VecAXPY_SeqCUSP - Computes yin <- yin + alpha*xin on the GPU

    Nothing is done (and no array is touched) when alpha == 0.

    Returns 0 on success; any PETSc, CUDA or CUSP error is propagated to the caller.
*/
PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  if (alpha != 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::axpy(*xarray,*yarray,alpha);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  return(0);
}

/* Zip functor: slot 0 <- slot 1 / slot 2 */
struct VecCUSPPointwiseDivide
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry) / thrust::get<2>(entry);
  }
};

/*
    VecPointwiseDivide_SeqCUSP - Computes win[i] <- xin[i] / yin[i] on the GPU

    Returns 0 on success; any PETSc, CUDA or CUSP error is propagated to the caller.
*/
PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
{
  CUSPARRAY      *warray=NULL,*xarray=NULL,*yarray=NULL;
  PetscErrorCode ierr;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(win,&warray);CHKERRQ(ierr);
  try {
    thrust::for_each(
      thrust::make_zip_iterator(
        thrust::make_tuple(
          warray->begin(),
          xarray->begin(),
          yarray->begin())),
      thrust::make_zip_iterator(
        thrust::make_tuple(
          warray->end(),
          xarray->end(),
          yarray->end())),
      VecCUSPPointwiseDivide());
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  return(0);
}


/* Zip functor: slot 0 <- slot 1 + slot 2 * slot 3 */
struct VecCUSPWAXPY
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry) + thrust::get<2>(entry)*thrust::get<3>(entry);
  }
};

/* Zip functor: slot 0 <- slot 1 + slot 2 */
struct VecCUSPSum
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry) + thrust::get<2>(entry);
  }
};

/* Zip functor: slot 0 <- slot 1 - slot 2 */
struct VecCUSPDiff
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry) - thrust::get<2>(entry);
  }
};

/*
    VecWAXPY_SeqCUSP - Computes win <- alpha*xin + yin on the GPU

    Special cases:
      alpha ==  0 : win is a copy of yin (VecCopy_SeqCUSP())
      alpha ==  1 : win <- yin + xin
      alpha == -1 : win <- yin - xin
    All other values use the general functor with alpha supplied through a constant iterator.

    Returns 0 on success; any PETSc, CUDA or CUSP error is propagated to the caller.
*/
PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  CUSPARRAY      *xarray=NULL,*yarray=NULL,*warray=NULL;
  PetscErrorCode ierr;

  if (alpha == 0.0) {
    ierr = VecCopy_SeqCUSP(yin,win);CHKERRQ(ierr);
  } else {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(win,&warray);CHKERRQ(ierr);
    if (alpha == 1.0) {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              xarray->end())),
          VecCUSPSum());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
    } else if (alpha == -1.0) {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              xarray->end())),
          VecCUSPDiff());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
    } else {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              thrust::make_constant_iterator(alpha),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              thrust::make_constant_iterator(alpha),
              xarray->end())),
          VecCUSPWAXPY());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(2*win->map->n);CHKERRQ(ierr);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  }
  return(0);
}

/* Functors for the CUSP implementation of MAXPY with the loop unrolled on the CPU.
   Each one expects a zipped tuple (y, a1, x1, a2, x2, ...) and updates slot 0 in place. */
struct VecCUSPMAXPY4
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped entry)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
    thrust::get<0>(entry) += thrust::get<1>(entry)*thrust::get<2>(entry)+thrust::get<3>(entry)*thrust::get<4>(entry)+thrust::get<5>(entry)*thrust::get<6>(entry)+thrust::get<7>(entry)*thrust::get<8>(entry);
  }
};


/* Zip functor over (y, a1, x1, a2, x2, a3, x3) */
struct VecCUSPMAXPY3
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped entry)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 */
    thrust::get<0>(entry) += thrust::get<1>(entry)*thrust::get<2>(entry)+thrust::get<3>(entry)*thrust::get<4>(entry)+thrust::get<5>(entry)*thrust::get<6>(entry);
  }
};

/* Zip functor over (y, a1, x1, a2, x2) */
struct VecCUSPMAXPY2
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped entry)
  {
    /* y += a1*x1 + a2*x2 */
    thrust::get<0>(entry) += thrust::get<1>(entry)*thrust::get<2>(entry)+thrust::get<3>(entry)*thrust::get<4>(entry);
  }
};
/*
    VecMAXPY_SeqCUSP - Computes xin <- xin + sum_j alpha[j]*y[j] on the GPU

    The first nv mod 4 vectors are handled by a dedicated 3-, 2- or 1-vector update;
    the remaining vectors are processed four at a time.

    Input Parameters:
+   xin   - the vector that is updated in place
.   nv    - number of vectors in y and of scalars in alpha
.   alpha - the scalars
-   y     - the vectors

    Every array obtained with VecCUSPGetArrayRead() is restored again, including in
    the two-vector remainder case.

    Returns 0 on success; any PETSc, CUDA or CUSP error is propagated to the caller.
*/
PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
  PetscInt       n = xin->map->n,j,j_rem;
  PetscScalar    alpha0,alpha1,alpha2,alpha3;

  ierr = PetscLogFlops(nv*2.0*n);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  switch (j_rem=nv&0x3) {
  case 3:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha += 3;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[2],&yy2);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin(),
            thrust::make_constant_iterator(alpha2),
            yy2->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end(),
            thrust::make_constant_iterator(alpha2),
            yy2->end())),
        VecCUSPMAXPY3());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[2],&yy2);CHKERRQ(ierr);
    y   += 3;
    break;
  case 2:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha +=2;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end())),
        VecCUSPMAXPY2());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    /* pair the two gets above with restores, as the 3- and 4-vector paths do */
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    y +=2;
    break;
  case 1:
    alpha0 = *alpha++;
    ierr   = VecAXPY_SeqCUSP(xin,alpha0,y[0]);CHKERRQ(ierr);
    y     +=1;
    break;
  }
  for (j=j_rem; j<nv; j+=4) {
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha3 = alpha[3];
    alpha += 4;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[3],&yy3);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin(),
            thrust::make_constant_iterator(alpha2),
            yy2->begin(),
            thrust::make_constant_iterator(alpha3),
            yy3->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end(),
            thrust::make_constant_iterator(alpha2),
            yy2->end(),
            thrust::make_constant_iterator(alpha3),
            yy3->end())),
        VecCUSPMAXPY4());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[3],&yy3);CHKERRQ(ierr);
    y   += 4;
  }
  ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}


/*
    VecDot_SeqCUSP - Computes the dot product of xin and yin on the GPU and stores it in z

    With complex scalars cusp::blas::dotc(y,x) is used, otherwise cusp::blas::dot(y,x).
    Flops are only logged for non-empty vectors.

    Returns 0 on success; any PETSc, CUDA or CUSP error is propagated to the caller.
*/
PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  try {
#if defined(PETSC_USE_COMPLEX)
    *z = cusp::blas::dotc(*yarray,*xarray);
#else
    *z = cusp::blas::dot(*yarray,*xarray);
#endif
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n >0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  return(0);
}

830: //
831: // CUDA kernels for MDot to follow
832: //

834: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
835: #define MDOT_WORKGROUP_SIZE 128
836: #define MDOT_WORKGROUP_NUM  128

// M = 2: per-block partial sums of x.y0 and x.y1
//
// Each block works on one contiguous slice of the vectors and writes its two partial
// sums to group_results[blockIdx.x] and group_results[blockIdx.x + gridDim.x], so
// group_results needs at least 2*gridDim.x entries.
// The shared buffer has MDOT_WORKGROUP_SIZE slots per sum, indexed by threadIdx.x;
// the halving reduction only covers every slot when blockDim.x is a power of 2.
__global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar partial[2*MDOT_WORKGROUP_SIZE];

  // slice of the vector owned by this block
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // for very small vectors, a group should still do some work
  PetscInt first = blockIdx.x * chunk;
  PetscInt last  = min((blockIdx.x + 1) * chunk, size); // don't go beyond vec size

  PetscScalar xk   = 0;
  PetscScalar acc0 = 0;
  PetscScalar acc1 = 0;
  for (PetscInt k = first + threadIdx.x; k < last; k += blockDim.x) {
    xk    = x[k];   // load only once from global memory!
    acc0 += xk * y0[k];
    acc1 += xk * y1[k];
  }
  partial[threadIdx.x]                       = acc0;
  partial[threadIdx.x + MDOT_WORKGROUP_SIZE] = acc1;

  // parallel reduction: the barrier at the top of each pass orders it after the previous writes
  for (PetscInt half = blockDim.x/2; half > 0; half /= 2) {
    __syncthreads();
    if (threadIdx.x < half) {
      partial[threadIdx.x                      ] += partial[threadIdx.x+half                      ];
      partial[threadIdx.x + MDOT_WORKGROUP_SIZE] += partial[threadIdx.x+half + MDOT_WORKGROUP_SIZE];
    }
  }

  // thread 0 publishes the block's results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = partial[0];
    group_results[blockIdx.x + gridDim.x] = partial[MDOT_WORKGROUP_SIZE];
  }
}

// M = 3: per-block partial sums of x.y0, x.y1 and x.y2
//
// Each block works on one contiguous slice of the vectors and writes partial sum m to
// group_results[blockIdx.x + m*gridDim.x], so group_results needs at least
// 3*gridDim.x entries.
// The shared buffer has MDOT_WORKGROUP_SIZE slots per sum, indexed by threadIdx.x;
// the halving reduction only covers every slot when blockDim.x is a power of 2.
__global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar partial[3*MDOT_WORKGROUP_SIZE];

  // slice of the vector owned by this block
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // for very small vectors, a group should still do some work
  PetscInt first = blockIdx.x * chunk;
  PetscInt last  = min((blockIdx.x + 1) * chunk, size); // don't go beyond vec size

  PetscScalar xk   = 0;
  PetscScalar acc0 = 0;
  PetscScalar acc1 = 0;
  PetscScalar acc2 = 0;
  for (PetscInt k = first + threadIdx.x; k < last; k += blockDim.x) {
    xk    = x[k];   // load only once from global memory!
    acc0 += xk * y0[k];
    acc1 += xk * y1[k];
    acc2 += xk * y2[k];
  }
  partial[threadIdx.x]                           = acc0;
  partial[threadIdx.x +     MDOT_WORKGROUP_SIZE] = acc1;
  partial[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = acc2;

  // parallel reduction: the barrier at the top of each pass orders it after the previous writes
  for (PetscInt half = blockDim.x/2; half > 0; half /= 2) {
    __syncthreads();
    if (threadIdx.x < half) {
      partial[threadIdx.x                          ] += partial[threadIdx.x+half                          ];
      partial[threadIdx.x +     MDOT_WORKGROUP_SIZE] += partial[threadIdx.x+half +     MDOT_WORKGROUP_SIZE];
      partial[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += partial[threadIdx.x+half + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // thread 0 publishes the block's results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = partial[0];
    group_results[blockIdx.x +     gridDim.x] = partial[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = partial[2 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 4: per-block partial sums of x.y0, x.y1, x.y2 and x.y3
//
// Each block works on one contiguous slice of the vectors and writes partial sum m to
// group_results[blockIdx.x + m*gridDim.x], so group_results needs at least
// 4*gridDim.x entries.
// The shared buffer has MDOT_WORKGROUP_SIZE slots per sum, indexed by threadIdx.x;
// the halving reduction only covers every slot when blockDim.x is a power of 2.
__global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar partial[4*MDOT_WORKGROUP_SIZE];

  // slice of the vector owned by this block
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // for very small vectors, a group should still do some work
  PetscInt first = blockIdx.x * chunk;
  PetscInt last  = min((blockIdx.x + 1) * chunk, size); // don't go beyond vec size

  PetscScalar xk   = 0;
  PetscScalar acc0 = 0;
  PetscScalar acc1 = 0;
  PetscScalar acc2 = 0;
  PetscScalar acc3 = 0;
  for (PetscInt k = first + threadIdx.x; k < last; k += blockDim.x) {
    xk    = x[k];   // load only once from global memory!
    acc0 += xk * y0[k];
    acc1 += xk * y1[k];
    acc2 += xk * y2[k];
    acc3 += xk * y3[k];
  }
  partial[threadIdx.x]                           = acc0;
  partial[threadIdx.x +     MDOT_WORKGROUP_SIZE] = acc1;
  partial[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = acc2;
  partial[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = acc3;

  // parallel reduction: the barrier at the top of each pass orders it after the previous writes
  for (PetscInt half = blockDim.x/2; half > 0; half /= 2) {
    __syncthreads();
    if (threadIdx.x < half) {
      partial[threadIdx.x                          ] += partial[threadIdx.x+half                          ];
      partial[threadIdx.x +     MDOT_WORKGROUP_SIZE] += partial[threadIdx.x+half +     MDOT_WORKGROUP_SIZE];
      partial[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += partial[threadIdx.x+half + 2 * MDOT_WORKGROUP_SIZE];
      partial[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += partial[threadIdx.x+half + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // thread 0 publishes the block's results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = partial[0];
    group_results[blockIdx.x +     gridDim.x] = partial[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = partial[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = partial[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
/*
   VecMDot_SeqCUSP_kernel8 - Per-block partial dot products of x with the eight vectors y0..y7.

   Each thread block works on one contiguous slice of the `size` entries. Every thread accumulates
   eight private sums over its share of the slice, the block combines them in shared memory, and
   thread 0 stores the eight block totals at group_results[blockIdx.x + k*gridDim.x], k = 0..7.
   The caller is expected to add up the gridDim.x partial results of each vector.

   The shared scratch array provides MDOT_WORKGROUP_SIZE slots per vector, so the kernel must be
   launched with at most MDOT_WORKGROUP_SIZE threads per block.
   NOTE(review): the halving reduction presumably expects blockDim.x to be a power of two -- confirm.
*/
__global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                          const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                          PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar scratch[8*MDOT_WORKGROUP_SIZE];

  // slice of the vectors handled by this block
  PetscInt chunk = (size - 1) / gridDim.x + 1;
  if (chunk == 0) chunk = 1;  // for very small vectors, a group should still do some work
  const PetscInt first = blockIdx.x * chunk;
  const PetscInt last  = min((blockIdx.x + 1) * chunk, size); // don't go beyond vec size

  // thread-private partial sums, one per y vector
  PetscScalar acc0 = 0;
  PetscScalar acc1 = 0;
  PetscScalar acc2 = 0;
  PetscScalar acc3 = 0;
  PetscScalar acc4 = 0;
  PetscScalar acc5 = 0;
  PetscScalar acc6 = 0;
  PetscScalar acc7 = 0;
  for (PetscInt i = first + threadIdx.x; i < last; i += blockDim.x) {
    const PetscScalar xi = x[i];   // load only once from global memory!
    acc0 += xi * y0[i];
    acc1 += xi * y1[i];
    acc2 += xi * y2[i];
    acc3 += xi * y3[i];
    acc4 += xi * y4[i];
    acc5 += xi * y5[i];
    acc6 += xi * y6[i];
    acc7 += xi * y7[i];
  }

  // publish the private sums: vector k occupies scratch[k*MDOT_WORKGROUP_SIZE ...]
  scratch[threadIdx.x                          ] = acc0;
  scratch[threadIdx.x +     MDOT_WORKGROUP_SIZE] = acc1;
  scratch[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = acc2;
  scratch[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = acc3;
  scratch[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = acc4;
  scratch[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = acc5;
  scratch[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = acc6;
  scratch[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = acc7;

  // parallel reduction; the barrier sits at the top so that every read sees the previous round's writes
  for (PetscInt half = blockDim.x/2; half > 0; half /= 2) {
    __syncthreads();
    if (threadIdx.x < half) {
      for (PetscInt k = 0; k < 8; ++k) {
        scratch[threadIdx.x + k * MDOT_WORKGROUP_SIZE] += scratch[threadIdx.x + half + k * MDOT_WORKGROUP_SIZE];
      }
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    for (PetscInt k = 0; k < 8; ++k) {
      group_results[blockIdx.x + k * gridDim.x] = scratch[k * MDOT_WORKGROUP_SIZE];
    }
  }
}


/*
   VecMDot_SeqCUSP - Computes the nv dot products of xin with the vectors yin[0..nv-1] on the GPU.

   Input Parameters:
+  xin - the vector shared by all dot products
.  nv  - number of vectors in yin, must be positive
-  yin - the vectors to take dot products with

   Output Parameter:
.  z - array of length nv, z[i] receives the dot product of xin and yin[i]

   Notes:
   The vectors of yin are consumed in batches: eight at a time while at least eight remain,
   then four (for 4..7 remaining), three, two, or a single vector.

   For real scalars each batch of two or more vectors is handled by one launch of the matching
   VecMDot_SeqCUSP_kernelN; the per-work-group partial sums are copied to the host and added up there.
   A single remaining vector is handled by cusp::blas::dot.

   For complex scalars every dot product uses cusp::blas::dotc(y,x), so that a vector gives the same
   result whether it is processed inside a batch or as the trailing single vector.
*/
PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscErrorCode ierr;
  PetscInt       i,j,n = xin->map->n,current_y_index = 0;
  CUSPARRAY      *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
  PetscScalar    *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
  PetscScalar    group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
  cudaError_t    cuda_ierr;

  if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");

  // allocate scratchpad memory for the results of individual work groups:
  cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
  if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  xptr = thrust::raw_pointer_cast(xarray->data());

  while (current_y_index < nv)
  {
    switch (nv - current_y_index) {

    case 7:
    case 6:
    case 5:
    case 4:
      ierr = VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);CHKERRQ(ierr);

#if defined(PETSC_USE_COMPLEX)
      // conjugated dot product, same as the single-vector case below
      z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
      z[current_y_index+3] = cusp::blas::dotc(*y3array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());
      y3ptr = thrust::raw_pointer_cast(y3array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

      // copy results back to the host (blocking copy, so the kernel has finished afterwards)
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<4; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);CHKERRQ(ierr);
      current_y_index += 4;
      break;

    case 3:
      ierr = VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);

#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

      // copy results back to the host
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<3; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif

      ierr = VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      current_y_index += 3;
      break;

    case 2:
      ierr = VecCUSPGetArrayRead(yin[current_y_index],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);

#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

      // copy results back to the host
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<2; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      current_y_index += 2;
      break;

    case 1:
      ierr = VecCUSPGetArrayRead(yin[current_y_index],&y0array);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
#else
      z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);CHKERRQ(ierr);
      current_y_index += 1;
      break;

    default: // 8 or more vectors left
      ierr = VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);CHKERRQ(ierr);

#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
      z[current_y_index+3] = cusp::blas::dotc(*y3array,*xarray);
      z[current_y_index+4] = cusp::blas::dotc(*y4array,*xarray);
      z[current_y_index+5] = cusp::blas::dotc(*y5array,*xarray);
      z[current_y_index+6] = cusp::blas::dotc(*y6array,*xarray);
      z[current_y_index+7] = cusp::blas::dotc(*y7array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());
      y3ptr = thrust::raw_pointer_cast(y3array->data());
      y4ptr = thrust::raw_pointer_cast(y4array->data());
      y5ptr = thrust::raw_pointer_cast(y5array->data());
      y6ptr = thrust::raw_pointer_cast(y6array->data());
      y7ptr = thrust::raw_pointer_cast(y7array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

      // copy results back to the host
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<8; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);CHKERRQ(ierr);
      current_y_index += 8;
      break;
    }
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);

  // release the scratchpad memory
  cuda_ierr = cudaFree(group_results_gpu);
  if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
  ierr = PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));CHKERRQ(ierr);
  return(0);
}

1237: #undef MDOT_WORKGROUP_SIZE
1238: #undef MDOT_WORKGROUP_NUM



/*
   VecSet_SeqCUSP - Sets every entry of the GPU array of xin to alpha.

   Input Parameters:
+  xin   - the vector
-  alpha - the value to store in every entry

   The fill is done with cusp::blas::fill; a char* exception thrown by CUSP is reported as PETSC_ERR_LIB.
*/
PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray=NULL;

  /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
  ierr = VecCUSPGetArrayWrite(xin,&xarray);CHKERRQ(ierr);
  try {
    cusp::blas::fill(*xarray,alpha);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = VecCUSPRestoreArrayWrite(xin,&xarray);CHKERRQ(ierr);
  return(0);
}

/*
   VecScale_SeqCUSP - Scales the vector in place, xin = alpha*xin.

   Input Parameters:
+  xin   - the vector to scale
-  alpha - the scaling factor

   Notes:
   alpha == 0 is delegated to VecSet_SeqCUSP() and alpha == 1 leaves the data untouched;
   every other value goes through cusp::blas::scal. The flop count is logged in all three cases.
*/
PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray;

  if (alpha == 0.0) {
    ierr = VecSet_SeqCUSP(xin,alpha);CHKERRQ(ierr);
  } else if (alpha != 1.0) {
    ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    try {
      cusp::blas::scal(*xarray,alpha);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
  return(0);
}


/*
   VecTDot_SeqCUSP - Computes *z = cusp::blas::dot(x,y) for the GPU arrays of xin and yin.

   Input Parameters:
+  xin - first vector
-  yin - second vector

   Output Parameter:
.  z - the result

   Notes:
   2n-1 flops are logged when the local length n is positive.
*/
PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yarray;

  //#if defined(PETSC_USE_COMPLEX)
  /*Not working for complex*/
  //#else
  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  try {
    *z = cusp::blas::dot(*xarray,*yarray);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  //#endif
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  return(0);
}
/*
   VecCopy_SeqCUSP - Copies xin into yin, choosing between a GPU copy and a CPU copy.

   Input Parameter:
.  xin - the source vector

   Output Parameter:
.  yin - the destination vector

   Notes:
   Nothing is done when xin and yin are the same vector. Otherwise the place of the copy
   follows xin->valid_GPU_array:
     PETSC_CUSP_GPU  - copy on the GPU with cusp::blas::copy
     PETSC_CUSP_CPU  - copy on the CPU with VecCopy_SeqCUSP_Private()
     PETSC_CUSP_BOTH - copy on the GPU when yin is valid on the GPU or in both places
                       (an arbitrary choice in the latter case), otherwise on the CPU
   For any other state of xin no copy is performed.
*/
PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yarray;
  PetscBool      copyOnGPU = PETSC_FALSE,copyOnCPU = PETSC_FALSE;

  if (xin == yin) return(0);

  if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
    copyOnGPU = PETSC_TRUE;
  } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
    /* copy in CPU if we are on the CPU*/
    copyOnCPU = PETSC_TRUE;
  } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
    /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
    if (yin->valid_GPU_array == PETSC_CUSP_GPU || yin->valid_GPU_array == PETSC_CUSP_BOTH) {
      /* when yin is valid in both places (or was unallocated before the earlier call to allocatecheck)
         the GPU is an arbitrary choice */
      copyOnGPU = PETSC_TRUE;
    } else {
      copyOnCPU = PETSC_TRUE;
    }
  }

  if (copyOnGPU) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::copy(*xarray,*yarray);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
  } else if (copyOnCPU) {
    ierr = VecCopy_SeqCUSP_Private(xin,yin);CHKERRQ(ierr);
  }
  return(0);
}


/*
   VecSwap_SeqCUSP - Exchanges the entries of xin and yin on the GPU with the cuBLAS swap routine
   matching the PetscScalar type.

   Input Parameters:
+  xin - first vector
-  yin - second vector

   Notes:
   Nothing is swapped when xin and yin are the same vector.
*/
PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscBLASInt   one = 1,bn;
  CUSPARRAY      *xarray,*yarray;

  ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
  if (xin != yin) {
    ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);

#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
    cublasCswap(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);
#else
    cublasZswap(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);
#endif
#else
#if defined(PETSC_USE_REAL_SINGLE)
    cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
#else
    cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
#endif
#endif
    /* the swap routines above return nothing; their status has to be fetched explicitly */
    ierr = cublasGetError();CHKERRCUSP(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  }
  return(0);
}

/* Functor applied to a zipped tuple: element 0 receives the product of elements 1 and 2.
   VecAXPBY_SeqCUSP() zips (y, alpha, x), which yields y = alpha*x. */
struct VecCUSPAX
{
  template <class Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
  }
};
/*
   VecAXPBY_SeqCUSP - Computes yin = alpha*xin + beta*yin on the GPU.

   Input Parameters:
+  yin   - the vector that is updated
.  alpha - factor applied to xin
.  beta  - factor applied to yin
-  xin   - the vector that is added

   Notes:
   Special cases are dispatched in this order:
     alpha == 0 - VecScale_SeqCUSP(yin,beta)
     beta  == 1 - VecAXPY_SeqCUSP(yin,alpha,xin)
     alpha == 1 - VecAYPX_SeqCUSP(yin,beta,xin)
     beta  == 0 - yin = alpha*xin through the VecCUSPAX functor
   Everything else uses cusp::blas::axpby.
*/
PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscErrorCode ierr;
  PetscScalar    a = alpha,b = beta;
  CUSPARRAY      *xarray,*yarray;

  if (a == 0.0) {
    ierr = VecScale_SeqCUSP(yin,beta);CHKERRQ(ierr);
  } else if (b == 1.0) {
    ierr = VecAXPY_SeqCUSP(yin,alpha,xin);CHKERRQ(ierr);
  } else if (a == 1.0) {
    ierr = VecAYPX_SeqCUSP(yin,beta,xin);CHKERRQ(ierr);
  } else if (b == 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            yarray->begin(),
            thrust::make_constant_iterator(a),
            xarray->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            yarray->end(),
            thrust::make_constant_iterator(a),
            xarray->end())),
        VecCUSPAX());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  } else {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(3.0*xin->map->n);CHKERRQ(ierr);
  }
  return(0);
}

/* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */

/* z = x + b*y + c*z, for a tuple laid out as (z, c, x, y, b):
   element 0 is overwritten with elem1*elem0 + elem2 + elem4*elem3 */
struct VecCUSPXPBYPCZ
{
  template <class Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
  }
};
/* z = a*x + b*y + z, for a tuple laid out as (z, x, a, y, b):
   elem2*elem1 + elem4*elem3 is added to element 0 */
struct VecCUSPAXPBYPZ
{
  template <class Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
  }
};

/*
   VecAXPBYPCZ_SeqCUSP - Computes zin = alpha*xin + beta*yin + gamma*zin on the GPU.

   Input Parameters:
+  zin   - the vector that is updated
.  alpha - factor applied to xin
.  beta  - factor applied to yin
.  gamma - factor applied to zin
.  xin   - first vector that is added
-  yin   - second vector that is added

   Notes:
   alpha == 1 and gamma == 1 are handled by the VecCUSPXPBYPCZ and VecCUSPAXPBYPZ functors
   (4n flops logged); the general case uses cusp::blas::axpbypcz (5n flops logged).

   The three arrays obtained at the start are restored on every path, not only in the general case.
*/
PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscInt       n = zin->map->n;
  CUSPARRAY      *xarray,*yarray,*zarray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(zin,&zarray);CHKERRQ(ierr);
  if (alpha == 1.0) {
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->begin(),
            thrust::make_constant_iterator(gamma),
            xarray->begin(),
            yarray->begin(),
            thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->end(),
            thrust::make_constant_iterator(gamma),
            xarray->end(),
            yarray->end(),
            thrust::make_constant_iterator(beta))),
        VecCUSPXPBYPCZ());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else if (gamma == 1.0) {
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->begin(),
            xarray->begin(),
            thrust::make_constant_iterator(alpha),
            yarray->begin(),
            thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->end(),
            xarray->end(),
            thrust::make_constant_iterator(alpha),
            yarray->end(),
            thrust::make_constant_iterator(beta))),
        VecCUSPAXPBYPZ());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else {
    try {
      cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(5.0*n);CHKERRQ(ierr);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* every Get above needs its Restore, whichever branch did the work */
  ierr = VecCUSPRestoreArrayReadWrite(zin,&zarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  return(0);
}

/*
   VecPointwiseMult_SeqCUSP - Stores the result of cusp::blas::xmy(x,y,w) in win,
   i.e. the entrywise product of xin and yin.

   Input Parameters:
+  xin - first factor
-  yin - second factor

   Output Parameter:
.  win - the vector receiving the product
*/
PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscInt       n = win->map->n;
  CUSPARRAY      *xarray,*yarray,*warray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(win,&warray);CHKERRQ(ierr);
  try {
    cusp::blas::xmy(*xarray,*yarray,*warray);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(win,&warray);CHKERRQ(ierr);
  ierr = PetscLogFlops(n);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}


/* should do infinity norm in cusp */

/*
   VecNorm_SeqCUSP - Computes a norm of xin.

   Input Parameters:
+  xin  - the vector
-  type - NORM_1, NORM_2, NORM_FROBENIUS, NORM_INFINITY or NORM_1_AND_2

   Output Parameter:
.  z - the norm; for NORM_1_AND_2 the 1-norm is stored in z[0] and the 2-norm in z[1]

   Notes:
   NORM_2 and NORM_FROBENIUS use cusp::blas::nrm2 and NORM_1 uses the cuBLAS asum routine matching
   the scalar type. NORM_INFINITY is computed with a loop over the array returned by VecGetArrayRead();
   a NaN entry ends that loop and is returned as the result.
   For any other value of type *z is left untouched.
*/
PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
{
  const PetscScalar *xx;
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn;
  CUSPARRAY         *xarray;

  ierr = PetscBLASIntCast(n,&bn);CHKERRQ(ierr);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    try {
      *z = cusp::blas::nrm2(*xarray);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr);
  } else if (type == NORM_INFINITY) {
    PetscInt  i;
    PetscReal max = 0.0,tmp;

    ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
    /* index the array instead of advancing xx, so the pointer handed back below is the one obtained above */
    for (i=0; i<n; i++) {
      tmp = PetscAbsScalar(xx[i]);
      if (tmp > max) max = tmp;
      /* check special case of tmp == NaN */
      if (tmp != tmp) {max = tmp; break;}
    }
    ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
    *z   = max;
  } else if (type == NORM_1) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
    *z = cublasScasum(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one);
#else
    *z = cublasDzasum(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one);
#endif
#else
#if defined(PETSC_USE_REAL_SINGLE)
    *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
#else
    *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
#endif
#endif
    ierr = cublasGetError();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr);
  } else if (type == NORM_1_AND_2) {
    ierr = VecNorm_SeqCUSP(xin,NORM_1,z);CHKERRQ(ierr);
    ierr = VecNorm_SeqCUSP(xin,NORM_2,z+1);CHKERRQ(ierr);
  }
  return(0);
}


/* the following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */

/*
   VecSetRandom_SeqCUSP - Fills xin through VecSetRandom_SeqCUSP_Private() and then marks
   the CPU copy as the valid one.

   Input Parameters:
+  xin - the vector to fill
-  r   - the random number context
*/
PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
{
  PetscErrorCode ierr;

  ierr = VecSetRandom_SeqCUSP_Private(xin,r);CHKERRQ(ierr);
  xin->valid_GPU_array = PETSC_CUSP_CPU;
  return(0);
}

/*
   VecResetArray_SeqCUSP - Calls VecCUSPCopyFromGPU() on vin, resets the host array through
   VecResetArray_SeqCUSP_Private() and marks the CPU copy as the valid one.

   Input Parameter:
.  vin - the vector
*/
PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecResetArray_SeqCUSP_Private(vin);CHKERRQ(ierr);
  vin->valid_GPU_array = PETSC_CUSP_CPU;
  return(0);
}

/*
   VecPlaceArray_SeqCUSP - Calls VecCUSPCopyFromGPU() on vin, installs the user array through
   VecPlaceArray_Seq() and marks the CPU copy as the valid one.

   Input Parameters:
+  vin - the vector
-  a   - the array to place in the vector
*/
PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecPlaceArray_Seq(vin,a);CHKERRQ(ierr);
  vin->valid_GPU_array = PETSC_CUSP_CPU;
  return(0);
}


/*
   VecReplaceArray_SeqCUSP - Calls VecCUSPCopyFromGPU() on vin, replaces the host array through
   VecReplaceArray_Seq() and marks the CPU copy as the valid one.

   Input Parameters:
+  vin - the vector
-  a   - the array that replaces the current one
*/
PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecReplaceArray_Seq(vin,a);CHKERRQ(ierr);
  vin->valid_GPU_array = PETSC_CUSP_CPU;
  return(0);
}


/*@
   VecCreateSeqCUSP - Creates a standard, sequential array-style vector of type VECSEQCUSP.

   Collective on MPI_Comm

   Input Parameter:
+  comm - the communicator, should be PETSC_COMM_SELF
-  n - the vector length

   Output Parameter:
.  V - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   Level: intermediate

   Concepts: vectors^creating sequential

.seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
@*/
PetscErrorCode  VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
{
  PetscErrorCode ierr;

  ierr = VecCreate(comm,v);CHKERRQ(ierr);
  /* local and global size are both n */
  ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
  ierr = VecSetType(*v,VECSEQCUSP);CHKERRQ(ierr);
  return(0);
}

/* The following template functions are for VecDotNorm2_SeqCUSP.  Note that there is no complex support as currently written */

/* Transform step: maps a pair (a,b) to the pair (a*b, b*b).
   For complex scalars the body is compiled out, so the functor must not be instantiated there. */
template <class T>
struct cuspdotnormcalculate : thrust::unary_function<T,T>
{
  __host__ __device__ T operator()(T x)
  {
#if defined(PETSC_USE_COMPLEX)
    //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
#else
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
#endif
  }
};

/* Reduction step for VecDotNorm2_SeqCUSP: adds two pairs component by component. */
template <class T>
struct cuspdotnormreduce : thrust::binary_function<T,T,T>
{
  __host__ __device__ T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
  }
};

/*
   VecDotNorm2_SeqCUSP - Computes a dot product and a squared norm in one pass over the data.

   Input Parameters:
+  s - first vector
-  t - second vector

   Output Parameters:
+  dp - for real scalars, the sum over i of s[i]*t[i]
-  nm - for real scalars, the sum over i of t[i]*t[i]

   Notes:
   For real scalars both values come from a single thrust::transform_reduce over the zipped arrays.
   For complex scalars the fused path is not available and the values are obtained from
   VecDot_SeqCUSP(s,t,dp) and VecDot_SeqCUSP(t,t,nm).
*/
PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  PetscErrorCode                         ierr;
  PetscScalar                            zero = 0.0;
  PetscReal                              n=s->map->n;
  thrust::tuple<PetscScalar,PetscScalar> result;
  CUSPARRAY                              *sarray,*tarray;

  /*VecCUSPCopyToGPU(s);
   VecCUSPCopyToGPU(t);*/
  ierr = VecCUSPGetArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(t,&tarray);CHKERRQ(ierr);
  try {
#if defined(PETSC_USE_COMPLEX)
    ierr = VecDot_SeqCUSP(s,t,dp);CHKERRQ(ierr);
    ierr = VecDot_SeqCUSP(t,t,nm);CHKERRQ(ierr);
    //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
    //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
#else
    result = thrust::transform_reduce(
              thrust::make_zip_iterator(
                thrust::make_tuple(
                  sarray->begin(),
                  tarray->begin())),
              thrust::make_zip_iterator(
                thrust::make_tuple(
                  sarray->end(),
                  tarray->end())),
              cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
              thrust::make_tuple(zero,zero),                                   /*init */
              cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >());  /* binary function */
    *dp = thrust::get<0>(result);
    *nm = thrust::get<1>(result);
#endif
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(t,&tarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  return(0);
}

1823: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1824: {

1828:   VecCreateSeqCUSP(PetscObjectComm((PetscObject)win),win->map->n,V);
1829:   PetscLayoutReference(win->map,&(*V)->map);
1830:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1831:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1832:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1833:   return(0);
1834: }

1838: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1839: {
1841:   Vec_Seq        *s = (Vec_Seq*)v->data;
1842:   cudaError_t    err;
1844:   try {
1845:     if (v->spptr) {
1846:       delete ((Vec_CUSP*)v->spptr)->GPUarray;
1847:       err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);

1849:       /* If the host array has been registered as (page-locked) mapped,
1850:          one must unregister the buffer */
1851:       if (((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked) {
1852:         err = cudaHostUnregister(s->array);CHKERRCUSP(err);
1853:       }
1854:       delete (Vec_CUSP*) v->spptr;
1855:     }
1856:   } catch(char *ex) {
1857:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1858:   }
1859:   VecDestroy_SeqCUSP_Private(v);
1860:   return(0);
1861: }


#if defined(PETSC_USE_COMPLEX)
/*
   conjugate - unary functor returning cusp::conj() of its argument.
   Only compiled when PETSC_USE_COMPLEX is defined.
*/
struct conjugate
{
  __host__ __device__ PetscScalar operator()(PetscScalar z)
  {
    return cusp::conj(z);
  }
};
#endif


1878: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1879: {
1881:   CUSPARRAY      *xarray;

1884:   VecCUSPGetArrayReadWrite(xin,&xarray);
1885: #if defined(PETSC_USE_COMPLEX)
1886:   thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1887: #endif
1888:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
1889:   return(0);
1890: }

1894: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUSP(Vec V)
1895: {
1897:   PetscMPIInt    size;

1900:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
1901:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
1902:   VecCreate_Seq_Private(V,0);
1903:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);

1905:   V->ops->dot             = VecDot_SeqCUSP;
1906:   V->ops->norm            = VecNorm_SeqCUSP;
1907:   V->ops->tdot            = VecTDot_SeqCUSP;
1908:   V->ops->scale           = VecScale_SeqCUSP;
1909:   V->ops->copy            = VecCopy_SeqCUSP;
1910:   V->ops->set             = VecSet_SeqCUSP;
1911:   V->ops->swap            = VecSwap_SeqCUSP;
1912:   V->ops->axpy            = VecAXPY_SeqCUSP;
1913:   V->ops->axpby           = VecAXPBY_SeqCUSP;
1914:   V->ops->axpbypcz        = VecAXPBYPCZ_SeqCUSP;
1915:   V->ops->pointwisemult   = VecPointwiseMult_SeqCUSP;
1916:   V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
1917:   V->ops->setrandom       = VecSetRandom_SeqCUSP;
1918:   V->ops->dot_local       = VecDot_SeqCUSP;
1919:   V->ops->tdot_local      = VecTDot_SeqCUSP;
1920:   V->ops->norm_local      = VecNorm_SeqCUSP;
1921:   V->ops->mdot_local      = VecMDot_SeqCUSP;
1922:   V->ops->maxpy           = VecMAXPY_SeqCUSP;
1923:   V->ops->mdot            = VecMDot_SeqCUSP;
1924:   V->ops->aypx            = VecAYPX_SeqCUSP;
1925:   V->ops->waxpy           = VecWAXPY_SeqCUSP;
1926:   V->ops->dotnorm2        = VecDotNorm2_SeqCUSP;
1927:   V->ops->placearray      = VecPlaceArray_SeqCUSP;
1928:   V->ops->replacearray    = VecReplaceArray_SeqCUSP;
1929:   V->ops->resetarray      = VecResetArray_SeqCUSP;
1930:   V->ops->destroy         = VecDestroy_SeqCUSP;
1931:   V->ops->duplicate       = VecDuplicate_SeqCUSP;
1932:   V->ops->conjugate       = VecConjugate_SeqCUSP;

1934:   VecCUSPAllocateCheck(V);
1935:   V->valid_GPU_array      = PETSC_CUSP_GPU;
1936:   VecSet(V,0.0);
1937:   return(0);
1938: }

1942: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1943: {

1947:   *a   = 0;
1948:   VecCUSPCopyToGPU(v);
1949:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1950:   return(0);
1951: }

1955: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
1956: {

1960:   v->valid_GPU_array = PETSC_CUSP_GPU;

1962:   PetscObjectStateIncrease((PetscObject)v);
1963:   return(0);
1964: }

1968: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
1969: {

1973:   *a   = 0;
1974:   VecCUSPCopyToGPU(v);
1975:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1976:   return(0);
1977: }

1981: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
1982: {
1984:   return(0);
1985: }

1989: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
1990: {

1994:   *a   = 0;
1995:   VecCUSPAllocateCheck(v);
1996:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1997:   return(0);
1998: }

2002: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
2003: {

2007:   v->valid_GPU_array = PETSC_CUSP_GPU;

2009:   PetscObjectStateIncrease((PetscObject)v);
2010:   return(0);
2011: }