Actual source code: veccusp.cu

petsc-dev 2014-02-02
/*
   Implements the sequential cusp vectors.
*/

#include <petscconf.h>
PETSC_CUDA_EXTERN_C_BEGIN
#include <petsc-private/vecimpl.h>          /*I "petscvec.h" I*/
#include <../src/vec/vec/impls/dvecimpl.h>
PETSC_CUDA_EXTERN_C_END
#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

#include <cuda_runtime.h>

/*
    Allocates space for the vector array on the Host if it does not exist.
    Does NOT change the PetscCUSPFlag for the vector
    Does NOT zero the CUSP array
 */
PetscErrorCode VecCUSPAllocateCheckHost(Vec v)
{
  cudaError_t    err;
  PetscScalar    *array;
  Vec_Seq        *s;
  PetscInt       n = v->map->n;

  s    = (Vec_Seq*)v->data;
  VecCUSPAllocateCheck(v);
  if (s->array == 0) {
    PetscMalloc1(n,&array);
    PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
    s->array           = array;
    s->array_allocated = array;
    err = cudaHostRegister(s->array, n*sizeof(PetscScalar),cudaHostRegisterMapped);CHKERRCUSP(err);
    ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_TRUE;
  }
  return(0);
}
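#if 0
/* A minimal standalone sketch (illustration only, not part of PETSc) of the
   page-locking pattern used above: register an existing host allocation so
   that cudaMemcpyAsync() can transfer it without an intermediate staging
   copy, then unregister it before the buffer is freed.  The helper name and
   the pre-existing device buffer d_buf are hypothetical. */
static cudaError_t RegisterAndCopyDemo(PetscScalar *h_buf,PetscScalar *d_buf,size_t n,cudaStream_t stream)
{
  cudaError_t err;
  err = cudaHostRegister(h_buf,n*sizeof(PetscScalar),cudaHostRegisterMapped);  /* pin the host pages */
  if (err != cudaSuccess) return err;
  err = cudaMemcpyAsync(d_buf,h_buf,n*sizeof(PetscScalar),cudaMemcpyHostToDevice,stream);
  if (err != cudaSuccess) return err;
  err = cudaStreamSynchronize(stream);                                         /* wait for the async copy */
  if (err != cudaSuccess) return err;
  return cudaHostUnregister(h_buf);                                            /* undo the registration */
}
#endif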


/*
    Allocates space for the vector array on the GPU if it does not exist.
    Does NOT change the PetscCUSPFlag for the vector
    Does NOT zero the CUSP array
 */
PetscErrorCode VecCUSPAllocateCheck(Vec v)
{
  cudaError_t    err;
  cudaStream_t   stream;
  Vec_Seq        *s = (Vec_Seq*)v->data;

  // First allocate memory on the GPU if needed
  if (!v->spptr) {
    try {
      v->spptr                        = new Vec_CUSP;
      ((Vec_CUSP*)v->spptr)->GPUarray = new CUSPARRAY;
      ((Vec_CUSP*)v->spptr)->GPUarray->resize((PetscBLASInt)v->map->n);
      err = cudaStreamCreate(&stream);CHKERRCUSP(err);
      ((Vec_CUSP*)v->spptr)->stream = stream;

      ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_FALSE;
      /* If the array is already allocated, one can register it as (page-locked) mapped.
         This can substantially accelerate data transfer across the PCI Express */
      if (s->array) {
        err = cudaHostRegister(s->array, v->map->n*sizeof(PetscScalar),cudaHostRegisterMapped);CHKERRCUSP(err);
        ((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked = PETSC_TRUE;
      }
      v->ops->destroy = VecDestroy_SeqCUSP;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
  }
  return(0);
}


/* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
PetscErrorCode VecCUSPCopyToGPU(Vec v)
{
  cudaError_t    err;
  Vec_CUSP       *veccusp;
  CUSPARRAY      *varray;
  cudaStream_t   stream;

  VecCUSPAllocateCheck(v);
  if (v->valid_GPU_array == PETSC_CUSP_CPU) {
    PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);
    try {
      veccusp = (Vec_CUSP*)v->spptr;
      varray  = veccusp->GPUarray;
      stream  = veccusp->stream;
      err = cudaMemcpyAsync(varray->data().get(), *(PetscScalar**)v->data, v->map->n*sizeof(PetscScalar),
                            cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
      err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  return(0);
}
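/* The copy routines in this file are driven by v->valid_GPU_array
   (a PetscCUSPFlag), which tracks which copy of the data is current:

     PETSC_CUSP_UNALLOCATED - no GPU array has been allocated yet
     PETSC_CUSP_CPU         - the host copy is current, the GPU copy is stale
     PETSC_CUSP_GPU         - the GPU copy is current, the host copy is stale
     PETSC_CUSP_BOTH        - both copies are current

   Thus VecCUSPCopyToGPU() above is a no-op unless the flag is PETSC_CUSP_CPU,
   and a completed transfer in either direction promotes the flag to
   PETSC_CUSP_BOTH. */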

static PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
{
  CUSPARRAY      *varray;
  cudaError_t    err;
  PetscScalar    *cpuPtr, *gpuPtr;
  cudaStream_t   stream;
  Vec_Seq        *s;
  VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

  VecCUSPAllocateCheck(v);
  if (v->valid_GPU_array == PETSC_CUSP_CPU) {
    stream = ((Vec_CUSP*)v->spptr)->stream;
    s      = (Vec_Seq*)v->data;

    PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);
    varray = ((Vec_CUSP*)v->spptr)->GPUarray;
    gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
    cpuPtr = s->array + ptop_scatter->recvLowestIndex;

    /* Note: this code copies the smallest contiguous chunk of data
       containing ALL of the indices */
    err = cudaMemcpyAsync(gpuPtr, cpuPtr, ptop_scatter->nr*sizeof(PetscScalar),
                          cudaMemcpyHostToDevice, stream);CHKERRCUSP(err);
    err = cudaStreamSynchronize(stream);CHKERRCUSP(err);

#if 0
    Vec_Seq *s;
    s = (Vec_Seq*)v->data;

    CUSPINTARRAYCPU *indicesCPU=&ci->recvIndicesCPU;
    CUSPINTARRAYGPU *indicesGPU=&ci->recvIndicesGPU;

    thrust::copy(thrust::make_permutation_iterator(s->array,indicesCPU->begin()),
                 thrust::make_permutation_iterator(s->array,indicesCPU->end()),
                 thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()));
#endif
    // Set the buffer states
    v->valid_GPU_array = PETSC_CUSP_BOTH;
    PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);
  }
  return(0);
}


/*
     VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecCUSPCopyFromGPU(Vec v)
{
  cudaError_t    err;
  Vec_CUSP       *veccusp;
  CUSPARRAY      *varray;
  cudaStream_t   stream;

  VecCUSPAllocateCheckHost(v);
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);
    try {
      veccusp = (Vec_CUSP*)v->spptr;
      varray  = veccusp->GPUarray;
      stream  = veccusp->stream;

      err = cudaMemcpyAsync(*(PetscScalar**)v->data, varray->data().get(), v->map->n*sizeof(PetscScalar),
                            cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
      err = cudaStreamSynchronize(stream);CHKERRCUSP(err);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  return(0);
}

/* Note that this function only copies *some* of the values up from the GPU to the CPU,
   which means that we need to recombine the data at some point before using any of the standard functions.
   We could add another few flag types to keep track of this, or treat things like VecGetArray()/VecRestoreArray(),
   where the calls always have to come in pairs.
*/
PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
{
  CUSPARRAY      *varray;
  cudaError_t    err;
  PetscScalar    *cpuPtr, *gpuPtr;
  cudaStream_t   stream;
  Vec_Seq        *s;
  VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

  VecCUSPAllocateCheckHost(v);
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);

    stream = ((Vec_CUSP*)v->spptr)->stream;
    varray = ((Vec_CUSP*)v->spptr)->GPUarray;
    s      = (Vec_Seq*)v->data;
    gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
    cpuPtr = s->array + ptop_scatter->sendLowestIndex;

    /* Note: this code copies the smallest contiguous chunk of data
       containing ALL of the indices */
    err = cudaMemcpyAsync(cpuPtr, gpuPtr, ptop_scatter->ns*sizeof(PetscScalar),
                          cudaMemcpyDeviceToHost, stream);CHKERRCUSP(err);
    err = cudaStreamSynchronize(stream);CHKERRCUSP(err);

#if 0
    Vec_Seq *s;
    s = (Vec_Seq*)v->data;
    CUSPINTARRAYCPU *indicesCPU=&ci->sendIndicesCPU;
    CUSPINTARRAYGPU *indicesGPU=&ci->sendIndicesGPU;

    thrust::copy(thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
                 thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
                 thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
#endif
    VecCUSPRestoreArrayRead(v,&varray);
    PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  return(0);
}

static PetscErrorCode VecCopy_SeqCUSP_Private(Vec xin,Vec yin)
{
  PetscScalar       *ya;
  const PetscScalar *xa;
  PetscErrorCode    ierr;

  VecCUSPAllocateCheckHost(xin);
  VecCUSPAllocateCheckHost(yin);
  if (xin != yin) {
    VecGetArrayRead(xin,&xa);
    VecGetArray(yin,&ya);
    PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
    VecRestoreArrayRead(xin,&xa);
    VecRestoreArray(yin,&ya);
  }
  return(0);
}

static PetscErrorCode VecSetRandom_SeqCUSP_Private(Vec xin,PetscRandom r)
{
  PetscInt       n = xin->map->n,i;
  PetscScalar    *xx;

  VecGetArray(xin,&xx);
  for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
  VecRestoreArray(xin,&xx);
  return(0);
}

static PetscErrorCode VecDestroy_SeqCUSP_Private(Vec v)
{
  Vec_Seq        *vs = (Vec_Seq*)v->data;

  PetscObjectSAWsViewOff(v);
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
#endif
  if (vs->array_allocated) PetscFree(vs->array_allocated);
  PetscFree(vs);
  return(0);
}

static PetscErrorCode VecResetArray_SeqCUSP_Private(Vec vin)
{
  Vec_Seq *v = (Vec_Seq*)vin->data;

  v->array         = v->unplacedarray;
  v->unplacedarray = 0;
  return(0);
}

/* The following three public versions are necessary because we use CUSP in the regular PETSc code and these need to be called from plain C code. */
PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
{
  VecCUSPAllocateCheck(v);
  return(0);
}

PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
{
  VecCUSPCopyToGPU(v);
  return(0);
}

/*
    VecCUSPCopyToGPUSome_Public - Copies certain entries of a vector down from the CPU to the GPU

   Input Parameters:
+    v - the vector
-    indices - the requested indices; this should be created with CUSPIndicesCreate()

*/
PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
{
  VecCUSPCopyToGPUSome(v,ci);
  return(0);
}

/*
  VecCUSPCopyFromGPUSome_Public - Copies certain entries of a vector up from the GPU to the CPU

  Input Parameters:
+    v - the vector
-    indices - the requested indices; this should be created with CUSPIndicesCreate()
*/
PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
{
  VecCUSPCopyFromGPUSome(v,ci);
  return(0);
}

/*MC
   VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

   Options Database Keys:
. -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

  Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
M*/
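#if 0
/* A minimal usage sketch (illustration only; the helper name is hypothetical):
   create a sequential CUSP vector of length n and fill it.  Subsequent Vec
   operations on it dispatch to the GPU routines in this file. */
static PetscErrorCode CreateSeqCUSPDemo(PetscInt n)
{
  PetscErrorCode ierr;
  Vec            v;

  ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
  ierr = VecSetSizes(v,n,n);CHKERRQ(ierr);
  ierr = VecSetType(v,VECSEQCUSP);CHKERRQ(ierr);
  ierr = VecSet(v,1.0);CHKERRQ(ierr);          /* runs VecSet_SeqCUSP() on the GPU */
  ierr = VecDestroy(&v);CHKERRQ(ierr);
  return(0);
}
#endif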

/* for VecAYPX_SeqCUSP */
namespace cusp
{
namespace blas
{
namespace detail
{
  template <typename T>
    struct AYPX : public thrust::binary_function<T,T,T>
    {
      T alpha;

      AYPX(T _alpha) : alpha(_alpha) {}

      __host__ __device__
      T operator()(T x, T y)
      {
        return alpha * y + x;
      }
    };
}

 template <typename ForwardIterator1,
           typename ForwardIterator2,
           typename ScalarType>
 void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
 {
   thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
 }
 template <typename Array1, typename Array2, typename ScalarType>
   void aypx(const Array1& x, Array2& y, ScalarType alpha)
 {
   detail::assert_same_dimensions(x,y);
   aypx(x.begin(),x.end(),y.begin(),alpha);
 }
}
}
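#if 0
/* A small sketch (illustration only) of the aypx() extension defined above:
   with alpha = 2 it computes y <- 2*y + x elementwise, so starting from
   x = (1,1,1) and y = (3,3,3) it leaves y = (7,7,7).  Assumes CUSPARRAY is
   the device array type used throughout this file. */
static void AYPXDemo(void)
{
  CUSPARRAY x(3,1.0);                       /* x = (1,1,1) on the device */
  CUSPARRAY y(3,3.0);                       /* y = (3,3,3) on the device */
  cusp::blas::aypx(x,y,(PetscScalar)2.0);   /* y = 2*y + x = (7,7,7) */
}
#endif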

PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  if (alpha != 0.0) {
    VecCUSPGetArrayRead(xin,&xarray);
    VecCUSPGetArrayReadWrite(yin,&yarray);
    try {
      cusp::blas::aypx(*xarray,*yarray,alpha);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    VecCUSPRestoreArrayRead(xin,&xarray);
    VecCUSPRestoreArrayReadWrite(yin,&yarray);
    PetscLogFlops(2.0*yin->map->n);
  }
  return(0);
}


PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  if (alpha != 0.0) {
    VecCUSPGetArrayRead(xin,&xarray);
    VecCUSPGetArrayReadWrite(yin,&yarray);
    try {
      cusp::blas::axpy(*xarray,*yarray,alpha);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    VecCUSPRestoreArrayRead(xin,&xarray);
    VecCUSPRestoreArrayReadWrite(yin,&yarray);
    PetscLogFlops(2.0*yin->map->n);
  }
  return(0);
}

struct VecCUSPPointwiseDivide
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
  }
};
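#if 0
/* A standalone sketch (illustration only; the helper name is hypothetical)
   of the zip-iterator idiom used by VecPointwiseDivide_SeqCUSP() and the
   other functors below: thrust::for_each walks three device sequences in
   lockstep and the functor writes the result through the first tuple slot. */
static void PointwiseDivideDemo(CUSPARRAY &w,CUSPARRAY &x,CUSPARRAY &y)
{
  thrust::for_each(
    thrust::make_zip_iterator(thrust::make_tuple(w.begin(),x.begin(),y.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(w.end(),x.end(),y.end())),
    VecCUSPPointwiseDivide());   /* w[i] = x[i]/y[i] for all i */
}
#endif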

PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
{
  CUSPARRAY      *warray=NULL,*xarray=NULL,*yarray=NULL;
  PetscErrorCode ierr;

  VecCUSPGetArrayRead(xin,&xarray);
  VecCUSPGetArrayRead(yin,&yarray);
  VecCUSPGetArrayWrite(win,&warray);
  try {
    thrust::for_each(
      thrust::make_zip_iterator(
        thrust::make_tuple(
          warray->begin(),
          xarray->begin(),
          yarray->begin())),
      thrust::make_zip_iterator(
        thrust::make_tuple(
          warray->end(),
          xarray->end(),
          yarray->end())),
      VecCUSPPointwiseDivide());
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  PetscLogFlops(win->map->n);
  VecCUSPRestoreArrayRead(xin,&xarray);
  VecCUSPRestoreArrayRead(yin,&yarray);
  VecCUSPRestoreArrayWrite(win,&warray);
  return(0);
}


struct VecCUSPWAXPY
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
  }
};

struct VecCUSPSum
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
  }
};

struct VecCUSPDiff
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
  }
};

PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  CUSPARRAY      *xarray=NULL,*yarray=NULL,*warray=NULL;
  PetscErrorCode ierr;

  if (alpha == 0.0) {
    VecCopy_SeqCUSP(yin,win);
  } else {
    VecCUSPGetArrayRead(xin,&xarray);
    VecCUSPGetArrayRead(yin,&yarray);
    VecCUSPGetArrayWrite(win,&warray);
    if (alpha == 1.0) {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              xarray->end())),
          VecCUSPSum());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      PetscLogFlops(win->map->n);
    } else if (alpha == -1.0) {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              xarray->end())),
          VecCUSPDiff());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      PetscLogFlops(win->map->n);
    } else {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              thrust::make_constant_iterator(alpha),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              thrust::make_constant_iterator(alpha),
              xarray->end())),
          VecCUSPWAXPY());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      PetscLogFlops(2*win->map->n);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    VecCUSPRestoreArrayRead(xin,&xarray);
    VecCUSPRestoreArrayRead(yin,&yarray);
    VecCUSPRestoreArrayWrite(win,&warray);
  }
  return(0);
}

/* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
struct VecCUSPMAXPY4
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
  }
};

struct VecCUSPMAXPY3
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
  }
};

struct VecCUSPMAXPY2
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
  }
};
PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
  PetscInt       n = xin->map->n,j,j_rem;
  PetscScalar    alpha0,alpha1,alpha2,alpha3;
  PetscErrorCode ierr;

  PetscLogFlops(nv*2.0*n);
  VecCUSPGetArrayReadWrite(xin,&xarray);
  switch (j_rem=nv&0x3) {
  case 3:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha += 3;
    VecCUSPGetArrayRead(y[0],&yy0);
    VecCUSPGetArrayRead(y[1],&yy1);
    VecCUSPGetArrayRead(y[2],&yy2);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin(),
            thrust::make_constant_iterator(alpha2),
            yy2->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end(),
            thrust::make_constant_iterator(alpha2),
            yy2->end())),
        VecCUSPMAXPY3());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    VecCUSPRestoreArrayRead(y[0],&yy0);
    VecCUSPRestoreArrayRead(y[1],&yy1);
    VecCUSPRestoreArrayRead(y[2],&yy2);
    y   += 3;
    break;
  case 2:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha += 2;
    VecCUSPGetArrayRead(y[0],&yy0);
    VecCUSPGetArrayRead(y[1],&yy1);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end())),
        VecCUSPMAXPY2());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    VecCUSPRestoreArrayRead(y[0],&yy0);
    VecCUSPRestoreArrayRead(y[1],&yy1);
    y += 2;
    break;
  case 1:
    alpha0 = *alpha++;
    VecAXPY_SeqCUSP(xin,alpha0,y[0]);
    y += 1;
    break;
  }
  for (j=j_rem; j<nv; j+=4) {
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha3 = alpha[3];
    alpha += 4;
    VecCUSPGetArrayRead(y[0],&yy0);
    VecCUSPGetArrayRead(y[1],&yy1);
    VecCUSPGetArrayRead(y[2],&yy2);
    VecCUSPGetArrayRead(y[3],&yy3);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin(),
            thrust::make_constant_iterator(alpha2),
            yy2->begin(),
            thrust::make_constant_iterator(alpha3),
            yy3->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end(),
            thrust::make_constant_iterator(alpha2),
            yy2->end(),
            thrust::make_constant_iterator(alpha3),
            yy3->end())),
        VecCUSPMAXPY4());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    VecCUSPRestoreArrayRead(y[0],&yy0);
    VecCUSPRestoreArrayRead(y[1],&yy1);
    VecCUSPRestoreArrayRead(y[2],&yy2);
    VecCUSPRestoreArrayRead(y[3],&yy3);
    y   += 4;
  }
  VecCUSPRestoreArrayReadWrite(xin,&xarray);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}


PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;
  //  PetscScalar    *xptr,*yptr,*zgpu;
  //PetscReal tmp;

  //VecNorm_SeqCUSP(xin, NORM_2, &tmp);
  //VecNorm_SeqCUSP(yin, NORM_2, &tmp);
  VecCUSPGetArrayRead(xin,&xarray);
  VecCUSPGetArrayRead(yin,&yarray);
  try {
#if defined(PETSC_USE_COMPLEX)
    *z = cusp::blas::dotc(*yarray,*xarray);
#else
    *z = cusp::blas::dot(*yarray,*xarray);
#endif
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n > 0) {
    PetscLogFlops(2.0*xin->map->n-1);
  }
  VecCUSPRestoreArrayRead(xin,&xarray);
  VecCUSPRestoreArrayRead(yin,&yarray);
  return(0);
}

//
// CUDA kernels for MDot to follow
//

// set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
#define MDOT_WORKGROUP_SIZE 128
#define MDOT_WORKGROUP_NUM  128
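// Worked example of the partitioning done inside each kernel below: with
// MDOT_WORKGROUP_NUM = 128 blocks of MDOT_WORKGROUP_SIZE = 128 threads and a
// vector of size = 1000000, each block handles
// entries_per_group = (1000000 - 1)/128 + 1 = 7813 consecutive entries
// (ceiling division), and the host afterwards sums the 128 per-block
// partial results for each dot product.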

// M = 2:
__global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
  }
  tmp_buffer[threadIdx.x]                       = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
  }
}
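// A host-side sketch (illustration only; the helper name is hypothetical) of
// how this kernel family is driven; VecMDot_SeqCUSP() below does the same
// with cusp-managed buffers.  group_results holds gridDim.x partial sums per
// dot product, laid out as group_results[block + j*gridDim.x] for the j-th
// result.  x, y0, y1 are assumed to be device pointers.
#if 0
static cudaError_t MDot2Demo(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,PetscInt n,PetscScalar z[2])
{
  PetscScalar *group_results_gpu;
  PetscScalar group_results_cpu[2*MDOT_WORKGROUP_NUM];
  cudaError_t err;

  err = cudaMalloc((void**)&group_results_gpu,sizeof(PetscScalar)*2*MDOT_WORKGROUP_NUM);
  if (err != cudaSuccess) return err;
  VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(x,y0,y1,n,group_results_gpu);
  err = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar)*2*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) return err;
  // second reduction stage: sum the per-block partial results on the host
  for (PetscInt j=0; j<2; ++j) {
    z[j] = 0;
    for (PetscInt i=0; i<MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[j*MDOT_WORKGROUP_NUM + i];
  }
  return cudaFree(group_results_gpu);
}
#endif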

// M = 3:
__global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 4:
__global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
__global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                          const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                          PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = min((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  PetscScalar group_sum4 = 0;
  PetscScalar group_sum5 = 0;
  PetscScalar group_sum6 = 0;
  PetscScalar group_sum7 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
    group_sum4 += entry_x * y4[i];
    group_sum5 += entry_x * y5[i];
    group_sum6 += entry_x * y6[i];
    group_sum7 += entry_x * y7[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
  }
}


PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscInt       i,j,n = xin->map->n,current_y_index = 0;
  CUSPARRAY      *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
  PetscScalar    *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
  PetscScalar    group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
  cudaError_t    cuda_ierr;

  // allocate scratchpad memory for the results of individual work groups:
  if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
  cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
  if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);

  VecCUSPGetArrayRead(xin,&xarray);
  xptr = thrust::raw_pointer_cast(xarray->data());

  while (current_y_index < nv)
  {
    switch (nv - current_y_index) {

    case 7:
    case 6:
    case 5:
    case 4:
      VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
      VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
      VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
      VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);

#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
      z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());
      y3ptr = thrust::raw_pointer_cast(y3array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

      // copy results back to the CPU:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<4; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
      VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
      VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
      VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
      current_y_index += 4;
      break;

    case 3:
      VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
      VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
      VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);

#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

      // copy results back to the CPU:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<3; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif

      VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
      VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
      VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
      current_y_index += 3;
      break;

    case 2:
      VecCUSPGetArrayRead(yin[current_y_index],&y0array);
      VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);

#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

      // copy results back to the CPU:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<2; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
      VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
      current_y_index += 2;
      break;

    case 1:
      VecCUSPGetArrayRead(yin[current_y_index],&y0array);
#if defined(PETSC_USE_COMPLEX)
      z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
#else
      z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
#endif
      VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);
      current_y_index += 1;
      break;

    default: // 8 or more vectors left
      VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);
      VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);
      VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);
      VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);
      VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);
      VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);
      VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);
      VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);

#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dot(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dot(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dot(*y2array,*xarray);
      z[current_y_index+3] = cusp::blas::dot(*y3array,*xarray);
      z[current_y_index+4] = cusp::blas::dot(*y4array,*xarray);
      z[current_y_index+5] = cusp::blas::dot(*y5array,*xarray);
      z[current_y_index+6] = cusp::blas::dot(*y6array,*xarray);
      z[current_y_index+7] = cusp::blas::dot(*y7array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());
      y3ptr = thrust::raw_pointer_cast(y3array->data());
      y4ptr = thrust::raw_pointer_cast(y4array->data());
      y5ptr = thrust::raw_pointer_cast(y5array->data());
      y6ptr = thrust::raw_pointer_cast(y6array->data());
      y7ptr = thrust::raw_pointer_cast(y7array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

      // copy results back to the CPU:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<8; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);
      VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);
      VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);
      VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);
      VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);
      VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);
      VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);
      VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);
      current_y_index += 8;
      break;
    }
  }
  VecCUSPRestoreArrayRead(xin,&xarray);

  cuda_ierr = cudaFree(group_results_gpu);
  if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);
  PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));
  return(0);
}

#undef MDOT_WORKGROUP_SIZE
#undef MDOT_WORKGROUP_NUM



PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
{
  CUSPARRAY      *xarray=NULL;
  PetscErrorCode ierr;

  /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that */
  VecCUSPGetArrayWrite(xin,&xarray);
  try {
    cusp::blas::fill(*xarray,alpha);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  VecCUSPRestoreArrayWrite(xin,&xarray);
  return(0);
}

PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
{
  CUSPARRAY      *xarray;
  PetscErrorCode ierr;

  if (alpha == 0.0) {
    VecSet_SeqCUSP(xin,alpha);
  } else if (alpha != 1.0) {
    VecCUSPGetArrayReadWrite(xin,&xarray);
    try {
      cusp::blas::scal(*xarray,alpha);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    VecCUSPRestoreArrayReadWrite(xin,&xarray);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  PetscLogFlops(xin->map->n);
  return(0);
}


PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  //#if defined(PETSC_USE_COMPLEX)
  /* Not working for complex */
  //#else
  VecCUSPGetArrayRead(xin,&xarray);
  VecCUSPGetArrayRead(yin,&yarray);
  try {
    *z = cusp::blas::dot(*xarray,*yarray);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  //#endif
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n > 0) {
    PetscLogFlops(2.0*xin->map->n-1);
  }
  VecCUSPRestoreArrayRead(yin,&yarray);
  VecCUSPRestoreArrayRead(xin,&xarray);
  return(0);
}
PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  if (xin != yin) {
    if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
      VecCUSPGetArrayRead(xin,&xarray);
      VecCUSPGetArrayWrite(yin,&yarray);
      try {
        cusp::blas::copy(*xarray,*yarray);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = WaitForGPU();CHKERRCUSP(ierr);
      VecCUSPRestoreArrayRead(xin,&xarray);
      VecCUSPRestoreArrayWrite(yin,&yarray);

    } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
      /* copy in CPU if we are on the CPU */
      VecCopy_SeqCUSP_Private(xin,yin);
    } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
      if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
        /* copy in CPU */
        VecCopy_SeqCUSP_Private(xin,yin);

      } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
        /* copy in GPU */
        VecCUSPGetArrayRead(xin,&xarray);
        VecCUSPGetArrayWrite(yin,&yarray);
        try {
          cusp::blas::copy(*xarray,*yarray);
          ierr = WaitForGPU();CHKERRCUSP(ierr);
        } catch(char *ex) {
          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
        }
        VecCUSPRestoreArrayRead(xin,&xarray);
        VecCUSPRestoreArrayWrite(yin,&yarray);
      } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
           default to copy in GPU (this is an arbitrary choice) */
        VecCUSPGetArrayRead(xin,&xarray);
        VecCUSPGetArrayWrite(yin,&yarray);
        try {
          cusp::blas::copy(*xarray,*yarray);
          ierr = WaitForGPU();CHKERRCUSP(ierr);
        } catch(char *ex) {
          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
        }
        VecCUSPRestoreArrayRead(xin,&xarray);
        VecCUSPRestoreArrayWrite(yin,&yarray);
      } else {
        VecCopy_SeqCUSP_Private(xin,yin);
      }
    }
  }
  return(0);
}


PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
{
  PetscBLASInt   one = 1,bn;
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  PetscBLASIntCast(xin->map->n,&bn);
  if (xin != yin) {
    VecCUSPGetArrayReadWrite(xin,&xarray);
    VecCUSPGetArrayReadWrite(yin,&yarray);

#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
    cublasCswap(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);
#else
    cublasZswap(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);
#endif
#else
#if defined(PETSC_USE_REAL_SINGLE)
    cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
#else
    cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
#endif
#endif
    ierr = cublasGetError();CHKERRCUSP(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    VecCUSPRestoreArrayReadWrite(xin,&xarray);
    VecCUSPRestoreArrayReadWrite(yin,&yarray);
  }
  return(0);
}

struct VecCUSPAX
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
  }
};
PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscScalar    a = alpha,b = beta;
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  if (a == 0.0) {
    VecScale_SeqCUSP(yin,beta);
  } else if (b == 1.0) {
    VecAXPY_SeqCUSP(yin,alpha,xin);
  } else if (a == 1.0) {
    VecAYPX_SeqCUSP(yin,beta,xin);
  } else if (b == 0.0) {
    VecCUSPGetArrayRead(xin,&xarray);
    VecCUSPGetArrayReadWrite(yin,&yarray);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            yarray->begin(),
            thrust::make_constant_iterator(a),
            xarray->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            yarray->end(),
            thrust::make_constant_iterator(a),
            xarray->end())),
        VecCUSPAX());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    PetscLogFlops(xin->map->n);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    VecCUSPRestoreArrayRead(xin,&xarray);
    VecCUSPRestoreArrayReadWrite(yin,&yarray);
  } else {
    VecCUSPGetArrayRead(xin,&xarray);
    VecCUSPGetArrayReadWrite(yin,&yarray);
    try {
      cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    VecCUSPRestoreArrayRead(xin,&xarray);
    VecCUSPRestoreArrayReadWrite(yin,&yarray);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    PetscLogFlops(3.0*xin->map->n);
  }
  return(0);
}

/* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
struct VecCUSPXPBYPCZ
{
  /* z = x + b*y + c*z */
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
  }
};
struct VecCUSPAXPBYPZ
{
  /* z = a*x + b*y + z */
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
  }
};
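/* Reading the tuples off the zip iterators built in VecAXPBYPCZ_SeqCUSP() below:
   VecCUSPXPBYPCZ sees (z, gamma, x, y, beta) and computes z = gamma*z + x + beta*y,
   while VecCUSPAXPBYPZ sees (z, x, alpha, y, beta) and computes z += alpha*x + beta*y. */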

1498: PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1499: {
1501:   PetscInt       n = zin->map->n;
1502:   CUSPARRAY      *xarray,*yarray,*zarray;

1505:   VecCUSPGetArrayRead(xin,&xarray);
1506:   VecCUSPGetArrayRead(yin,&yarray);
1507:   VecCUSPGetArrayReadWrite(zin,&zarray);
1508:   if (alpha == 1.0) {
1509:     try {
1510:       thrust::for_each(
1511:         thrust::make_zip_iterator(
1512:           thrust::make_tuple(
1513:             zarray->begin(),
1514:             thrust::make_constant_iterator(gamma),
1515:             xarray->begin(),
1516:             yarray->begin(),
1517:             thrust::make_constant_iterator(beta))),
1518:         thrust::make_zip_iterator(
1519:           thrust::make_tuple(
1520:             zarray->end(),
1521:             thrust::make_constant_iterator(gamma),
1522:             xarray->end(),
1523:             yarray->end(),
1524:             thrust::make_constant_iterator(beta))),
1525:         VecCUSPXPBYPCZ());
1526:     } catch(char *ex) {
1527:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1528:     }
1529:     PetscLogFlops(4.0*n);
1530:   } else if (gamma == 1.0) {
1531:     try {
1532:       thrust::for_each(
1533:         thrust::make_zip_iterator(
1534:           thrust::make_tuple(
1535:             zarray->begin(),
1536:             xarray->begin(),
1537:             thrust::make_constant_iterator(alpha),
1538:             yarray->begin(),
1539:             thrust::make_constant_iterator(beta))),
1540:         thrust::make_zip_iterator(
1541:           thrust::make_tuple(
1542:             zarray->end(),
1543:             xarray->end(),
1544:             thrust::make_constant_iterator(alpha),
1545:             yarray->end(),
1546:             thrust::make_constant_iterator(beta))),
1547:         VecCUSPAXPBYPZ());
1548:     } catch(char *ex) {
1549:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1550:     }
1551:     PetscLogFlops(4.0*n);
1552:   } else {
1553:     try {
1554:       cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
1555:     } catch(char *ex) {
1556:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1557:     }
1558:     PetscLogFlops(5.0*n);
1559:   }
1560:   VecCUSPRestoreArrayRead(xin,&xarray);
1561:   VecCUSPRestoreArrayRead(yin,&yarray);
1562:   VecCUSPRestoreArrayReadWrite(zin,&zarray);
1563:   WaitForGPU();CHKERRCUSP(ierr);
1564:   return(0);
1565: }

1569: PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
1570: {
1572:   PetscInt       n = win->map->n;
1573:   CUSPARRAY      *xarray,*yarray,*warray;

1576:   VecCUSPGetArrayRead(xin,&xarray);
1577:   VecCUSPGetArrayRead(yin,&yarray);
1578:   VecCUSPGetArrayReadWrite(win,&warray);
1579:   try {
1580:     cusp::blas::xmy(*xarray,*yarray,*warray);
1581:   } catch(char *ex) {
1582:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1583:   }
1584:   VecCUSPRestoreArrayRead(xin,&xarray);
1585:   VecCUSPRestoreArrayRead(yin,&yarray);
1586:   VecCUSPRestoreArrayReadWrite(win,&warray);
1587:   PetscLogFlops(n);
1588:   WaitForGPU();CHKERRCUSP(ierr);
1589:   return(0);
1590: }
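/* cusp::blas::xmy(x,y,w) is the elementwise (Hadamard) product; a scalar sketch
   (not part of the source):

     for (i=0; i<n; i++) w[i] = x[i]*y[i];
*/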


1593: /* TODO: compute the infinity norm with CUSP on the GPU; it currently falls back to the host array */

1597: PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
1598: {
1599:   const PetscScalar *xx;
1600:   PetscErrorCode    ierr;
1601:   PetscInt          n = xin->map->n;
1602:   PetscBLASInt      one = 1, bn;
1603:   CUSPARRAY         *xarray;

1606:   PetscBLASIntCast(n,&bn);
1607:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1608:     VecCUSPGetArrayRead(xin,&xarray);
1609:     try {
1610:       *z = cusp::blas::nrm2(*xarray);
1611:     } catch(char *ex) {
1612:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1613:     }
1614:     WaitForGPU();CHKERRCUSP(ierr);
1615:     VecCUSPRestoreArrayRead(xin,&xarray);
1616:     PetscLogFlops(PetscMax(2.0*n-1,0.0));
1617:   } else if (type == NORM_INFINITY) {
1618:     PetscInt  i;
1619:     PetscReal max = 0.0,tmp;

1621:     VecGetArrayRead(xin,&xx);
1622:     for (i=0; i<n; i++) {
1623:       if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
1624:       /* check special case of tmp == NaN */
1625:       if (tmp != tmp) {max = tmp; break;}
1626:       xx++;
1627:     }
1628:     VecRestoreArrayRead(xin,&xx);
1629:     *z   = max;
1630:   } else if (type == NORM_1) {
1631:     VecCUSPGetArrayRead(xin,&xarray);
1632: #if defined(PETSC_USE_COMPLEX)
1633: #if defined(PETSC_USE_REAL_SINGLE)
1634:     *z = cublasScasum(bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one);
1635: #else
1636:     *z = cublasDzasum(bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one);
1637: #endif
1638: #else
1639: #if defined(PETSC_USE_REAL_SINGLE)
1640:     *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1641: #else
1642:     *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
1643: #endif
1644: #endif
1645:     cublasGetError();CHKERRCUSP(ierr);
1646:     VecCUSPRestoreArrayRead(xin,&xarray);
1647:     WaitForGPU();CHKERRCUSP(ierr);
1648:     PetscLogFlops(PetscMax(n-1.0,0.0));
1649:   } else if (type == NORM_1_AND_2) {
1650:     VecNorm_SeqCUSP(xin,NORM_1,z);
1651:     VecNorm_SeqCUSP(xin,NORM_2,z+1);
1652:   }
1653:   return(0);
1654: }
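/* A hypothetical usage sketch (not part of the source): NORM_1_AND_2 fills two
   entries, so the caller must pass an array of at least two PetscReal:

     PetscReal nrm[2];
     VecNorm(x,NORM_1_AND_2,nrm);    // nrm[0] = ||x||_1, nrm[1] = ||x||_2
*/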


1657: /* The following few functions should be modified to actually work with the GPU so they do not force unnecessary allocation of CPU memory */

1661: PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
1662: {

1666:   VecSetRandom_SeqCUSP_Private(xin,r);
1667:   xin->valid_GPU_array = PETSC_CUSP_CPU;
1668:   return(0);
1669: }

1673: PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
1674: {

1678:   VecCUSPCopyFromGPU(vin);
1679:   VecResetArray_SeqCUSP_Private(vin);
1680:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1681:   return(0);
1682: }

1686: PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1687: {

1691:   VecCUSPCopyFromGPU(vin);
1692:   VecPlaceArray_Seq(vin,a);
1693:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1694:   return(0);
1695: }


1700: PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
1701: {

1705:   VecCUSPCopyFromGPU(vin);
1706:   VecReplaceArray_Seq(vin,a);
1707:   vin->valid_GPU_array = PETSC_CUSP_CPU;
1708:   return(0);
1709: }
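/* The reset/place/replace functions above first copy the data back from the GPU and
   then perform the host-side operation; all four (including VecSetRandom, which fills
   the host array directly) mark the CPU copy as the valid one, so the next GPU
   operation re-uploads the data -- the unnecessary traffic the comment above refers to. */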


1714: /*@
1715:    VecCreateSeqCUSP - Creates a standard, sequential array-style vector.

1717:    Collective on MPI_Comm

1719:    Input Parameters:
1720: +  comm - the communicator, should be PETSC_COMM_SELF
1721: -  n - the vector length

1723:    Output Parameter:
1724: .  V - the vector

1726:    Notes:
1727:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1728:    same type as an existing vector.

1730:    Level: intermediate

1732:    Concepts: vectors^creating sequential

1734: .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1735: @*/
1736: PetscErrorCode  VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
1737: {

1741:   VecCreate(comm,v);
1742:   VecSetSizes(*v,n,n);
1743:   VecSetType(*v,VECSEQCUSP);
1744:   return(0);
1745: }
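/* A hypothetical usage sketch (not part of the source); assumes PETSc was configured
   with CUSP support and a CUDA-capable device is present:

     Vec       v;
     PetscReal nrm;
     VecCreateSeqCUSP(PETSC_COMM_SELF,100,&v);
     VecSet(v,1.0);                  // executes on the GPU
     VecNorm(v,NORM_2,&nrm);         // nrm == 10.0
     VecDestroy(&v);
*/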

1747: /* The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written. */
1748: template <typename T>
1749: struct cuspdotnormcalculate : thrust::unary_function<T,T>
1750: {
1751:   __host__ __device__
1752:   T operator()(T x)
1753:   {
1754: #if defined(PETSC_USE_COMPLEX)
1755:     //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1756: #else
1757:     return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
1758: #endif
1759:   }
1760: };

1762: template <typename T>
1763: struct cuspdotnormreduce : thrust::binary_function<T,T,T>
1764: {
1765:   __host__ __device__
1766:   T operator()(T x,T y)
1767:   {
1768:     return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
1769:   }
1770: };
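/* The pair of functors above fuses both reductions into one pass over (s,t): the
   transform maps (s[i],t[i]) to (s[i]*t[i], t[i]*t[i]) and the reduction adds the
   tuples componentwise, so VecDotNorm2_SeqCUSP obtains dp = sum s[i]*t[i] and
   nm = sum t[i]*t[i] with a single kernel instead of two separate dot products. */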

1774: PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1775: {
1776:   PetscErrorCode                         ierr;
1777:   PetscScalar                            zero = 0.0;
1778:   PetscInt                               n=s->map->n;
1779:   thrust::tuple<PetscScalar,PetscScalar> result;
1780:   CUSPARRAY                              *sarray,*tarray;

1783:   /*VecCUSPCopyToGPU(s);
1784:    VecCUSPCopyToGPU(t);*/
1785:   VecCUSPGetArrayRead(s,&sarray);
1786:   VecCUSPGetArrayRead(t,&tarray);
1787:   try {
1788: #if defined(PETSC_USE_COMPLEX)
1789:     VecDot_SeqCUSP(s,t,dp);
1790:     VecDot_SeqCUSP(t,t,nm);
1791:     //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*dp),PetscImaginaryPart(*dp));
1792:     //printf("VecDotNorm2_SeqCUSP=%1.5g,%1.5g\n",PetscRealPart(*nm),PetscImaginaryPart(*nm));
1793: #else
1794:     result = thrust::transform_reduce(
1795:               thrust::make_zip_iterator(
1796:                 thrust::make_tuple(
1797:                   sarray->begin(),
1798:                   tarray->begin())),
1799:               thrust::make_zip_iterator(
1800:                 thrust::make_tuple(
1801:                   sarray->end(),
1802:                   tarray->end())),
1803:               cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
1804:               thrust::make_tuple(zero,zero),                                   /*init */
1805:               cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >());  /* binary function */
1806:     *dp = thrust::get<0>(result);
1807:     *nm = thrust::get<1>(result);
1808: #endif
1809:   } catch(char *ex) {
1810:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1811:   }
1812:   VecCUSPRestoreArrayRead(s,&sarray);
1813:   VecCUSPRestoreArrayRead(t,&tarray);
1814:   WaitForGPU();CHKERRCUSP(ierr);
1815:   PetscLogFlops(4.0*n);
1816:   return(0);
1817: }

1821: PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
1822: {

1826:   VecCreateSeqCUSP(PetscObjectComm((PetscObject)win),win->map->n,V);
1827:   PetscLayoutReference(win->map,&(*V)->map);
1828:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1829:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1830:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1831:   return(0);
1832: }

1836: PetscErrorCode VecDestroy_SeqCUSP(Vec v)
1837: {
1839:   Vec_Seq        *s = (Vec_Seq*)v->data;
1840:   cudaError_t    err;
1842:   try {
1843:     if (v->spptr) {
1844:       delete ((Vec_CUSP*)v->spptr)->GPUarray;
1845:       err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);

1847:       /* If the host array has been registered as (page-locked) mapped,
1848:          one must unregister the buffer */
1849:       if (((Vec_CUSP*)v->spptr)->hostDataRegisteredAsPageLocked) {
1850:         err = cudaHostUnregister(s->array);CHKERRCUSP(err);
1851:       }
1852:       delete (Vec_CUSP*) v->spptr;
1853:     }
1854:   } catch(char *ex) {
1855:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1856:   }
1857:   VecDestroy_SeqCUSP_Private(v);
1858:   return(0);
1859: }
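/* Note: the cudaHostUnregister() above must mirror the cudaHostRegister() calls in
   VecCUSPAllocateCheck/VecCUSPAllocateCheckHost; unregistering a buffer that was
   never page-locked is a CUDA error, hence the hostDataRegisteredAsPageLocked flag. */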


1862: #if defined(PETSC_USE_COMPLEX)
1863: struct conjugate 
1864: {
1865:   __host__ __device__
1866:   PetscScalar operator()(PetscScalar x)
1867:   {
1868:     return cusp::conj(x);
1869:   }
1870: };
1871: #endif


1876: PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
1877: {
1879:   CUSPARRAY      *xarray;

1882:   VecCUSPGetArrayReadWrite(xin,&xarray);
1883: #if defined(PETSC_USE_COMPLEX)
1884:   thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
1885: #endif
1886:   VecCUSPRestoreArrayReadWrite(xin,&xarray);
1887:   return(0);
1888: }

1892: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUSP(Vec V)
1893: {
1895:   PetscMPIInt    size;

1898:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
1899:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
1900:   VecCreate_Seq_Private(V,0);
1901:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);

1903:   V->ops->dot             = VecDot_SeqCUSP;
1904:   V->ops->norm            = VecNorm_SeqCUSP;
1905:   V->ops->tdot            = VecTDot_SeqCUSP;
1906:   V->ops->scale           = VecScale_SeqCUSP;
1907:   V->ops->copy            = VecCopy_SeqCUSP;
1908:   V->ops->set             = VecSet_SeqCUSP;
1909:   V->ops->swap            = VecSwap_SeqCUSP;
1910:   V->ops->axpy            = VecAXPY_SeqCUSP;
1911:   V->ops->axpby           = VecAXPBY_SeqCUSP;
1912:   V->ops->axpbypcz        = VecAXPBYPCZ_SeqCUSP;
1913:   V->ops->pointwisemult   = VecPointwiseMult_SeqCUSP;
1914:   V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
1915:   V->ops->setrandom       = VecSetRandom_SeqCUSP;
1916:   V->ops->dot_local       = VecDot_SeqCUSP;
1917:   V->ops->tdot_local      = VecTDot_SeqCUSP;
1918:   V->ops->norm_local      = VecNorm_SeqCUSP;
1919:   V->ops->mdot_local      = VecMDot_SeqCUSP;
1920:   V->ops->maxpy           = VecMAXPY_SeqCUSP;
1921:   V->ops->mdot            = VecMDot_SeqCUSP;
1922:   V->ops->aypx            = VecAYPX_SeqCUSP;
1923:   V->ops->waxpy           = VecWAXPY_SeqCUSP;
1924:   V->ops->dotnorm2        = VecDotNorm2_SeqCUSP;
1925:   V->ops->placearray      = VecPlaceArray_SeqCUSP;
1926:   V->ops->replacearray    = VecReplaceArray_SeqCUSP;
1927:   V->ops->resetarray      = VecResetArray_SeqCUSP;
1928:   V->ops->destroy         = VecDestroy_SeqCUSP;
1929:   V->ops->duplicate       = VecDuplicate_SeqCUSP;
1930:   V->ops->conjugate       = VecConjugate_SeqCUSP;

1932:   VecCUSPAllocateCheck(V);
1933:   V->valid_GPU_array      = PETSC_CUSP_GPU;
1934:   VecSet(V,0.0);
1935:   return(0);
1936: }
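/* A hypothetical usage sketch (not part of the source): the constructor above is
   normally reached through VecSetType() rather than called directly:

     Vec v;
     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,n,n);
     VecSetType(v,VECSEQCUSP);       // dispatches to VecCreate_SeqCUSP()
*/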

1940: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
1941: {

1945:   *a   = 0;
1946:   VecCUSPCopyToGPU(v);
1947:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1948:   return(0);
1949: }

1953: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
1954: {

1958:   v->valid_GPU_array = PETSC_CUSP_GPU;

1960:   PetscObjectStateIncrease((PetscObject)v);
1961:   return(0);
1962: }

1966: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
1967: {

1971:   *a   = 0;
1972:   VecCUSPCopyToGPU(v);
1973:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1974:   return(0);
1975: }

1979: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
1980: {
1982:   return(0);
1983: }

1987: PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
1988: {

1992:   *a   = 0;
1993:   VecCUSPAllocateCheck(v);
1994:   *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
1995:   return(0);
1996: }

2000: PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
2001: {

2005:   v->valid_GPU_array = PETSC_CUSP_GPU;

2007:   PetscObjectStateIncrease((PetscObject)v);
2008:   return(0);
2009: }
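/* A hypothetical usage sketch (not part of the source) of the accessors above: the
   Get variants upload the data if the host copy is newer (the Write variant only
   allocates), and the ReadWrite/Write Restore variants mark the GPU copy as current
   and increase the object state so cached quantities such as norms are invalidated:

     CUSPARRAY *xa,*ya;
     VecCUSPGetArrayRead(x,&xa);
     VecCUSPGetArrayReadWrite(y,&ya);
     // ... launch a kernel on xa->data().get() and ya->data().get() ...
     VecCUSPRestoreArrayRead(x,&xa);
     VecCUSPRestoreArrayReadWrite(y,&ya);
*/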