Actual source code: axpy.h

  1: /* $Id: axpy.h,v 1.19 2001/08/07 03:01:46 balay Exp $ */

/*
   These are macros for daxpy-like operations.  The format is
       APXY(U,Alpha,P,n)
   which computes
       U += Alpha * P
   over n entries.

   In addition, versions that process 2, 3, and 4 vectors at once are
   provided (APXY2, APXY3, APXY4); these can give significantly better use
   of memory resources than successive calls to the regular daxpy.
 */

 14: #ifndef APXY

#include "petscblaslapack.h"

 18: #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY)

 20: #if defined(PETSC_HAVE_FORTRAN_CAPS)
 21: #define fortranmaxpy4_ FORTRANMAXPY4
 22: #define fortranmaxpy3_ FORTRANMAXPY3
 23: #define fortranmaxpy2_ FORTRANMAXPY2
 24: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
 25: #define fortranmaxpy4_ fortranmaxpy4
 26: #define fortranmaxpy3_ fortranmaxpy3
 27: #define fortranmaxpy2_ fortranmaxpy2
 28: #endif

 30: EXTERN_C_BEGIN
 31: EXTERN void fortranmaxpy4_(void *,void *,void *,void *,void *,void *,void *,void *,void *,int *);
 32: EXTERN void fortranmaxpy3_(void *,void *,void *,void *,void *,void *,void *,int *);
 33: EXTERN void fortranmaxpy2_(void *,void *,void *,void *,void *,int *);
 34: EXTERN_C_END

 36: #define APXY(U,a1,p1,n)  {int one=1;
 37:   BLaxpy_(&n,&a1,p1,&one,U,&one);}
 38: #define APXY2(U,a1,a2,p1,p2,n) { 
 39:   fortranmaxpy2_(U,&a1,&a2,p1,p2,&n);}
 40: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) { 
 41:   fortranmaxpy3_(U,&a1,&a2,&a3,p1,p2,p3,&n);}
 42: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){ 
 43:   fortranmaxpy4_(U,&a1,&a2,&a3,&a4,p1,p2,p3,p4,&n);}

 45: #elif defined(PETSC_USE_UNROLL_KERNELS)

 47: #define APXY(U,Alpha,P,n) {
 48:   switch (n & 0x3) {
 49:   case 3: *U++    += Alpha * *P++;
 50:   case 2: *U++    += Alpha * *P++;
 51:   case 1: *U++    += Alpha * *P++;
 52:   n -= 4;case 0: break;}while (n>0) {U[0] += Alpha * P[0];U[1] += Alpha * P[1];
 53:                                      U[2] += Alpha * P[2]; U[3] += Alpha * P[3]; 
 54:                                      U += 4; P += 4; n -= 4;}}
 55: #define APXY2(U,a1,a2,p1,p2,n) {
 56:   switch (n & 0x3) {
 57:   case 3: *U++    += a1 * *p1++ + a2 * *p2++;
 58:   case 2: *U++    += a1 * *p1++ + a2 * *p2++;
 59:   case 1: *U++    += a1 * *p1++ + a2 * *p2++;
 60:   n -= 4;case 0: break;}
 61:   while (n>0) {U[0]+=a1*p1[0]+a2*p2[0];U[1]+=a1*p1[1]+a2*p2[1];
 62:                U[2]+=a1*p1[2]+a2*p2[2];U[3]+=a1*p1[3]+a2*p2[3];U+=4;p1+=4;p2+=4;n -= 4;}}
 63: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) {
 64:   switch (n & 0x3) {
 65:   case 3: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;
 66:   case 2: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;
 67:   case 1: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;
 68:   n -= 4;case 0:break;}while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0];
 69:   U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1];
 70:   U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2];
 71:   U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3];U+=4;p1+=4;p2+=4;p3+=4;n-=4;}}
 72: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {
 73:   switch (n & 0x3) {
 74:   case 3: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;
 75:   case 2: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;
 76:   case 1: *U++    += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;
 77:   n -= 4;case 0:break;}while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0]+a4*p4[0];
 78:   U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1]+a4*p4[1];
 79:   U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2]+a4*p4[2];
 80:   U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3]+a4*p4[3];U+=4;p1+=4;p2+=4;p3+=4;p4+=4;n-=4;}}

 82: #elif defined(PETSC_USE_WHILE_KERNELS)

 84: #define APXY(U,a1,p1,n)  {
 85:   while (n--) *U++ += a1 * *p1++;}
 86: #define APXY2(U,a1,a2,p1,p2,n)  {
 87:   while (n--) *U++ += a1 * *p1++ + a2 * *p2++;}
 88: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) {
 89:   while (n--) *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;}
 90: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {
 91:   while (n--) *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;}

 93: #elif defined(PETSC_USE_BLAS_KERNELS)

 95: #define APXY(U,a1,p1,n)  {int one=1;
 96:   daxpy_(&n,&a1,p1,&one,U,&one);}
 97: #define APXY2(U,a1,a2,p1,p2,n)  {int one=1,two=2,off=(int)(p2-p1);
 98:   double fone=1.0,aa[2];
 99: aa[0]=a1;aa[1]=a2;
100:   dgemv_("N",&n,&two,&fone,p1,&off,aa,&one,&fone,U,&one,1);}
101: #define APXY3(U,a1,a2,a3,p1,p2,p3,n){APXY2(U,a1,a2,p1,p2,n);
102:   APXY(U,a3,a4,p3,n);}
103: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){APXY2(U,a1,a2,p1,p2,n);
104:   APXY2(U,a3,a4,p3,p4,n);}

106: #elif defined(PETSC_USE_FOR_KERNELS)

/*
   For-loop versions of U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   APXY processes two entries per iteration through the temporaries __s1 and
   __s2 and then picks up the last entry when n is odd; APXY2/3/4 are plain
   indexed loops.  None of the arguments is modified.
*/
#define APXY(U,a1,p1,n)  {int __i;PetscScalar __s1,__s2; \
  for (__i=0; __i<(n)-1; __i+=2) { \
    __s1 = (a1)*(p1)[__i];  __s2 = (a1)*(p1)[__i+1]; \
    __s1 += (U)[__i];       __s2 += (U)[__i+1]; \
    (U)[__i] = __s1;        (U)[__i+1] = __s2; \
  } \
  if ((n) & 0x1) (U)[__i] += (a1) * (p1)[__i];}
#define APXY2(U,a1,a2,p1,p2,n) {int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i];}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n){int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i];}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i] + (a4)*(p4)[__i];}

119: #else

/*
   Default versions of U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))): plain
   indexed for loops over n entries.  APXY copies a1 into the local _a1
   before the loop.  None of the arguments is modified.
*/
#define APXY(U,a1,p1,n)  {int __i;PetscScalar _a1=(a1); \
  for (__i=0; __i<(n); __i++) (U)[__i] += _a1 * (p1)[__i];}
#define APXY2(U,a1,a2,p1,p2,n) {int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i];}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n){int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i];}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i] + (a4)*(p4)[__i];}

130: #endif


/* ----------------------------------------------------------------------------
      axpy() variants that step through both U and P with an increment of inc
   ---------------------------------------------------------------------------*/
136: #ifdef PETSC_USE_UNROLL_KERNELS
/*
   APXYINC - U += Alpha * P for n entries spaced inc apart, unrolled by two.

   An odd n is made even by handling one entry first; the while loop then
   updates entries 0 and inc of the current position per iteration.

   U, P and n must be modifiable lvalues: the pointers are advanced by inc
   per processed entry and n is 0 on exit for non-negative n.
*/
#define APXYINC(U,Alpha,P,n,inc) { \
  if ((n) & 0x1) { \
    *U += (Alpha) * *P; U += (inc); P += (inc); n--; \
  } \
  while (n > 0) { \
    U[0]   += (Alpha) * P[0]; \
    U[inc] += (Alpha) * P[inc]; \
    U += 2*(inc); P += 2*(inc); n -= 2; \
  } \
}
/*
   APXY2INC - U += a1*p1 + a2*p2 for n entries spaced inc apart, unrolled by
   two.

   An odd n is made even by handling one entry first; the while loop then
   updates entries 0 and inc of the current position per iteration.

   U, p1, p2 and n must be modifiable lvalues: the pointers are advanced by
   inc per processed entry and n is 0 on exit for non-negative n.
*/
#define APXY2INC(U,a1,a2,p1,p2,n,inc) { \
  if ((n) & 0x1) { \
    *U += (a1) * *p1 + (a2) * *p2; \
    U += (inc); p1 += (inc); p2 += (inc); n--; \
  } \
  while (n > 0) { \
    U[0]   += (a1)*p1[0]   + (a2)*p2[0]; \
    U[inc] += (a1)*p1[inc] + (a2)*p2[inc]; \
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); n -= 2; \
  } \
}
/*
   APXY3INC - U += a1*p1 + a2*p2 + a3*p3 for n entries spaced inc apart,
   unrolled by two.

   An odd n is made even by handling one entry first; the while loop then
   updates entries 0 and inc of the current position per iteration.

   U, p1, p2, p3 and n must be modifiable lvalues: the pointers are advanced
   by inc per processed entry and n is 0 on exit for non-negative n.
*/
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) { \
  if ((n) & 0x1) { \
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); n--; \
  } \
  while (n > 0) { \
    U[0]   += (a1)*p1[0]   + (a2)*p2[0]   + (a3)*p3[0]; \
    U[inc] += (a1)*p1[inc] + (a2)*p2[inc] + (a3)*p3[inc]; \
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); p3 += 2*(inc); n -= 2; \
  } \
}
/*
   APXY4INC - U += a1*p1 + a2*p2 + a3*p3 + a4*p4 for n entries spaced inc
   apart, unrolled by two.

   An odd n is made even by handling one entry first; the while loop then
   updates entries 0 and inc of the current position per iteration.

   U, p1..p4 and n must be modifiable lvalues: the pointers are advanced by
   inc per processed entry and n is 0 on exit for non-negative n.
*/
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) { \
  if ((n) & 0x1) { \
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc); n--; \
  } \
  while (n > 0) { \
    U[0]   += (a1)*p1[0]   + (a2)*p2[0]   + (a3)*p3[0]   + (a4)*p4[0]; \
    U[inc] += (a1)*p1[inc] + (a2)*p2[inc] + (a3)*p3[inc] + (a4)*p4[inc]; \
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); p3 += 2*(inc); p4 += 2*(inc); n -= 2; \
  } \
}

162: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   While-loop versions of the strided updates
       U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4)))
   for n entries spaced inc apart.

   Each macro counts n down with while (n--), so U, the p vectors and n must
   be modifiable lvalues: the pointers are advanced by inc per entry and n is
   left at -1.  n must not be negative on entry.
*/
#define APXYINC(U,a1,p1,n,inc) { \
  while (n--) { \
    *U += (a1) * *p1; \
    U += (inc); p1 += (inc); \
  }}
#define APXY2INC(U,a1,a2,p1,p2,n,inc)  { \
  while (n--) { \
    *U += (a1) * *p1 + (a2) * *p2; \
    U += (inc); p1 += (inc); p2 += (inc); \
  }}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc){ \
  while (n--) { \
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); \
  }}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) { \
  while (n--) { \
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc); \
  }}

174: #else
/*
   Default versions of the strided updates
       U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4)))
   for n entries spaced inc apart.

   These need to be converted to for loops.  Until then each macro counts n
   down with while (n--), so U, the p vectors and n must be modifiable
   lvalues: the pointers are advanced by inc per entry and n is left at -1.
   n must not be negative on entry.
*/
#define APXYINC(U,a1,p1,n,inc) { \
  while (n--) { \
    *U += (a1) * *p1; \
    U += (inc); p1 += (inc); \
  }}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) { \
  while (n--) { \
    *U += (a1) * *p1 + (a2) * *p2; \
    U += (inc); p1 += (inc); p2 += (inc); \
  }}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) { \
  while (n--) { \
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); \
  }}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc){ \
  while (n--) { \
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc); \
  }}
186: #endif

/* --------------------------------------------------------------------
   This is aypx:
    for (i=0; i<n; i++)
       y[i] = x[i] + alpha * y[i];
   In the AYPX macros, the first argument (U) plays the role of y and the
   third argument plays the role of x.
  ---------------------------------------------------------------------*/
193: #if defined(PETSC_USE_UNROLL_KERNELS)
/*
   AYPX - U = P + Alpha * U over n entries, main loop unrolled by four.

   The switch first handles the n%4 leftover entries (falling through from
   case 3 down to case 1) and then subtracts 4 from n so that the while loop
   runs exactly once per remaining group of four.

   U, P and n must be modifiable lvalues: on exit U and P have been advanced
   past the processed entries and n is <= 0.
*/
#define AYPX(U,Alpha,P,n) { \
  switch ((n) & 0x3) { \
  case 3: *U = *P++ + (Alpha) * *U; U++; /* fall through */ \
  case 2: *U = *P++ + (Alpha) * *U; U++; /* fall through */ \
  case 1: *U = *P++ + (Alpha) * *U; U++; \
          n -= 4;                        /* fall through */ \
  case 0: break; \
  } \
  while (n > 0) { \
    U[0] = P[0] + (Alpha) * U[0]; \
    U[1] = P[1] + (Alpha) * U[1]; \
    U[2] = P[2] + (Alpha) * U[2]; \
    U[3] = P[3] + (Alpha) * U[3]; \
    U += 4; P += 4; n -= 4; \
  } \
}

204: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   AYPX - U = p1 + a1 * U over n entries (while-loop version).

   n is counted down with while (n--), so U, p1 and n must be modifiable
   lvalues: the pointers end up n entries further on and n is left at -1.
   n must not be negative on entry.
*/
#define AYPX(U,a1,p1,n)  { \
  while (n--) { \
    *U = *p1++ + (a1) * *U; \
    U++; \
  }}

208: #elif defined(PETSC_USE_FOR_KERNELS)
/*
   AYPX - U = p1 + a1 * U over n entries (for-loop version).

   Two entries are processed per iteration through the temporaries __s1 and
   __s2; the last entry is picked up separately when n is odd.  None of the
   arguments is modified.
*/
#define AYPX(U,a1,p1,n)  {int __i;PetscScalar __s1,__s2; \
  for (__i=0; __i<(n)-1; __i+=2) { \
    __s1 = (p1)[__i];         __s2 = (p1)[__i+1]; \
    __s1 += (a1)*(U)[__i];    __s2 += (a1)*(U)[__i+1]; \
    (U)[__i] = __s1;          (U)[__i+1] = __s2; \
  } \
  if ((n) & 0x1) (U)[__i] = (p1)[__i] + (a1) * (U)[__i];}

215: #else
/*
   AYPX - U = p1 + a1 * U over n entries (default version): a plain indexed
   for loop.  None of the arguments is modified.
*/
#define AYPX(U,a1,p1,n)  {int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] = (p1)[__i] + (a1) * (U)[__i];}
218: #endif

/* ----------------------------------------------------------------------------------
       YMX - U -= p1 over n entries.  Useful for APXY where alpha == -1.
       None of the arguments is modified.
  ----------------------------------------------------------------------------------
  */
#define YMX(U,p1,n)  {int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] -= (p1)[__i];}
/*
   YPX - U += p1 over n entries.  Useful for APXY where alpha == 1.
   None of the arguments is modified.
*/
#define YPX(U,p1,n)  {int __i; \
  for (__i=0; __i<(n); __i++) (U)[__i] += (p1)[__i];}

230: #endif