/*
   axpy.h

   These are macros for daxpy-like operations.  The format is
       APXY(U,Alpha,P,n)
   for
       U += Alpha * P

   In addition, versions that process 2, 3, and 4 vectors at once are
   provided; these can make significantly better use of memory resources
   than successive calls to the regular daxpy.
*/
13: #ifndef APXY
#include "petscblaslapack.h"
/*
   Spelling of the fortrancopy_ symbol.
   - PETSC_HAVE_FORTRAN_CAPS defined:            fortrancopy_ -> FORTRANCOPY
   - neither PETSC_HAVE_FORTRAN_CAPS nor
     PETSC_HAVE_FORTRAN_UNDERSCORE defined:      fortrancopy_ -> fortrancopy
   - otherwise the name is left as written (trailing underscore).
*/
#if defined(PETSC_HAVE_FORTRAN_CAPS)
#define fortrancopy_ FORTRANCOPY
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define fortrancopy_ fortrancopy
#endif
/*
   Spelling of the fortranzero_ symbol.
   - PETSC_HAVE_FORTRAN_CAPS defined:            fortranzero_ -> FORTRANZERO
   - neither PETSC_HAVE_FORTRAN_CAPS nor
     PETSC_HAVE_FORTRAN_UNDERSCORE defined:      fortranzero_ -> fortranzero
   - otherwise the name is left as written (trailing underscore).
*/
#if defined(PETSC_HAVE_FORTRAN_CAPS)
#define fortranzero_ FORTRANZERO
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define fortranzero_ fortranzero
#endif
/*
   Spelling of the fortranaypx_ symbol; only considered when
   PETSC_USE_FORTRAN_KERNEL_AYPX is defined.
   - PETSC_HAVE_FORTRAN_CAPS defined:            fortranaypx_ -> FORTRANAYPX
   - neither PETSC_HAVE_FORTRAN_CAPS nor
     PETSC_HAVE_FORTRAN_UNDERSCORE defined:      fortranaypx_ -> fortranaypx
   - otherwise the name is left as written (trailing underscore).
*/
#if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
#if defined(PETSC_HAVE_FORTRAN_CAPS)
#define fortranaypx_ FORTRANAYPX
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define fortranaypx_ fortranaypx
#endif
#endif
/*
   Spelling of the fortranwaxpy_ symbol; only considered when
   PETSC_USE_FORTRAN_KERNEL_WAXPY is defined.
   - PETSC_HAVE_FORTRAN_CAPS defined:            fortranwaxpy_ -> FORTRANWAXPY
   - neither PETSC_HAVE_FORTRAN_CAPS nor
     PETSC_HAVE_FORTRAN_UNDERSCORE defined:      fortranwaxpy_ -> fortranwaxpy
   - otherwise the name is left as written (trailing underscore).
*/
#if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
#if defined(PETSC_HAVE_FORTRAN_CAPS)
#define fortranwaxpy_ FORTRANWAXPY
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define fortranwaxpy_ fortranwaxpy
#endif
#endif
58: #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY)
/*
   Spelling of the fortranmaxpy4_/3_/2_ symbols.
   - PETSC_HAVE_FORTRAN_CAPS defined:        upper-case names, no underscore
   - neither PETSC_HAVE_FORTRAN_CAPS nor
     PETSC_HAVE_FORTRAN_UNDERSCORE defined:  lower-case names, no underscore
   - otherwise the names are left as written (trailing underscore).
*/
#if defined(PETSC_HAVE_FORTRAN_CAPS)
#define fortranmaxpy4_ FORTRANMAXPY4
#define fortranmaxpy3_ FORTRANMAXPY3
#define fortranmaxpy2_ FORTRANMAXPY2
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define fortranmaxpy4_ fortranmaxpy4
#define fortranmaxpy3_ fortranmaxpy3
#define fortranmaxpy2_ fortranmaxpy2
#endif
71: EXTERN void fortranmaxpy4_(void*,void*,void*,void*,void*,void*,void*,void*,void*,PetscInt*);
72: EXTERN void fortranmaxpy3_(void*,void*,void*,void*,void*,void*,void*,PetscInt*);
73: EXTERN void fortranmaxpy2_(void*,void*,void*,void*,void*,PetscInt*);
76: #define APXY(U,a1,p1,n) {PetscBLASInt one=1;\
77: BLaxpy_(&n,&a1,p1,&one,U,&one);}
78: #define APXY2(U,a1,a2,p1,p2,n) { \
79: fortranmaxpy2_(U,&a1,&a2,p1,p2,&n);}
80: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
81: fortranmaxpy3_(U,&a1,&a2,&a3,p1,p2,p3,&n);}
82: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){ \
83: fortranmaxpy4_(U,&a1,&a2,&a3,&a4,p1,p2,p3,p4,&n);}
85: #elif defined(PETSC_USE_UNROLL_KERNELS)
/*
   Hand-unrolled kernels:  U += sum_k a_k * p_k.

   U, the p vectors and n must be modifiable lvalues naming distinct
   variables: the pointers are advanced past the processed entries and n is
   consumed (it ends <= 0 for n >= 0 on entry).  The scalars may be arbitrary
   expressions; they are evaluated once per entry, so they must be free of
   side effects.

   Structure (for n >= 0): the switch peels off the n % 4 leading entries,
   with the cases falling through on purpose.  When something was peeled, n
   is lowered by 4 rather than by the remainder; the loop below still runs
   exactly n/4 times because it stops as soon as n drops to <= 0.
*/
#define APXY(U,Alpha,P,n) {\
  switch ((n) & 0x3) {\
  case 3: *U++ += (Alpha) * *P++; /* fallthrough */\
  case 2: *U++ += (Alpha) * *P++; /* fallthrough */\
  case 1: *U++ += (Alpha) * *P++;\
          n -= 4; /* fallthrough */\
  case 0: break;}\
  while (n>0) {\
    U[0] += (Alpha) * P[0]; U[1] += (Alpha) * P[1];\
    U[2] += (Alpha) * P[2]; U[3] += (Alpha) * P[3];\
    U += 4; P += 4; n -= 4;}}
#define APXY2(U,a1,a2,p1,p2,n) {\
  switch ((n) & 0x3) {\
  case 3: *U++ += (a1) * *p1++ + (a2) * *p2++; /* fallthrough */\
  case 2: *U++ += (a1) * *p1++ + (a2) * *p2++; /* fallthrough */\
  case 1: *U++ += (a1) * *p1++ + (a2) * *p2++;\
          n -= 4; /* fallthrough */\
  case 0: break;}\
  while (n>0) {\
    U[0] += (a1)*p1[0] + (a2)*p2[0];\
    U[1] += (a1)*p1[1] + (a2)*p2[1];\
    U[2] += (a1)*p1[2] + (a2)*p2[2];\
    U[3] += (a1)*p1[3] + (a2)*p2[3];\
    U += 4; p1 += 4; p2 += 4; n -= 4;}}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) {\
  switch ((n) & 0x3) {\
  case 3: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++; /* fallthrough */\
  case 2: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++; /* fallthrough */\
  case 1: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++;\
          n -= 4; /* fallthrough */\
  case 0: break;}\
  while (n>0) {\
    U[0] += (a1)*p1[0] + (a2)*p2[0] + (a3)*p3[0];\
    U[1] += (a1)*p1[1] + (a2)*p2[1] + (a3)*p3[1];\
    U[2] += (a1)*p1[2] + (a2)*p2[2] + (a3)*p3[2];\
    U[3] += (a1)*p1[3] + (a2)*p2[3] + (a3)*p3[3];\
    U += 4; p1 += 4; p2 += 4; p3 += 4; n -= 4;}}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {\
  switch ((n) & 0x3) {\
  case 3: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++ + (a4) * *p4++; /* fallthrough */\
  case 2: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++ + (a4) * *p4++; /* fallthrough */\
  case 1: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++ + (a4) * *p4++;\
          n -= 4; /* fallthrough */\
  case 0: break;}\
  while (n>0) {\
    U[0] += (a1)*p1[0] + (a2)*p2[0] + (a3)*p3[0] + (a4)*p4[0];\
    U[1] += (a1)*p1[1] + (a2)*p2[1] + (a3)*p3[1] + (a4)*p4[1];\
    U[2] += (a1)*p1[2] + (a2)*p2[2] + (a3)*p3[2] + (a4)*p4[2];\
    U[3] += (a1)*p1[3] + (a2)*p2[3] + (a3)*p3[3] + (a4)*p4[3];\
    U += 4; p1 += 4; p2 += 4; p3 += 4; p4 += 4; n -= 4;}}
122: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   Pointer-walking kernels:  U += sum_k a_k * p_k.

   U, the p vectors and n must be modifiable lvalues naming distinct
   variables: the pointers are advanced one entry per pass and n is counted
   down (the loop runs while n-- is non-zero, so n must be >= 0 on entry).
   The scalars may be arbitrary side-effect-free expressions.
*/
#define APXY(U,a1,p1,n) {\
  while (n--) *U++ += (a1) * *p1++;}
#define APXY2(U,a1,a2,p1,p2,n) {\
  while (n--) *U++ += (a1) * *p1++ + (a2) * *p2++;}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) {\
  while (n--) *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++;}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {\
  while (n--) *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++ + (a4) * *p4++;}
133: #elif defined(PETSC_USE_BLAS_KERNELS)
/*
   Kernels that forward to BLaxpy_ / LAgemv_.

   a1 and n are passed by address in APXY and n in APXY2, so they must be
   lvalues there.  None of the arguments is modified by the macros
   themselves.
   NOTE(review): &n is handed to the library next to PetscBLASInt values --
   confirm that the type of n matches what BLaxpy_/LAgemv_ expect.
*/
#define APXY(U,a1,p1,n) {PetscBLASInt one=1;\
  BLaxpy_(&(n),&(a1),p1,&one,U,&one);}

/*
   Two vectors in one LAgemv_("N",...) call: p1 is passed as the matrix with
   2 columns and leading dimension p2-p1, and the two scalars are gathered in
   aa[] as the input vector.
   NOTE(review): this presumably requires p2 to lie at least n entries after
   p1 in the same allocation -- verify against callers.
*/
#define APXY2(U,a1,a2,p1,p2,n) {PetscBLASInt one=1,two=2,off=(PetscBLASInt)((p2)-(p1));\
  PetscScalar fone=1.0,aa[2];\
  aa[0]=(a1);aa[1]=(a2);\
  LAgemv_("N",&(n),&two,&fone,p1,&off,aa,&one,&fone,U,&one);}

/* Three vectors: one APXY2 call followed by one APXY call. */
#define APXY3(U,a1,a2,a3,p1,p2,p3,n){APXY2(U,a1,a2,p1,p2,n);\
  APXY(U,a3,p3,n);}

/* Four vectors: two APXY2 calls. */
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){APXY2(U,a1,a2,p1,p2,n);\
  APXY2(U,a3,a4,p3,p4,n);}
146: #elif defined(PETSC_USE_FOR_KERNELS)
/*
   Index-based kernels:  U += sum_k a_k * p_k.

   No argument is modified, so every argument may be an arbitrary
   side-effect-free expression (each is parenthesized on use).  Nothing is
   done for n <= 0.
*/

/*
   Single vector, two entries per pass.  After the loop __i is the first
   even index >= n-1, so "__i < n" holds exactly when one odd trailing entry
   is left.
*/
#define APXY(U,a1,p1,n) {PetscInt __i;PetscScalar __s1,__s2; \
  for(__i=0;__i<(n)-1;__i+=2){\
    __s1=(a1)*(p1)[__i];__s2=(a1)*(p1)[__i+1];\
    __s1+=(U)[__i];__s2+=(U)[__i+1];\
    (U)[__i]=__s1;(U)[__i+1]=__s2;}\
  if (__i<(n)) (U)[__i] += (a1) * (p1)[__i];}
#define APXY2(U,a1,a2,p1,p2,n) {PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i];}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i];}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i] + (a4)*(p4)[__i];}
159: #else
/*
   Default kernels: plain indexed loops,  U += sum_k a_k * p_k.

   No argument is modified, so every argument may be an arbitrary
   side-effect-free expression (each is parenthesized on use).  Nothing is
   done for n <= 0.
*/

/* Single vector; the scalar is copied into a local once, before the loop. */
#define APXY(U,a1,p1,n) {PetscInt __i;PetscScalar _a1=(a1);\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] += _a1 * (p1)[__i];}
#define APXY2(U,a1,a2,p1,p2,n) {PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i];}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i];}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i] + (a4)*(p4)[__i];}
170: #endif
/* ----------------------------------------------------------------------------
   axpy() with a stride of inc in both U and P
   ---------------------------------------------------------------------------*/
176: #ifdef PETSC_USE_UNROLL_KERNELS
/*
   Strided kernels, two entries per pass:  U[i*inc] += sum_k a_k * p_k[i*inc].

   U, the p vectors and n must be modifiable lvalues naming distinct
   variables: the pointers are advanced by inc per processed entry and n is
   counted down to 0 (for n >= 0 on entry).  The scalars and inc may be
   arbitrary side-effect-free expressions.

   An odd n is made even by handling one entry up front; the loop then
   handles entries 0 and inc of the current position and steps by 2*inc.
*/
#define APXYINC(U,Alpha,P,n,inc) {\
  if ((n) & 0x1) {\
    *U += (Alpha) * *P; U += (inc); P += (inc); n--;}\
  while (n>0) {\
    U[0] += (Alpha) * P[0]; U[(inc)] += (Alpha) * P[(inc)];\
    U += 2*(inc); P += 2*(inc); n -= 2;}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) {\
  if ((n) & 0x1) {\
    *U += (a1) * *p1 + (a2) * *p2;\
    U += (inc); p1 += (inc); p2 += (inc); n--;}\
  while (n>0) {\
    U[0]     += (a1)*p1[0]     + (a2)*p2[0];\
    U[(inc)] += (a1)*p1[(inc)] + (a2)*p2[(inc)];\
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); n -= 2;}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) {\
  if ((n) & 0x1) {\
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3;\
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); n--;}\
  while (n>0) {\
    U[0]     += (a1)*p1[0]     + (a2)*p2[0]     + (a3)*p3[0];\
    U[(inc)] += (a1)*p1[(inc)] + (a2)*p2[(inc)] + (a3)*p3[(inc)];\
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); p3 += 2*(inc); n -= 2;}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) {\
  if ((n) & 0x1) {\
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4;\
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc); n--;}\
  while (n>0) {\
    U[0]     += (a1)*p1[0]     + (a2)*p2[0]     + (a3)*p3[0]     + (a4)*p4[0];\
    U[(inc)] += (a1)*p1[(inc)] + (a2)*p2[(inc)] + (a3)*p3[(inc)] + (a4)*p4[(inc)];\
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); p3 += 2*(inc); p4 += 2*(inc); n -= 2;}}
202: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   Strided pointer-walking kernels:  U[i*inc] += sum_k a_k * p_k[i*inc].

   U, the p vectors and n must be modifiable lvalues naming distinct
   variables: the pointers are advanced by inc per pass and n is counted
   down (the loop runs while n-- is non-zero, so n must be >= 0 on entry).
   The scalars and inc may be arbitrary side-effect-free expressions.
*/
#define APXYINC(U,a1,p1,n,inc) {\
  while (n--) {\
    *U += (a1) * *p1;\
    U += (inc); p1 += (inc);}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) {\
  while (n--) {\
    *U += (a1) * *p1 + (a2) * *p2;\
    U += (inc); p1 += (inc); p2 += (inc);}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc){\
  while (n--) {\
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3;\
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc);}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) {\
  while (n--) {\
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4;\
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc);}}
214: #else
/*
   Default strided kernels:  U[i*inc] += sum_k a_k * p_k[i*inc].
   These need to be converted to for loops.

   U, the p vectors and n must be modifiable lvalues naming distinct
   variables: the pointers are advanced by inc per pass and n is counted
   down (the loop runs while n-- is non-zero, so n must be >= 0 on entry).
   The scalars and inc may be arbitrary side-effect-free expressions.
*/
#define APXYINC(U,a1,p1,n,inc) {\
  while (n--) {\
    *U += (a1) * *p1;\
    U += (inc); p1 += (inc);}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) {\
  while (n--) {\
    *U += (a1) * *p1 + (a2) * *p2;\
    U += (inc); p1 += (inc); p2 += (inc);}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) {\
  while (n--) {\
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3;\
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc);}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc){\
  while (n--) {\
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4;\
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc);}}
226: #endif
/* --------------------------------------------------------------------
   This is aypx:
       for (i=0; i<n; i++)
         y[i] = x[i] + alpha * y[i];
   ---------------------------------------------------------------------*/
233: #if defined(PETSC_USE_UNROLL_KERNELS)
/*
   Hand-unrolled aypx:  U = P + Alpha * U.

   U, P and n must be modifiable lvalues naming distinct variables: the
   pointers are advanced past the processed entries and n is consumed (it
   ends <= 0 for n >= 0 on entry).  Alpha may be an arbitrary
   side-effect-free expression.

   The switch peels off the n % 4 leading entries (cases fall through on
   purpose) and lowers n by 4; the loop then handles the remaining full
   groups of 4 and stops as soon as n drops to <= 0.
*/
#define AYPX(U,Alpha,P,n) {\
  switch ((n) & 0x3) {\
  case 3: *U = *P++ + (Alpha) * *U; U++; /* fallthrough */\
  case 2: *U = *P++ + (Alpha) * *U; U++; /* fallthrough */\
  case 1: *U = *P++ + (Alpha) * *U; U++;\
          n -= 4; /* fallthrough */\
  case 0: break;}\
  while (n>0) {\
    U[0] = P[0] + (Alpha) * U[0];\
    U[1] = P[1] + (Alpha) * U[1];\
    U[2] = P[2] + (Alpha) * U[2];\
    U[3] = P[3] + (Alpha) * U[3];\
    U += 4; P += 4; n -= 4;}}
244: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   Pointer-walking aypx:  U = p1 + a1 * U.

   U, p1 and n must be modifiable lvalues naming distinct variables: the
   pointers are advanced one entry per pass and n is counted down (the loop
   runs while n-- is non-zero, so n must be >= 0 on entry).  a1 may be an
   arbitrary side-effect-free expression.
*/
#define AYPX(U,a1,p1,n) {\
  while (n--) {*U = *p1++ + (a1) * *U; U++;}}
248: #elif defined(PETSC_USE_FOR_KERNELS)
/*
   Index-based aypx, two entries per pass:  U = p1 + a1 * U.

   No argument is modified, so every argument may be an arbitrary
   side-effect-free expression (each is parenthesized on use).  Nothing is
   done for n <= 0.  After the loop __i is the first even index >= n-1, so
   "__i < n" holds exactly when one odd trailing entry is left.
*/
#define AYPX(U,a1,p1,n) {PetscInt __i;PetscScalar __s1,__s2; \
  for(__i=0;__i<(n)-1;__i+=2){\
    __s1=(p1)[__i];__s2=(p1)[__i+1];\
    __s1+=(a1)*(U)[__i];__s2+=(a1)*(U)[__i+1];\
    (U)[__i]=__s1;(U)[__i+1]=__s2;}\
  if (__i<(n)) (U)[__i] = (p1)[__i] + (a1) * (U)[__i];}
255: #else
/*
   Default aypx, plain indexed loop:  U = p1 + a1 * U.

   No argument is modified, so every argument may be an arbitrary
   side-effect-free expression (each is parenthesized on use).  Nothing is
   done for n <= 0.
*/
#define AYPX(U,a1,p1,n) {PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] = (p1)[__i] + (a1) * (U)[__i];}
258: #endif
/* ----------------------------------------------------------------------------------
   Useful for APXY where alpha == -1:  U -= p1.

   No argument is modified, so every argument may be an arbitrary
   side-effect-free expression.  Nothing is done for n <= 0.
  ----------------------------------------------------------------------------------
*/
#define YMX(U,p1,n) {PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] -= (p1)[__i];}
/*
   Useful for APXY where alpha == 1:  U += p1.

   No argument is modified, so every argument may be an arbitrary
   side-effect-free expression.  Nothing is done for n <= 0.
*/
#define YPX(U,p1,n) {PetscInt __i;\
  for(__i=0;__i<(n);__i++)\
    (U)[__i] += (p1)[__i];}
270: #endif