Actual source code: axpy.h
1: /* $Id: axpy.h,v 1.18 2000/08/16 15:17:59 balay Exp $ */
/*
   These are macros for daxpy-like operations. The format is
       APXY(U,Alpha,P,n)
   for
       U += Alpha * P

   In addition, versions that process 2, 3, and 4 vectors are provided;
   these can give significantly better use of memory resources than
   successive calls to the regular daxpy.
*/
14: #ifndef APXY
16: #include "petscblaslapack.h"
18: #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY)
20: #if defined(PETSC_HAVE_FORTRAN_CAPS)
21: #define fortranmaxpy4_ FORTRANMAXPY4
22: #define fortranmaxpy3_ FORTRANMAXPY3
23: #define fortranmaxpy2_ FORTRANMAXPY2
24: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
25: #define fortranmaxpy4_ fortranmaxpy4
26: #define fortranmaxpy3_ fortranmaxpy3
27: #define fortranmaxpy2_ fortranmaxpy2
28: #endif
30: EXTERN_C_BEGIN
31: EXTERN void fortranmaxpy4_(void *,void *,void *,void *,void *,void *,void *,void *,void *,int *);
32: EXTERN void fortranmaxpy3_(void *,void *,void *,void *,void *,void *,void *,int *);
33: EXTERN void fortranmaxpy2_(void *,void *,void *,void *,void *,int *);
34: EXTERN_C_END
/*
   Fortran-kernel versions of U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   APXY forwards to BLaxpy_ with unit strides; APXY2/3/4 forward to the
   fortranmaxpy2_/3_/4_ kernels.  The coefficients and n are passed by
   address (&a1, &n), so they must be addressable lvalues, not constants or
   expressions.  Each macro expands to a braced block; APXY declares a
   local named `one`, so avoid passing an argument with that name.
*/
#define APXY(U,a1,p1,n) {int one=1; \
  BLaxpy_(&n,&a1,p1,&one,U,&one);}
#define APXY2(U,a1,a2,p1,p2,n) { \
  fortranmaxpy2_(U,&a1,&a2,p1,p2,&n);}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  fortranmaxpy3_(U,&a1,&a2,&a3,p1,p2,p3,&n);}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  fortranmaxpy4_(U,&a1,&a2,&a3,&a4,p1,p2,p3,p4,&n);}
45: #elif defined(PETSC_USE_UNROLL_KERNELS)
/*
   Hand-unrolled versions of U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   The switch on (n & 0x3) handles the n mod 4 leftover entries by falling
   through the cases and then subtracts 4 from n; the while loop that
   follows consumes 4 entries per pass while n > 0.  Together they touch
   exactly n entries.  The remainder test is only meaningful for n >= 0.

   Side effects on the caller's variables: U and every p pointer are
   advanced past the processed entries and n is left <= 0.  U, the p's and
   n must therefore be modifiable lvalues, and U must not be the same
   variable as any p.  The coefficients may be arbitrary expressions; they
   are parenthesized in the expansion.

   Each macro expands to a braced block (not do/while(0)) so existing call
   sites with or without a trailing semicolon keep compiling.
*/
#define APXY(U,Alpha,P,n) { \
  switch (n & 0x3) { \
  case 3: *U++ += (Alpha) * *P++; \
  case 2: *U++ += (Alpha) * *P++; \
  case 1: *U++ += (Alpha) * *P++; \
          n -= 4; \
  case 0: break; \
  } \
  while (n>0) { \
    U[0] += (Alpha) * P[0]; U[1] += (Alpha) * P[1]; \
    U[2] += (Alpha) * P[2]; U[3] += (Alpha) * P[3]; \
    U += 4; P += 4; n -= 4; \
  }}
#define APXY2(U,a1,a2,p1,p2,n) { \
  switch (n & 0x3) { \
  case 3: *U++ += (a1) * *p1++ + (a2) * *p2++; \
  case 2: *U++ += (a1) * *p1++ + (a2) * *p2++; \
  case 1: *U++ += (a1) * *p1++ + (a2) * *p2++; \
          n -= 4; \
  case 0: break; \
  } \
  while (n>0) { \
    U[0] += (a1)*p1[0] + (a2)*p2[0]; \
    U[1] += (a1)*p1[1] + (a2)*p2[1]; \
    U[2] += (a1)*p1[2] + (a2)*p2[2]; \
    U[3] += (a1)*p1[3] + (a2)*p2[3]; \
    U += 4; p1 += 4; p2 += 4; n -= 4; \
  }}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  switch (n & 0x3) { \
  case 3: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++; \
  case 2: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++; \
  case 1: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++; \
          n -= 4; \
  case 0: break; \
  } \
  while (n>0) { \
    U[0] += (a1)*p1[0] + (a2)*p2[0] + (a3)*p3[0]; \
    U[1] += (a1)*p1[1] + (a2)*p2[1] + (a3)*p3[1]; \
    U[2] += (a1)*p1[2] + (a2)*p2[2] + (a3)*p3[2]; \
    U[3] += (a1)*p1[3] + (a2)*p2[3] + (a3)*p3[3]; \
    U += 4; p1 += 4; p2 += 4; p3 += 4; n -= 4; \
  }}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  switch (n & 0x3) { \
  case 3: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++ + (a4) * *p4++; \
  case 2: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++ + (a4) * *p4++; \
  case 1: *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++ + (a4) * *p4++; \
          n -= 4; \
  case 0: break; \
  } \
  while (n>0) { \
    U[0] += (a1)*p1[0] + (a2)*p2[0] + (a3)*p3[0] + (a4)*p4[0]; \
    U[1] += (a1)*p1[1] + (a2)*p2[1] + (a3)*p3[1] + (a4)*p4[1]; \
    U[2] += (a1)*p1[2] + (a2)*p2[2] + (a3)*p3[2] + (a4)*p4[2]; \
    U[3] += (a1)*p1[3] + (a2)*p2[3] + (a3)*p3[3] + (a4)*p4[3]; \
    U += 4; p1 += 4; p2 += 4; p3 += 4; p4 += 4; n -= 4; \
  }}
82: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   While-loop versions of U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   Each macro counts n down with `while (n--)`, advancing U and every p by
   one entry per pass.  On exit U and the p's point just past the processed
   entries and n is -1.  The loop terminates normally only for n >= 0.
   U, the p's and n must be modifiable lvalues, and U must not be the same
   variable as any p.  Coefficients may be arbitrary expressions; they are
   parenthesized in the expansion.
*/
#define APXY(U,a1,p1,n) { \
  while (n--) *U++ += (a1) * *p1++;}
#define APXY2(U,a1,a2,p1,p2,n) { \
  while (n--) *U++ += (a1) * *p1++ + (a2) * *p2++;}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
  while (n--) *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++;}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) { \
  while (n--) *U++ += (a1) * *p1++ + (a2) * *p2++ + (a3) * *p3++ + (a4) * *p4++;}
93: #elif defined(PETSC_USE_BLAS_KERNELS)
/*
   BLAS versions of U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   APXY   calls daxpy_ with unit strides.
   APXY2  calls dgemv_ ("N", n rows, 2 columns) on the matrix whose first
          column starts at p1 and whose leading dimension is p2-p1, with
          the coefficient vector {a1,a2}; alpha and beta are both 1.0, so
          the product is accumulated into U.  p1 and p2 must therefore
          point into the same array.
          NOTE(review): BLAS requires lda >= max(1,n), i.e. p2-p1 >= n;
          confirm that callers guarantee this.
   APXY3  is APXY2 on (p1,p2) followed by APXY on p3.
   APXY4  is APXY2 on (p1,p2) followed by APXY2 on (p3,p4).

   n and the coefficient given to APXY are passed by address and must be
   addressable lvalues.  The macros declare locals named one, two, off,
   fone and aa; do not pass arguments with those names.
*/
#define APXY(U,a1,p1,n) {int one=1; \
  daxpy_(&n,&a1,p1,&one,U,&one);}
#define APXY2(U,a1,a2,p1,p2,n) {int one=1,two=2,off=(int)(p2-p1); \
  double fone=1.0,aa[2]; \
  aa[0]=a1;aa[1]=a2; \
  dgemv_("N",&n,&two,&fone,p1,&off,aa,&one,&fone,U,&one,1);}
/* The third vector goes through the single-vector APXY, which takes
   exactly four arguments: (U, coefficient, vector, n). */
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) {APXY2(U,a1,a2,p1,p2,n); \
  APXY(U,a3,p3,n);}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {APXY2(U,a1,a2,p1,p2,n); \
  APXY2(U,a3,a4,p3,p4,n);}
106: #elif defined(PETSC_USE_FOR_KERNELS)
/*
   For-loop versions of U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   APXY processes two entries per pass through the Scalar temporaries
   __s1/__s2 and finishes an odd trailing entry when (n & 0x1) is set.
   APXY2/3/4 are plain indexed loops.  None of them modifies U, the p's or
   n.  All arguments are parenthesized in the expansion, so pointer and
   coefficient expressions such as `x+off` or `a+b` are accepted.  The
   arguments are evaluated once per use, so they should be free of side
   effects.
*/
#define APXY(U,a1,p1,n) {int __i;Scalar __s1,__s2; \
  for(__i=0;__i<(n)-1;__i+=2){ \
    __s1=(a1)*(p1)[__i];__s2=(a1)*(p1)[__i+1]; \
    __s1+=(U)[__i];__s2+=(U)[__i+1]; \
    (U)[__i]=__s1;(U)[__i+1]=__s2;} \
  if ((n) & 0x1) (U)[__i] += (a1) * (p1)[__i];}
#define APXY2(U,a1,a2,p1,p2,n) {int __i; \
  for(__i=0;__i<(n);__i++) \
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i];}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) {int __i; \
  for(__i=0;__i<(n);__i++) \
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i];}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {int __i; \
  for(__i=0;__i<(n);__i++) \
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i] + (a4)*(p4)[__i];}
119: #else
/*
   Default versions of U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))): plain
   indexed for loops over 0..n-1.

   APXY copies its coefficient into the Scalar local _a1 before the loop,
   so that argument is evaluated once; the other macros re-evaluate their
   coefficients every iteration.  None of them modifies U, the p's or n.
   All arguments are parenthesized in the expansion, so pointer and
   coefficient expressions are accepted.
*/
#define APXY(U,a1,p1,n) {int __i;Scalar _a1=(a1); \
  for(__i=0;__i<(n);__i++) \
    (U)[__i] += _a1 * (p1)[__i];}
#define APXY2(U,a1,a2,p1,p2,n) {int __i; \
  for(__i=0;__i<(n);__i++) \
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i];}
#define APXY3(U,a1,a2,a3,p1,p2,p3,n) {int __i; \
  for(__i=0;__i<(n);__i++) \
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i];}
#define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {int __i; \
  for(__i=0;__i<(n);__i++) \
    (U)[__i] += (a1)*(p1)[__i] + (a2)*(p2)[__i] + (a3)*(p3)[__i] + (a4)*(p4)[__i];}
130: #endif
133: /* ----------------------------------------------------------------------------
134: axpy() but for increments of inc in both U and P
135: ---------------------------------------------------------------------------*/
136: #ifdef PETSC_USE_UNROLL_KERNELS
/*
   Unrolled strided versions: n entries, inc apart, of
   U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   An odd n is reduced by one leading entry; the loop then handles two
   entries (offsets 0 and inc) per pass while n > 0.  On exit U and every
   p pointer have been advanced by n*inc from their starting values and n
   is 0 (for a non-negative starting n).  U, the p's and n must be
   modifiable lvalues, and U must not be the same variable as any p.
   Coefficients and inc may be expressions; they are parenthesized in the
   expansion.
*/
#define APXYINC(U,Alpha,P,n,inc) { \
  if (n & 0x1) { \
    *U += (Alpha) * *P; U += (inc); P += (inc); n--;} \
  while (n>0) { \
    U[0] += (Alpha) * P[0]; U[(inc)] += (Alpha) * P[(inc)]; \
    U += 2*(inc); P += 2*(inc); n -= 2;}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) { \
  if (n & 0x1) { \
    *U += (a1) * *p1 + (a2) * *p2; \
    U += (inc); p1 += (inc); p2 += (inc); n--;} \
  while (n>0) { \
    U[0]     += (a1)*p1[0]     + (a2)*p2[0]; \
    U[(inc)] += (a1)*p1[(inc)] + (a2)*p2[(inc)]; \
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); n -= 2;}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) { \
  if (n & 0x1) { \
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); n--;} \
  while (n>0) { \
    U[0]     += (a1)*p1[0]     + (a2)*p2[0]     + (a3)*p3[0]; \
    U[(inc)] += (a1)*p1[(inc)] + (a2)*p2[(inc)] + (a3)*p3[(inc)]; \
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); p3 += 2*(inc); n -= 2;}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) { \
  if (n & 0x1) { \
    *U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc); n--;} \
  while (n>0) { \
    U[0]     += (a1)*p1[0]     + (a2)*p2[0]     + (a3)*p3[0]     + (a4)*p4[0]; \
    U[(inc)] += (a1)*p1[(inc)] + (a2)*p2[(inc)] + (a3)*p3[(inc)] + (a4)*p4[(inc)]; \
    U += 2*(inc); p1 += 2*(inc); p2 += 2*(inc); p3 += 2*(inc); p4 += 2*(inc); n -= 2;}}
162: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   While-loop strided versions: n entries, inc apart, of
   U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   Each pass of `while (n--)` updates *U and then advances U and every p
   by inc.  On exit the pointers have moved by n*inc and n is -1.  The
   loop terminates normally only for n >= 0.  U, the p's and n must be
   modifiable lvalues, and U must not be the same variable as any p.
   Coefficients and inc may be expressions; they are parenthesized in the
   expansion.
*/
#define APXYINC(U,a1,p1,n,inc) { \
  while (n--) {*U += (a1) * *p1; U += (inc); p1 += (inc);}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) { \
  while (n--) {*U += (a1) * *p1 + (a2) * *p2; \
    U += (inc); p1 += (inc); p2 += (inc);}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) { \
  while (n--) {*U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc);}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) { \
  while (n--) {*U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc);}}
174: #else
/* These need to be converted to for loops */
/*
   Default strided versions: n entries, inc apart, of
   U += a1*p1 (+ a2*p2 (+ a3*p3 (+ a4*p4))).

   Each pass of `while (n--)` updates *U and then advances U and every p
   by inc.  On exit the pointers have moved by n*inc and n is -1.  The
   loop terminates normally only for n >= 0.  U, the p's and n must be
   modifiable lvalues, and U must not be the same variable as any p.
   Coefficients and inc may be expressions; they are parenthesized in the
   expansion.
*/
#define APXYINC(U,a1,p1,n,inc) { \
  while (n--) {*U += (a1) * *p1; U += (inc); p1 += (inc);}}
#define APXY2INC(U,a1,a2,p1,p2,n,inc) { \
  while (n--) {*U += (a1) * *p1 + (a2) * *p2; \
    U += (inc); p1 += (inc); p2 += (inc);}}
#define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) { \
  while (n--) {*U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc);}}
#define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) { \
  while (n--) {*U += (a1) * *p1 + (a2) * *p2 + (a3) * *p3 + (a4) * *p4; \
    U += (inc); p1 += (inc); p2 += (inc); p3 += (inc); p4 += (inc);}}
186: #endif
188: /* --------------------------------------------------------------------
189: This is aypx:
190: for (i=0; i<n; i++)
191: y[i] = x[i] + alpha * y[i];
192: ---------------------------------------------------------------------*/
193: #if defined(PETSC_USE_UNROLL_KERNELS)
/*
   Hand-unrolled AYPX: U[i] = P[i] + Alpha * U[i] for n entries.

   The switch on (n & 0x3) handles the n mod 4 leftover entries by falling
   through the cases and then subtracts 4 from n; the while loop that
   follows consumes 4 entries per pass while n > 0.  The remainder test is
   only meaningful for n >= 0.  On exit U and P point just past the
   processed entries and n is <= 0, so all three must be modifiable
   lvalues and U and P must be distinct variables.  Alpha may be an
   expression; it is parenthesized in the expansion.
*/
#define AYPX(U,Alpha,P,n) { \
  switch (n & 0x3) { \
  case 3: *U = *P++ + (Alpha) * *U; U++; \
  case 2: *U = *P++ + (Alpha) * *U; U++; \
  case 1: *U = *P++ + (Alpha) * *U; U++; \
          n -= 4; \
  case 0: break; \
  } \
  while (n>0) { \
    U[0] = P[0] + (Alpha) * U[0]; \
    U[1] = P[1] + (Alpha) * U[1]; \
    U[2] = P[2] + (Alpha) * U[2]; \
    U[3] = P[3] + (Alpha) * U[3]; \
    U += 4; P += 4; n -= 4; \
  }}
204: #elif defined(PETSC_USE_WHILE_KERNELS)
/*
   While-loop AYPX: U[i] = p1[i] + a1 * U[i] for n entries.

   Each pass of `while (n--)` rewrites *U and advances U and p1 by one.
   On exit both pointers sit just past the processed entries and n is -1;
   the loop terminates normally only for n >= 0.  U, p1 and n must be
   modifiable lvalues and U and p1 must be distinct variables.
   The expansion is a braced block: the outer brace opened after the
   parameter list is closed at the end of the loop line.
*/
#define AYPX(U,a1,p1,n) { \
  while (n--) {*U = *p1++ + (a1) * *U; U++;}}
208: #elif defined(PETSC_USE_FOR_KERNELS)
/*
   For-loop AYPX: U[i] = p1[i] + a1 * U[i] for i in 0..n-1.

   Two entries are processed per pass through the Scalar temporaries
   __s1/__s2; an odd trailing entry is finished when (n & 0x1) is set.
   U, p1 and n are not modified.  All arguments are parenthesized in the
   expansion; they are evaluated once per use, so they should be free of
   side effects.
*/
#define AYPX(U,a1,p1,n) {int __i;Scalar __s1,__s2; \
  for(__i=0;__i<(n)-1;__i+=2){ \
    __s1=(p1)[__i];__s2=(p1)[__i+1]; \
    __s1+=(a1)*(U)[__i];__s2+=(a1)*(U)[__i+1]; \
    (U)[__i]=__s1;(U)[__i+1]=__s2;} \
  if ((n) & 0x1) (U)[__i] = (p1)[__i] + (a1) * (U)[__i];}
215: #else
/*
   Default AYPX: U[i] = p1[i] + a1 * U[i] for i in 0..n-1, as a plain
   indexed loop.  U, p1 and n are not modified.  All arguments are
   parenthesized in the expansion; a1 is re-evaluated every iteration, so
   it should be free of side effects.
*/
#define AYPX(U,a1,p1,n) {int __i; \
  for(__i=0;__i<(n);__i++) \
    (U)[__i] = (p1)[__i] + (a1) * (U)[__i];}
218: #endif
220: /* ----------------------------------------------------------------------------------
221: Useful for APXY where alpha == -1
222: ----------------------------------------------------------------------------------
223: */
/*
   YMX: U[i] -= p1[i] for i in 0..n-1.  U, p1 and n are not modified; all
   arguments are parenthesized in the expansion.
*/
#define YMX(U,p1,n) {int __i; \
  for(__i=0;__i<(n);__i++) (U)[__i] -= (p1)[__i];}
/* Useful for APXY where alpha == 1 */
/*
   YPX: U[i] += p1[i] for i in 0..n-1.  U, p1 and n are not modified; all
   arguments are parenthesized in the expansion.
*/
#define YPX(U,p1,n) {int __i; \
  for(__i=0;__i<(n);__i++) (U)[__i] += (p1)[__i];}
230: #endif