Actual source code: ilu.h
1: /* $Id: ilu.h,v 1.29 2001/08/07 03:01:46 balay Exp $ */
2: /*
3: Kernels used in sparse ILU (and LU) and in the resulting triangular
4: solves. These are for block algorithms where the block sizes are on
5: the order of 2-6+.
7: There are TWO versions of the macros below.
8: 1) standard for MatScalar == PetscScalar use the standard BLAS for
9: block size larger than 7 and
10: 2) handcoded Fortran single precision for the matrices, since BLAS
11: does not have some arguments in single and some in double.
13: */
17: #include petscblaslapack.h
19: /*
20: These are C kernels,they are contained in
21: src/mat/impls/baij/seq
22: */
24: EXTERN int LINPACKdgefa(MatScalar *,int,int *);
25: EXTERN int LINPACKdgedi(MatScalar *,int,int *,MatScalar*);
26: EXTERN int Kernel_A_gets_inverse_A_2(MatScalar *);
27: EXTERN int Kernel_A_gets_inverse_A_3(MatScalar *);
29: #define PETSC_INLINE_INVERT4by4
30: #if defined(PETSC_INLINE_INVERT4by4)
31: #define Kernel_A_gets_inverse_A_4(mat) 0;
32: {
33: MatScalar d, di;
34:
35: di = mat[0];
36: mat[0] = d = 1.0 / di;
37: mat[4] *= -d;
38: mat[8] *= -d;
39: mat[12] *= -d;
40: mat[1] *= d;
41: mat[2] *= d;
42: mat[3] *= d;
43: mat[5] += mat[4] * mat[1] * di;
44: mat[6] += mat[4] * mat[2] * di;
45: mat[7] += mat[4] * mat[3] * di;
46: mat[9] += mat[8] * mat[1] * di;
47: mat[10] += mat[8] * mat[2] * di;
48: mat[11] += mat[8] * mat[3] * di;
49: mat[13] += mat[12] * mat[1] * di;
50: mat[14] += mat[12] * mat[2] * di;
51: mat[15] += mat[12] * mat[3] * di;
52: di = mat[5];
53: mat[5] = d = 1.0 / di;
54: mat[1] *= -d;
55: mat[9] *= -d;
56: mat[13] *= -d;
57: mat[4] *= d;
58: mat[6] *= d;
59: mat[7] *= d;
60: mat[0] += mat[1] * mat[4] * di;
61: mat[2] += mat[1] * mat[6] * di;
62: mat[3] += mat[1] * mat[7] * di;
63: mat[8] += mat[9] * mat[4] * di;
64: mat[10] += mat[9] * mat[6] * di;
65: mat[11] += mat[9] * mat[7] * di;
66: mat[12] += mat[13] * mat[4] * di;
67: mat[14] += mat[13] * mat[6] * di;
68: mat[15] += mat[13] * mat[7] * di;
69: di = mat[10];
70: mat[10] = d = 1.0 / di;
71: mat[2] *= -d;
72: mat[6] *= -d;
73: mat[14] *= -d;
74: mat[8] *= d;
75: mat[9] *= d;
76: mat[11] *= d;
77: mat[0] += mat[2] * mat[8] * di;
78: mat[1] += mat[2] * mat[9] * di;
79: mat[3] += mat[2] * mat[11] * di;
80: mat[4] += mat[6] * mat[8] * di;
81: mat[5] += mat[6] * mat[9] * di;
82: mat[7] += mat[6] * mat[11] * di;
83: mat[12] += mat[14] * mat[8] * di;
84: mat[13] += mat[14] * mat[9] * di;
85: mat[15] += mat[14] * mat[11] * di;
86: di = mat[15];
87: mat[15] = d = 1.0 / di;
88: mat[3] *= -d;
89: mat[7] *= -d;
90: mat[11] *= -d;
91: mat[12] *= d;
92: mat[13] *= d;
93: mat[14] *= d;
94: mat[0] += mat[3] * mat[12] * di;
95: mat[1] += mat[3] * mat[13] * di;
96: mat[2] += mat[3] * mat[14] * di;
97: mat[4] += mat[7] * mat[12] * di;
98: mat[5] += mat[7] * mat[13] * di;
99: mat[6] += mat[7] * mat[14] * di;
100: mat[8] += mat[11] * mat[12] * di;
101: mat[9] += mat[11] * mat[13] * di;
102: mat[10] += mat[11] * mat[14] * di;
103: }
104: #else
105: EXTERN int Kernel_A_gets_inverse_A_4(MatScalar *);
106: # if defined(PETSC_HAVE_SSE)
107: EXTERN int Kernel_A_gets_inverse_A_4_SSE(MatScalar *);
108: # endif
109: #endif
111: EXTERN int Kernel_A_gets_inverse_A_5(MatScalar *);
112: EXTERN int Kernel_A_gets_inverse_A_6(MatScalar *);
113: EXTERN int Kernel_A_gets_inverse_A_7(MatScalar *);
116: /*
117: A = inv(A) A_gets_inverse_A
119: A - square bs by bs array stored in column major order
120: pivots - integer work array of length bs
121: W - bs by 1 work array
122: */
123: #define Kernel_A_gets_inverse_A(bs,A,pivots,W)
124: {
125: LINPACKdgefa((A),(bs),(pivots));
126: LINPACKdgedi((A),(bs),(pivots),(W));
127: }
129: /* -----------------------------------------------------------------------*/
131: #if !defined(PETSC_USE_MAT_SINGLE)
132: /*
133: Version that calls the BLAS directly
134: */
135: /*
136: A = A * B A_gets_A_times_B
138: A, B - square bs by bs arrays stored in column major order
139: W - square bs by bs work array
141: */
142: #define Kernel_A_gets_A_times_B(bs,A,B,W)
143: {
144: PetscScalar _one = 1.0,_zero = 0.0;
145: int _ierr;
146: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr);
147: BLgemm_("N","N",&(bs),&(bs),&(bs),&_one,(W),&(bs),(B),&(bs),&_zero,(A),&(bs));
148: }
150: /*
152: A = A - B * C A_gets_A_minus_B_times_C
154: A, B, C - square bs by bs arrays stored in column major order
155: */
156: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C)
157: {
158: PetscScalar _mone = -1.0,_one = 1.0;
159: BLgemm_("N","N",&(bs),&(bs),&(bs),&_mone,(B),&(bs),(C),&(bs),&_one,(A),&(bs));
160: }
162: /*
164: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
166: A, B, C - square bs by bs arrays stored in column major order
167: */
168: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
169: {
170: PetscScalar _one = 1.0;
171: BLgemm_("T","N",&(bs),&(bs),&(bs),&_one,(B),&(bs),(C),&(bs),&_one,(A),&(bs));
172: }
174: /*
175: v = v + A^T w v_gets_v_plus_Atranspose_times_w
177: v - array of length bs
178: A - square bs by bs array
179: w - array of length bs
180: */
181: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
182: {
183: PetscScalar _one = 1.0;
184: int _ione = 1;
185: LAgemv_("T",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione);
186: }
188: /*
189: v = v - A w v_gets_v_minus_A_times_w
191: v - array of length bs
192: A - square bs by bs array
193: w - array of length bs
194: */
195: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w)
196: {
197: PetscScalar _mone = -1.0,_one = 1.0;
198: int _ione = 1;
199: LAgemv_("N",&(bs),&(bs),&_mone,A,&(bs),w,&_ione,&_one,v,&_ione);
200: }
202: /*
203: v = v + A w v_gets_v_plus_A_times_w
205: v - array of length bs
206: A - square bs by bs array
207: w - array of length bs
208: */
209: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w)
210: {
211: PetscScalar _one = 1.0;
212: int _ione = 1;
213: LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione);
214: }
216: /*
217: v = v + A w v_gets_v_plus_Ar_times_w
219: v - array of length bs
220: A - square bs by bs array
221: w - array of length bs
222: */
223: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w)
224: {
225: PetscScalar _one = 1.0;
226: int _ione = 1;
227: LAgemv_("N",&(bs),&(ncols),&_one,A,&(bs),v,&_ione,&_one,w,&_ione);
228: }
230: /*
231: w = A v w_gets_A_times_v
233: v - array of length bs
234: A - square bs by bs array
235: w - array of length bs
236: */
237: #define Kernel_w_gets_A_times_v(bs,v,A,w)
238: {
239: PetscScalar _zero = 0.0,_one = 1.0;
240: int _ione = 1;
241: LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),v,&_ione,&_zero,w,&_ione);
242: }
244: /*
245: z = A*x
246: */
247: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z)
248: {
249: PetscScalar _one = 1.0,_zero = 0.0;
250: int _ione = 1;
251: LAgemv_("N",&bs,&ncols,&_one,A,&bs,x,&_ione,&_zero,z,&_ione);
252: }
254: /*
255: z = trans(A)*x
256: */
257: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z)
258: {
259: PetscScalar _one = 1.0;
260: int _ione = 1;
261: LAgemv_("T",&bs,&ncols,&_one,A,&bs,x,&_ione,&_one,z,&_ione);
262: }
264: #else
265: /*
266: Version that calls Fortran routines; can handle different precision
267: of matrix (array) and vectors
268: */
269: /*
270: These are Fortran kernels: They replace certain BLAS routines but
271: have some arguments that may be single precision,rather than double
272: These routines are provided in src/fortran/kernels/sgemv.F
273: They are pretty pitiful but get the job done. The intention is
274: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
275: code is used.
276: */
277: #ifdef PETSC_HAVE_FORTRAN_CAPS
278: #define msgemv_ MSGEMV
279: #define msgemvp_ MSGEMVP
280: #define msgemvm_ MSGEMVM
281: #define msgemvt_ MSGEMVT
282: #define msgemmi_ MSGEMMI
283: #define msgemm_ MSGEMM
284: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
285: #define msgemv_ msgemv
286: #define msgemvp_ msgemvp
287: #define msgemvm_ msgemvm
288: #define msgemvt_ msgemvt
289: #define msgemmi_ msgemmi
290: #define msgemm_ msgemm
291: #endif
292: EXTERN_C_BEGIN
293: EXTERN void msgemv_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
294: EXTERN void msgemvp_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
295: EXTERN void msgemvm_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
296: EXTERN void msgemvt_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
297: EXTERN void msgemmi_(int *,MatScalar*,MatScalar*,MatScalar*);
298: EXTERN void msgemm_(int *,MatScalar*,MatScalar*,MatScalar*);
299: EXTERN_C_END
301: /*
302: A = A * B A_gets_A_times_B
304: A, B - square bs by bs arrays stored in column major order
305: W - square bs by bs work array
307: */
308: #define Kernel_A_gets_A_times_B(bs,A,B,W)
309: {
310: int _ierr;
311: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr);
312: msgemmi_(&bs,A,B,W);
313: }
315: /*
317: A = A - B * C A_gets_A_minus_B_times_C
319: A, B, C - square bs by bs arrays stored in column major order
320: */
321: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C)
322: {
323: msgemm_(&bs,A,B,C);
324: }
326: /*
327: v = v - A w v_gets_v_minus_A_times_w
329: v - array of length bs
330: A - square bs by bs array
331: w - array of length bs
332: */
333: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w)
334: {
335: msgemvm_(&bs,&bs,A,w,v);
336: }
338: /*
339: v = v + A w v_gets_v_plus_A_times_w
341: v - array of length bs
342: A - square bs by bs array
343: w - array of length bs
344: */
345: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w)
346: {
347: msgemvp_(&bs,&bs,A,w,v);
348: }
350: /*
351: v = v + A w v_gets_v_plus_Ar_times_w
353: v - array of length bs
354: A - square bs by bs array
355: w - array of length bs
356: */
357: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w)
358: {
359: msgemvp_(&bs,&ncol,A,v,w);
360: }
362: /*
363: w = A v w_gets_A_times_v
365: v - array of length bs
366: A - square bs by bs array
367: w - array of length bs
368: */
369: #define Kernel_w_gets_A_times_v(bs,v,A,w)
370: {
371: msgemv_(&bs,&bs,A,v,w);
372: }
373:
374: /*
375: z = A*x
376: */
377: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z)
378: {
379: msgemv_(&bs,&ncols,A,x,z);
380: }
382: /*
383: z = trans(A)*x
384: */
385: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z)
386: {
387: msgemvt_(&bs,&ncols,A,x,z);
388: }
390: /* These do not work yet */
391: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
392: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
395: #endif
397: #endif