Actual source code: ilu.h

/* $Id: ilu.h,v 1.29 2001/08/07 03:01:46 balay Exp $ */
/*
    Kernels used in sparse ILU (and LU) factorization and in the resulting
  triangular solves. These are for block algorithms where the block sizes are
  on the order of 2-6+.

    There are TWO versions of the macros below:
    1) the standard version, used when MatScalar == PetscScalar; it calls the
       standard BLAS for block sizes larger than 7, and
    2) a hand-coded Fortran version for single precision matrices, needed
       because the BLAS does not accept some arguments in single precision and
       others in double.
*/
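/*
   A minimal sketch of why the second version exists (the typedef shown is an
   assumption about how PETSc selects the matrix precision; the call is only
   illustrative):

      #if defined(PETSC_USE_MAT_SINGLE)
      typedef float       MatScalar;     matrix entries kept in single precision
      #else
      typedef PetscScalar MatScalar;     matrix entries match the vector precision
      #endif

   A standard BLAS routine such as dgemv_() requires the matrix and both
   vectors in the same precision, so when MatScalar is float the hand-coded
   kernels declared further below are used instead; they take the matrix in
   MatScalar and the vectors in PetscScalar, for example

      msgemvp_(&bs,&bs,A,v,w);           w = w + A*v with A float, v,w double
*/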

#if !defined(__ILU_H)
#define __ILU_H

#include "petscblaslapack.h"

/*
      These are C kernels; they are contained in
   src/mat/impls/baij/seq
*/

EXTERN int  LINPACKdgefa(MatScalar *,int,int *);
EXTERN int  LINPACKdgedi(MatScalar *,int,int *,MatScalar*);
EXTERN int  Kernel_A_gets_inverse_A_2(MatScalar *);
EXTERN int  Kernel_A_gets_inverse_A_3(MatScalar *);

#define PETSC_INLINE_INVERT4by4
#if defined(PETSC_INLINE_INVERT4by4)
#define Kernel_A_gets_inverse_A_4(mat) 0;\
{ \
  MatScalar d, di; \
  di = mat[0]; \
  mat[0] = d = 1.0 / di; \
  mat[4] *= -d; \
  mat[8] *= -d; \
  mat[12] *= -d; \
  mat[1] *= d; \
  mat[2] *= d; \
  mat[3] *= d; \
  mat[5] += mat[4] * mat[1] * di; \
  mat[6] += mat[4] * mat[2] * di; \
  mat[7] += mat[4] * mat[3] * di; \
  mat[9] += mat[8] * mat[1] * di; \
  mat[10] += mat[8] * mat[2] * di; \
  mat[11] += mat[8] * mat[3] * di; \
  mat[13] += mat[12] * mat[1] * di; \
  mat[14] += mat[12] * mat[2] * di; \
  mat[15] += mat[12] * mat[3] * di; \
  di = mat[5]; \
  mat[5] = d = 1.0 / di; \
  mat[1] *= -d; \
  mat[9] *= -d; \
  mat[13] *= -d; \
  mat[4] *= d; \
  mat[6] *= d; \
  mat[7] *= d; \
  mat[0] += mat[1] * mat[4] * di; \
  mat[2] += mat[1] * mat[6] * di; \
  mat[3] += mat[1] * mat[7] * di; \
  mat[8] += mat[9] * mat[4] * di; \
  mat[10] += mat[9] * mat[6] * di; \
  mat[11] += mat[9] * mat[7] * di; \
  mat[12] += mat[13] * mat[4] * di; \
  mat[14] += mat[13] * mat[6] * di; \
  mat[15] += mat[13] * mat[7] * di; \
  di = mat[10]; \
  mat[10] = d = 1.0 / di; \
  mat[2] *= -d; \
  mat[6] *= -d; \
  mat[14] *= -d; \
  mat[8] *= d; \
  mat[9] *= d; \
  mat[11] *= d; \
  mat[0] += mat[2] * mat[8] * di; \
  mat[1] += mat[2] * mat[9] * di; \
  mat[3] += mat[2] * mat[11] * di; \
  mat[4] += mat[6] * mat[8] * di; \
  mat[5] += mat[6] * mat[9] * di; \
  mat[7] += mat[6] * mat[11] * di; \
  mat[12] += mat[14] * mat[8] * di; \
  mat[13] += mat[14] * mat[9] * di; \
  mat[15] += mat[14] * mat[11] * di; \
  di = mat[15]; \
  mat[15] = d = 1.0 / di; \
  mat[3] *= -d; \
  mat[7] *= -d; \
  mat[11] *= -d; \
  mat[12] *= d; \
  mat[13] *= d; \
  mat[14] *= d; \
  mat[0] += mat[3] * mat[12] * di; \
  mat[1] += mat[3] * mat[13] * di; \
  mat[2] += mat[3] * mat[14] * di; \
  mat[4] += mat[7] * mat[12] * di; \
  mat[5] += mat[7] * mat[13] * di; \
  mat[6] += mat[7] * mat[14] * di; \
  mat[8] += mat[11] * mat[12] * di; \
  mat[9] += mat[11] * mat[13] * di; \
  mat[10] += mat[11] * mat[14] * di; \
}
#else
EXTERN int  Kernel_A_gets_inverse_A_4(MatScalar *);
# if defined(PETSC_HAVE_SSE)
EXTERN int  Kernel_A_gets_inverse_A_4_SSE(MatScalar *);
# endif
#endif
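/*
   Usage sketch for the 4 by 4 inverse (illustrative only; the variable names
   below are hypothetical).  The inline macro overwrites the block with its
   inverse using a Gauss-Jordan sweep with no pivoting, so it relies on the
   diagonal entries remaining nonzero as the elimination proceeds:

      MatScalar block[16];           4 by 4 block, column major order
      int       ierr;
      ...fill block...
      ierr = Kernel_A_gets_inverse_A_4(block);CHKERRQ(ierr);
      ...block[] now holds inv(block)...
*/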

EXTERN int  Kernel_A_gets_inverse_A_5(MatScalar *);
EXTERN int  Kernel_A_gets_inverse_A_6(MatScalar *);
EXTERN int  Kernel_A_gets_inverse_A_7(MatScalar *);


/*
    A = inv(A)    A_gets_inverse_A

   A      - square bs by bs array stored in column major order
   pivots - integer work array of length bs
   W      - bs by 1 work array
*/
#define Kernel_A_gets_inverse_A(bs,A,pivots,W) \
{ \
  LINPACKdgefa((A),(bs),(pivots)); \
  LINPACKdgedi((A),(bs),(pivots),(W)); \
}
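/*
   Usage sketch (illustrative only; the variable names are hypothetical):
   invert a bs by bs diagonal block via the LINPACK factor/invert pair above.

      int       bs = 5,pivots[5];
      MatScalar block[25],work[5];
      ...fill block, column major order...
      Kernel_A_gets_inverse_A(bs,block,pivots,work);
      ...block[] now holds inv(block)...
*/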

/* -----------------------------------------------------------------------*/

#if !defined(PETSC_USE_MAT_SINGLE)
/*
        Version that calls the BLAS directly
*/
/*
      A = A * B   A_gets_A_times_B

   A, B - square bs by bs arrays stored in column major order
   W    - square bs by bs work array

*/
#define Kernel_A_gets_A_times_B(bs,A,B,W) \
{ \
  PetscScalar _one = 1.0,_zero = 0.0; \
  int    _ierr; \
  _ierr = PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
  BLgemm_("N","N",&(bs),&(bs),&(bs),&_one,(W),&(bs),(B),&(bs),&_zero,(A),&(bs));\
}
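/*
   Usage sketch (illustrative only; the names are hypothetical).  Note that bs
   must be an addressable variable because the macro takes &(bs), and W must
   provide bs*bs entries of scratch storage:

      int       bs = 4;
      MatScalar A[16],B[16],W[16];
      ...
      Kernel_A_gets_A_times_B(bs,A,B,W);      A = A*B
*/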

/*
    A = A - B * C  A_gets_A_minus_B_times_C

   A, B, C - square bs by bs arrays stored in column major order
*/
#define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
{ \
  PetscScalar _mone = -1.0,_one = 1.0; \
  BLgemm_("N","N",&(bs),&(bs),&(bs),&_mone,(B),&(bs),(C),&(bs),&_one,(A),&(bs));\
}
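/*
   A sketch of how the two kernels above combine in the block (I)LU numeric
   factorization (illustrative pseudo-step; the block pointer names are
   hypothetical).  When block row i is eliminated against an earlier row k,
   the multiplier block is formed from the already inverted diagonal block of
   row k and then applied to the remaining blocks of row i:

      Kernel_A_gets_A_times_B(bs,Aik,inv_diag_k,work);         Aik = Aik*inv(Akk)
      for each block column j > k stored in row i:
        Kernel_A_gets_A_minus_B_times_C(bs,Aij,Aik,Akj);       Aij = Aij - Aik*Akj
*/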

/*
    A = A + B^T * C  A_gets_A_plus_Btranspose_times_C

   A, B, C - square bs by bs arrays stored in column major order
*/
#define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
{ \
  PetscScalar _one = 1.0; \
  BLgemm_("T","N",&(bs),&(bs),&(bs),&_one,(B),&(bs),(C),&(bs),&_one,(A),&(bs));\
}

/*
    v = v + A^T w  v_gets_v_plus_Atranspose_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
{  \
  PetscScalar _one = 1.0; \
  int    _ione = 1; \
  LAgemv_("T",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione); \
}

/*
    v = v - A w  v_gets_v_minus_A_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
{  \
  PetscScalar _mone = -1.0,_one = 1.0; \
  int    _ione = 1; \
  LAgemv_("N",&(bs),&(bs),&_mone,A,&(bs),w,&_ione,&_one,v,&_ione); \
}
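/*
   Sketch of the kernel above inside a block forward solve (illustrative only;
   the names are hypothetical).  The contribution of each previously computed
   solution block is subtracted from the current right-hand-side block:

      for each block row i:
        copy b_i into x_i
        for each lower triangular block L_ik with k < i:
          Kernel_v_gets_v_minus_A_times_w(bs,x_i,L_ik,x_k);    x_i = x_i - L_ik*x_k
*/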

/*
    v = v + A w  v_gets_v_plus_A_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
{  \
  PetscScalar _one = 1.0; \
  int    _ione = 1; \
  LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione); \
}

/*
    w = w + A v   w_gets_w_plus_Ar_times_v  (A rectangular)

   v - array of length ncols
   A - bs by ncols array
   w - array of length bs
*/
#define  Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
{  \
  PetscScalar _one = 1.0; \
  int    _ione = 1; \
  LAgemv_("N",&(bs),&(ncols),&_one,A,&(bs),v,&_ione,&_one,w,&_ione); \
}

/*
    w = A v   w_gets_A_times_v

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define Kernel_w_gets_A_times_v(bs,v,A,w) \
{  \
  PetscScalar _zero = 0.0,_one = 1.0; \
  int    _ione = 1; \
  LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),v,&_ione,&_zero,w,&_ione); \
}

/*
        z = A*x   (A is a bs by ncols array, x has ncols entries, z has bs entries)
*/
#define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
{ \
  PetscScalar _one = 1.0,_zero = 0.0; \
  int    _ione = 1; \
  LAgemv_("N",&bs,&ncols,&_one,A,&bs,x,&_ione,&_zero,z,&_ione); \
}
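/*
   Usage sketch (illustrative only; the names are hypothetical).  Here A is a
   rectangular bs by ncols array, for example several bs by bs blocks of one
   block row stored contiguously:

      int         bs = 4,ncols = 12;
      MatScalar   A[48];
      PetscScalar x[12],z[4];
      ...
      Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z);      z = A*x
*/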

/*
        z = z + trans(A)*x   (A is a bs by ncols array, x has bs entries, z has ncols entries)
*/
#define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
{ \
  PetscScalar _one = 1.0; \
  int    _ione = 1; \
  LAgemv_("T",&bs,&ncols,&_one,A,&bs,x,&_ione,&_one,z,&_ione); \
}

#else
/*
       Version that calls Fortran routines; can handle different precisions
   for the matrix (array) and the vectors
*/
/*
     These are Fortran kernels: they replace certain BLAS routines but
   allow some arguments to be in single precision rather than double.
   These routines are provided in src/fortran/kernels/sgemv.F.
   They are pretty pitiful but get the job done. The intention is
   that for the important block sizes (currently 1,2,3,4,5,6,7) custom inlined
   code is used instead.
*/
#ifdef PETSC_HAVE_FORTRAN_CAPS
#define msgemv_  MSGEMV
#define msgemvp_ MSGEMVP
#define msgemvm_ MSGEMVM
#define msgemvt_ MSGEMVT
#define msgemmi_ MSGEMMI
#define msgemm_  MSGEMM
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define msgemv_  msgemv
#define msgemvp_ msgemvp
#define msgemvm_ msgemvm
#define msgemvt_ msgemvt
#define msgemmi_ msgemmi
#define msgemm_  msgemm
#endif
EXTERN_C_BEGIN
EXTERN void msgemv_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
EXTERN void msgemvp_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
EXTERN void msgemvm_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
EXTERN void msgemvt_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
EXTERN void msgemmi_(int *,MatScalar*,MatScalar*,MatScalar*);
EXTERN void msgemm_(int *,MatScalar*,MatScalar*,MatScalar*);
EXTERN_C_END
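/*
   The operations the Fortran kernels perform, as implied by the macros below
   (the matrix argument is in MatScalar precision, the vector arguments in
   PetscScalar precision; m,n are the array dimensions):

      msgemv_ (&m,&n,A,x,y);     y  = A*x
      msgemvp_(&m,&n,A,x,y);     y += A*x
      msgemvm_(&m,&n,A,x,y);     y -= A*x
      msgemvt_(&m,&n,A,x,y);     y += trans(A)*x
      msgemmi_(&bs,A,B,W);       A  = W*B   (W holds a copy of the old A)
      msgemm_ (&bs,A,B,C);       A  = A - B*C
*/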

/*
      A = A * B   A_gets_A_times_B

   A, B - square bs by bs arrays stored in column major order
   W    - square bs by bs work array

*/
#define Kernel_A_gets_A_times_B(bs,A,B,W) \
{ \
  int _ierr; \
  _ierr = PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
  msgemmi_(&bs,A,B,W); \
}

/*
    A = A - B * C  A_gets_A_minus_B_times_C

   A, B, C - square bs by bs arrays stored in column major order
*/
#define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
{ \
  msgemm_(&bs,A,B,C); \
}

/*
    v = v - A w  v_gets_v_minus_A_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
{  \
  msgemvm_(&bs,&bs,A,w,v); \
}

/*
    v = v + A w  v_gets_v_plus_A_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
{  \
  msgemvp_(&bs,&bs,A,w,v); \
}

/*
    w = w + A v   w_gets_w_plus_Ar_times_v  (A rectangular)

   v - array of length ncol
   A - bs by ncol array
   w - array of length bs
*/
#define  Kernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
{  \
  msgemvp_(&bs,&ncol,A,v,w); \
}

/*
    w = A v   w_gets_A_times_v

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define Kernel_w_gets_A_times_v(bs,v,A,w) \
{  \
  msgemv_(&bs,&bs,A,v,w); \
}

/*
        z = A*x   (A is a bs by ncols array, x has ncols entries, z has bs entries)
*/
#define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
{ \
  msgemv_(&bs,&ncols,A,x,z); \
}

/*
        z = z + trans(A)*x   (A is a bs by ncols array)
*/
#define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
{ \
  msgemvt_(&bs,&ncols,A,x,z); \
}

/* These are not implemented yet in the Fortran-kernel version; they expand to nothing */
#define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
#define  Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)


#endif

#endif