Actual source code: sbaijfact.c
1: /*$Id: sbaijfact.c,v 1.61 2001/08/06 21:15:47 bsmith Exp $*/
2: /* Using Modified Sparse Row (MSR) storage.
3: See page 85, "Iterative Methods ..." by Saad. */
5: /*
6: Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
7: */
8: #include src/mat/impls/baij/seq/baij.h
9: #include src/mat/impls/sbaij/seq/sbaij.h
10: #include src/vec/vecimpl.h
11: #include src/inline/ilu.h
12: #include include/petscis.h
14: int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B)
15: {
16: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
17: int *rip,ierr,i,mbs = a->mbs,*ai,*aj;
18: int *jutmp,bs = a->bs,bs2=a->bs2;
19: int m,nzi,realloc = 0;
20: int *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
21: PetscTruth perm_identity;
25: /* check whether perm is the identity mapping */
26: ISIdentity(perm,&perm_identity);
27: if (!perm_identity) a->permute = PETSC_TRUE;
28:
29: ISGetIndices(perm,&rip);
30:
31: if (perm_identity){ /* without permutation */
32: ai = a->i; aj = a->j;
33: } else { /* non-trivial permutation */
34: MatReorderingSeqSBAIJ(A,perm);
35: ai = a->inew; aj = a->jnew;
36: }
37:
38: /* initialization */
39: /* Don't know how many column pointers are needed so estimate.
40: Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
41: ierr = PetscMalloc((mbs+1)*sizeof(int),&iu);
42: umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
43: ierr = PetscMalloc(umax*sizeof(int),&ju);
44: iu[0] = mbs+1;
45: juptr = mbs;
46: ierr = PetscMalloc(mbs*sizeof(int),&jl);
47: ierr = PetscMalloc(mbs*sizeof(int),&q);
48: for (i=0; i<mbs; i++){
49: jl[i] = mbs; q[i] = 0;
50: }
52: /* for each row k */
53: for (k=0; k<mbs; k++){
54: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
55: q[k] = mbs;
56: /* initialize nonzero structure of k-th row to row rip[k] of A */
57: jmin = ai[rip[k]];
58: jmax = ai[rip[k]+1];
59: for (j=jmin; j<jmax; j++){
60: vj = rip[aj[j]]; /* col. value */
61: if(vj > k){
62: qm = k;
63: do {
64: m = qm; qm = q[m];
65: } while(qm < vj);
66: if (qm == vj) {
67: printf(" error: duplicate entry in An"); break;
68: }
69: nzk++;
70: q[m] = vj;
71: q[vj] = qm;
72: } /* if(vj > k) */
73: } /* for (j=jmin; j<jmax; j++) */
75: /* modify nonzero structure of k-th row by computing fill-in
76: for each row i to be merged in */
77: i = k;
78: i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
79: /* printf(" next pivot row i=%dn",i); */
80: while (i < mbs){
81: /* merge row i into k-th row */
82: nzi = iu[i+1] - (iu[i]+1);
83: jmin = iu[i] + 1; jmax = iu[i] + nzi;
84: qm = k;
85: for (j=jmin; j<jmax+1; j++){
86: vj = ju[j];
87: do {
88: m = qm; qm = q[m];
89: } while (qm < vj);
90: if (qm != vj){
91: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
92: }
93: }
94: i = jl[i]; /* next pivot row */
95: }
96:
97: /* add k to row list for first nonzero element in k-th row */
98: if (nzk > 0){
99: i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
100: jl[k] = jl[i]; jl[i] = k;
101: }
102: iu[k+1] = iu[k] + nzk; /* printf(" iu[%d]=%d, umax=%dn", k+1, iu[k+1],umax);*/
104: /* allocate more space to ju if needed */
105: if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%dn",k+1, iu[k+1],umax);
106: /* estimate how much additional space we will need */
107: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
108: /* just double the memory each time */
109: maxadd = umax;
110: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
111: umax += maxadd;
113: /* allocate a longer ju */
114: PetscMalloc(umax*sizeof(int),&jutmp);
115: ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
116: ierr = PetscFree(ju);
117: ju = jutmp;
118: realloc++; /* count how many times we realloc */
119: }
121: /* save nonzero structure of k-th row in ju */
122: i=k;
123: jumin = juptr + 1; juptr += nzk;
124: for (j=jumin; j<juptr+1; j++){
125: i=q[i];
126: ju[j]=i;
127: }
128: }
130: if (ai[mbs] != 0) {
131: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
132: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %gn",realloc,f,af);
133: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use n",af);
134: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);n",af);
135: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.n");
136: } else {
137: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.n");
138: }
140: ISRestoreIndices(perm,&rip);
141: PetscFree(q);
142: PetscFree(jl);
144: /* put together the new matrix */
145: MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);
146: /* PetscLogObjectParent(*B,iperm); */
147: b = (Mat_SeqSBAIJ*)(*B)->data;
148: PetscFree(b->imax);
149: b->singlemalloc = PETSC_FALSE;
150: /* the next line frees the default space generated by the Create() */
151: PetscFree(b->a);
152: PetscFree(b->ilen);
153: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
154: b->j = ju;
155: b->i = iu;
156: b->diag = 0;
157: b->ilen = 0;
158: b->imax = 0;
159: b->row = perm;
160: ierr = PetscObjectReference((PetscObject)perm);
161: b->icol = perm;
162: ierr = PetscObjectReference((PetscObject)perm);
163: ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
164: /* In b structure: Free imax, ilen, old a, old j.
165: Allocate idnew, solve_work, new a, new j */
166: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
167: b->s_maxnz = b->s_nz = iu[mbs];
168:
169: (*B)->factor = FACTOR_CHOLESKY;
170: (*B)->info.factor_mallocs = realloc;
171: (*B)->info.fill_ratio_given = f;
172: if (ai[mbs] != 0) {
173: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
174: } else {
175: (*B)->info.fill_ratio_needed = 0.0;
176: }
178: if (perm_identity){
179: switch (bs) {
180: case 1:
181: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
182: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
183: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1n");
184: break;
185: case 2:
186: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
187: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
188: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2n");
189: break;
190: case 3:
191: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
192: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
193: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3n");
194: break;
195: case 4:
196: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
197: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
198: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4n");
199: break;
200: case 5:
201: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
202: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
203: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5n");
204: break;
205: case 6:
206: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
207: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
208: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6n");
209: break;
210: case 7:
211: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
212: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
213: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7n");
214: break;
215: default:
216: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
217: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
218: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7n");
219: break;
220: }
221: }
223: return(0);
224: }
227: int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
228: {
229: Mat C = *B;
230: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
231: IS perm = b->row;
232: int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
233: int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
234: int bs=a->bs,bs2 = a->bs2;
235: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
236: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
237: MatScalar *work;
238: int *pivots;
242: /* initialization */
243: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
244: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
245: PetscMalloc(2*mbs*sizeof(int),&il);
246: jl = il + mbs;
247: for (i=0; i<mbs; i++) {
248: jl[i] = mbs; il[0] = 0;
249: }
250: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
251: uik = dk + bs2;
252: work = uik + bs2;
253: PetscMalloc(bs*sizeof(int),&pivots);
254:
255: ierr = ISGetIndices(perm,&perm_ptr);
256:
257: /* check permutation */
258: if (!a->permute){
259: ai = a->i; aj = a->j; aa = a->a;
260: } else {
261: ai = a->inew; aj = a->jnew;
262: PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);
263: PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));
264: PetscMalloc(ai[mbs]*sizeof(int),&a2anew);
265: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));
267: for (i=0; i<mbs; i++){
268: jmin = ai[i]; jmax = ai[i+1];
269: for (j=jmin; j<jmax; j++){
270: while (a2anew[j] != j){
271: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
272: for (k1=0; k1<bs2; k1++){
273: dk[k1] = aa[k*bs2+k1];
274: aa[k*bs2+k1] = aa[j*bs2+k1];
275: aa[j*bs2+k1] = dk[k1];
276: }
277: }
278: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
279: if (i > aj[j]){
280: /* printf("change orientation, row: %d, col: %dn",i,aj[j]); */
281: ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */
282: for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
283: for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */
284: for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
285: }
286: }
287: }
288: }
289: PetscFree(a2anew);
290: }
291:
292: /* for each row k */
293: for (k = 0; k<mbs; k++){
295: /*initialize k-th row with elements nonzero in row perm(k) of A */
296: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
297: if (jmin < jmax) {
298: ap = aa + jmin*bs2;
299: for (j = jmin; j < jmax; j++){
300: vj = perm_ptr[aj[j]]; /* block col. index */
301: rtmp_ptr = rtmp + vj*bs2;
302: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
303: }
304: }
306: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
307: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
308: i = jl[k]; /* first row to be added to k_th row */
310: while (i < mbs){
311: nexti = jl[i]; /* next row to be added to k_th row */
313: /* compute multiplier */
314: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
316: /* uik = -inv(Di)*U_bar(i,k) */
317: diag = ba + i*bs2;
318: u = ba + ili*bs2;
319: PetscMemzero(uik,bs2*sizeof(MatScalar));
320: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
321:
322: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
323: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
324:
325: /* update -U(i,k) */
326: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
328: /* add multiple of row i to k-th row ... */
329: jmin = ili + 1; jmax = bi[i+1];
330: if (jmin < jmax){
331: for (j=jmin; j<jmax; j++) {
332: /* rtmp += -U(i,k)^T * U_bar(i,j) */
333: rtmp_ptr = rtmp + bj[j]*bs2;
334: u = ba + j*bs2;
335: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
336: }
337:
338: /* ... add i to row list for next nonzero entry */
339: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
340: j = bj[jmin];
341: jl[i] = jl[j]; jl[j] = i; /* update jl */
342: }
343: i = nexti;
344: }
346: /* save nonzero entries in k-th row of U ... */
348: /* invert diagonal block */
349: diag = ba+k*bs2;
350: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
351: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
352:
353: jmin = bi[k]; jmax = bi[k+1];
354: if (jmin < jmax) {
355: for (j=jmin; j<jmax; j++){
356: vj = bj[j]; /* block col. index of U */
357: u = ba + j*bs2;
358: rtmp_ptr = rtmp + vj*bs2;
359: for (k1=0; k1<bs2; k1++){
360: *u++ = *rtmp_ptr;
361: *rtmp_ptr++ = 0.0;
362: }
363: }
364:
365: /* ... add k to row list for first nonzero entry in k-th row */
366: il[k] = jmin;
367: i = bj[jmin];
368: jl[k] = jl[i]; jl[i] = k;
369: }
370: }
372: PetscFree(rtmp);
373: PetscFree(il);
374: PetscFree(dk);
375: PetscFree(pivots);
376: if (a->permute){
377: PetscFree(aa);
378: }
380: ISRestoreIndices(perm,&perm_ptr);
381: C->factor = FACTOR_CHOLESKY;
382: C->assembled = PETSC_TRUE;
383: C->preallocated = PETSC_TRUE;
384: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
385: return(0);
386: }
388: int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
389: {
390: Mat C = *B;
391: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
392: int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
393: int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
394: int bs=a->bs,bs2 = a->bs2;
395: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
396: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
397: MatScalar *work;
398: int *pivots;
402: /* initialization */
403:
404: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
405: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
406: PetscMalloc(2*mbs*sizeof(int),&il);
407: jl = il + mbs;
408: for (i=0; i<mbs; i++) {
409: jl[i] = mbs; il[0] = 0;
410: }
411: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
412: uik = dk + bs2;
413: work = uik + bs2;
414: PetscMalloc(bs*sizeof(int),&pivots);
415:
416: ai = a->i; aj = a->j; aa = a->a;
417:
418: /* for each row k */
419: for (k = 0; k<mbs; k++){
421: /*initialize k-th row with elements nonzero in row k of A */
422: jmin = ai[k]; jmax = ai[k+1];
423: if (jmin < jmax) {
424: ap = aa + jmin*bs2;
425: for (j = jmin; j < jmax; j++){
426: vj = aj[j]; /* block col. index */
427: rtmp_ptr = rtmp + vj*bs2;
428: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
429: }
430: }
432: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
433: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
434: i = jl[k]; /* first row to be added to k_th row */
436: while (i < mbs){
437: nexti = jl[i]; /* next row to be added to k_th row */
439: /* compute multiplier */
440: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
442: /* uik = -inv(Di)*U_bar(i,k) */
443: diag = ba + i*bs2;
444: u = ba + ili*bs2;
445: PetscMemzero(uik,bs2*sizeof(MatScalar));
446: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
447:
448: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
449: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
450:
451: /* update -U(i,k) */
452: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
454: /* add multiple of row i to k-th row ... */
455: jmin = ili + 1; jmax = bi[i+1];
456: if (jmin < jmax){
457: for (j=jmin; j<jmax; j++) {
458: /* rtmp += -U(i,k)^T * U_bar(i,j) */
459: rtmp_ptr = rtmp + bj[j]*bs2;
460: u = ba + j*bs2;
461: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
462: }
463:
464: /* ... add i to row list for next nonzero entry */
465: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
466: j = bj[jmin];
467: jl[i] = jl[j]; jl[j] = i; /* update jl */
468: }
469: i = nexti;
470: }
472: /* save nonzero entries in k-th row of U ... */
474: /* invert diagonal block */
475: diag = ba+k*bs2;
476: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
477: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
478:
479: jmin = bi[k]; jmax = bi[k+1];
480: if (jmin < jmax) {
481: for (j=jmin; j<jmax; j++){
482: vj = bj[j]; /* block col. index of U */
483: u = ba + j*bs2;
484: rtmp_ptr = rtmp + vj*bs2;
485: for (k1=0; k1<bs2; k1++){
486: *u++ = *rtmp_ptr;
487: *rtmp_ptr++ = 0.0;
488: }
489: }
490:
491: /* ... add k to row list for first nonzero entry in k-th row */
492: il[k] = jmin;
493: i = bj[jmin];
494: jl[k] = jl[i]; jl[i] = k;
495: }
496: }
498: PetscFree(rtmp);
499: PetscFree(il);
500: PetscFree(dk);
501: PetscFree(pivots);
503: C->factor = FACTOR_CHOLESKY;
504: C->assembled = PETSC_TRUE;
505: C->preallocated = PETSC_TRUE;
506: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
507: return(0);
508: }
510: /*
511: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
512: Version for blocks 2 by 2.
513: */
514: int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
515: {
516: Mat C = *B;
517: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
518: IS perm = b->row;
519: int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
520: int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
521: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
522: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
525:
526: /* initialization */
527: /* il and jl record the first nonzero element in each row of the accessing
528: window U(0:k, k:mbs-1).
529: jl: list of rows to be added to uneliminated rows
530: i>= k: jl(i) is the first row to be added to row i
531: i< k: jl(i) is the row following row i in some list of rows
532: jl(i) = mbs indicates the end of a list
533: il(i): points to the first nonzero element in columns k,...,mbs-1 of
534: row i of U */
535: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
536: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
537: PetscMalloc(2*mbs*sizeof(int),&il);
538: jl = il + mbs;
539: for (i=0; i<mbs; i++) {
540: jl[i] = mbs; il[0] = 0;
541: }
542: PetscMalloc(8*sizeof(MatScalar),&dk);
543: uik = dk + 4;
544: ISGetIndices(perm,&perm_ptr);
546: /* check permutation */
547: if (!a->permute){
548: ai = a->i; aj = a->j; aa = a->a;
549: } else {
550: ai = a->inew; aj = a->jnew;
551: PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);
552: PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));
553: PetscMalloc(ai[mbs]*sizeof(int),&a2anew);
554: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));
556: for (i=0; i<mbs; i++){
557: jmin = ai[i]; jmax = ai[i+1];
558: for (j=jmin; j<jmax; j++){
559: while (a2anew[j] != j){
560: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
561: for (k1=0; k1<4; k1++){
562: dk[k1] = aa[k*4+k1];
563: aa[k*4+k1] = aa[j*4+k1];
564: aa[j*4+k1] = dk[k1];
565: }
566: }
567: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
568: if (i > aj[j]){
569: /* printf("change orientation, row: %d, col: %dn",i,aj[j]); */
570: ap = aa + j*4; /* ptr to the beginning of the block */
571: dk[1] = ap[1]; /* swap ap[1] and ap[2] */
572: ap[1] = ap[2];
573: ap[2] = dk[1];
574: }
575: }
576: }
577: PetscFree(a2anew);
578: }
580: /* for each row k */
581: for (k = 0; k<mbs; k++){
583: /*initialize k-th row with elements nonzero in row perm(k) of A */
584: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
585: if (jmin < jmax) {
586: ap = aa + jmin*4;
587: for (j = jmin; j < jmax; j++){
588: vj = perm_ptr[aj[j]]; /* block col. index */
589: rtmp_ptr = rtmp + vj*4;
590: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
591: }
592: }
594: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
595: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
596: i = jl[k]; /* first row to be added to k_th row */
598: while (i < mbs){
599: nexti = jl[i]; /* next row to be added to k_th row */
601: /* compute multiplier */
602: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
604: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
605: diag = ba + i*4;
606: u = ba + ili*4;
607: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
608: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
609: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
610: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
611:
612: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
613: dk[0] += uik[0]*u[0] + uik[1]*u[1];
614: dk[1] += uik[2]*u[0] + uik[3]*u[1];
615: dk[2] += uik[0]*u[2] + uik[1]*u[3];
616: dk[3] += uik[2]*u[2] + uik[3]*u[3];
618: /* update -U(i,k): ba[ili] = uik */
619: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
621: /* add multiple of row i to k-th row ... */
622: jmin = ili + 1; jmax = bi[i+1];
623: if (jmin < jmax){
624: for (j=jmin; j<jmax; j++) {
625: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
626: rtmp_ptr = rtmp + bj[j]*4;
627: u = ba + j*4;
628: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
629: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
630: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
631: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
632: }
633:
634: /* ... add i to row list for next nonzero entry */
635: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
636: j = bj[jmin];
637: jl[i] = jl[j]; jl[j] = i; /* update jl */
638: }
639: i = nexti;
640: }
642: /* save nonzero entries in k-th row of U ... */
644: /* invert diagonal block */
645: diag = ba+k*4;
646: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
647: Kernel_A_gets_inverse_A_2(diag);
648:
649: jmin = bi[k]; jmax = bi[k+1];
650: if (jmin < jmax) {
651: for (j=jmin; j<jmax; j++){
652: vj = bj[j]; /* block col. index of U */
653: u = ba + j*4;
654: rtmp_ptr = rtmp + vj*4;
655: for (k1=0; k1<4; k1++){
656: *u++ = *rtmp_ptr;
657: *rtmp_ptr++ = 0.0;
658: }
659: }
660:
661: /* ... add k to row list for first nonzero entry in k-th row */
662: il[k] = jmin;
663: i = bj[jmin];
664: jl[k] = jl[i]; jl[i] = k;
665: }
666: }
668: PetscFree(rtmp);
669: PetscFree(il);
670: PetscFree(dk);
671: if (a->permute) {
672: PetscFree(aa);
673: }
674: ISRestoreIndices(perm,&perm_ptr);
675: C->factor = FACTOR_CHOLESKY;
676: C->assembled = PETSC_TRUE;
677: C->preallocated = PETSC_TRUE;
678: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
679: return(0);
680: }
682: /*
683: Version for when blocks are 2 by 2 Using natural ordering
684: */
685: int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
686: {
687: Mat C = *B;
688: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
689: int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
690: int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
691: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
692: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
695:
696: /* initialization */
697: /* il and jl record the first nonzero element in each row of the accessing
698: window U(0:k, k:mbs-1).
699: jl: list of rows to be added to uneliminated rows
700: i>= k: jl(i) is the first row to be added to row i
701: i< k: jl(i) is the row following row i in some list of rows
702: jl(i) = mbs indicates the end of a list
703: il(i): points to the first nonzero element in columns k,...,mbs-1 of
704: row i of U */
705: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
706: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
707: PetscMalloc(2*mbs*sizeof(int),&il);
708: jl = il + mbs;
709: for (i=0; i<mbs; i++) {
710: jl[i] = mbs; il[0] = 0;
711: }
712: PetscMalloc(8*sizeof(MatScalar),&dk);
713: uik = dk + 4;
714:
715: ai = a->i; aj = a->j; aa = a->a;
717: /* for each row k */
718: for (k = 0; k<mbs; k++){
720: /*initialize k-th row with elements nonzero in row k of A */
721: jmin = ai[k]; jmax = ai[k+1];
722: if (jmin < jmax) {
723: ap = aa + jmin*4;
724: for (j = jmin; j < jmax; j++){
725: vj = aj[j]; /* block col. index */
726: rtmp_ptr = rtmp + vj*4;
727: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
728: }
729: }
731: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
732: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
733: i = jl[k]; /* first row to be added to k_th row */
735: while (i < mbs){
736: nexti = jl[i]; /* next row to be added to k_th row */
738: /* compute multiplier */
739: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
741: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
742: diag = ba + i*4;
743: u = ba + ili*4;
744: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
745: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
746: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
747: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
748:
749: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
750: dk[0] += uik[0]*u[0] + uik[1]*u[1];
751: dk[1] += uik[2]*u[0] + uik[3]*u[1];
752: dk[2] += uik[0]*u[2] + uik[1]*u[3];
753: dk[3] += uik[2]*u[2] + uik[3]*u[3];
755: /* update -U(i,k): ba[ili] = uik */
756: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
758: /* add multiple of row i to k-th row ... */
759: jmin = ili + 1; jmax = bi[i+1];
760: if (jmin < jmax){
761: for (j=jmin; j<jmax; j++) {
762: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
763: rtmp_ptr = rtmp + bj[j]*4;
764: u = ba + j*4;
765: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
766: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
767: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
768: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
769: }
770:
771: /* ... add i to row list for next nonzero entry */
772: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
773: j = bj[jmin];
774: jl[i] = jl[j]; jl[j] = i; /* update jl */
775: }
776: i = nexti;
777: }
779: /* save nonzero entries in k-th row of U ... */
781: /* invert diagonal block */
782: diag = ba+k*4;
783: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
784: Kernel_A_gets_inverse_A_2(diag);
785:
786: jmin = bi[k]; jmax = bi[k+1];
787: if (jmin < jmax) {
788: for (j=jmin; j<jmax; j++){
789: vj = bj[j]; /* block col. index of U */
790: u = ba + j*4;
791: rtmp_ptr = rtmp + vj*4;
792: for (k1=0; k1<4; k1++){
793: *u++ = *rtmp_ptr;
794: *rtmp_ptr++ = 0.0;
795: }
796: }
797:
798: /* ... add k to row list for first nonzero entry in k-th row */
799: il[k] = jmin;
800: i = bj[jmin];
801: jl[k] = jl[i]; jl[i] = k;
802: }
803: }
805: PetscFree(rtmp);
806: PetscFree(il);
807: PetscFree(dk);
809: C->factor = FACTOR_CHOLESKY;
810: C->assembled = PETSC_TRUE;
811: C->preallocated = PETSC_TRUE;
812: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
813: return(0);
814: }
816: /*
817: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
818: Version for blocks are 1 by 1.
819: */
820: int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
821: {
822: Mat C = *B;
823: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
824: IS ip = b->row;
825: int *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
826: int *ai,*aj,*r;
827: int k,jmin,jmax,*jl,*il,vj,nexti,ili;
828: MatScalar *rtmp;
829: MatScalar *ba = b->a,*aa,ak;
830: MatScalar dk,uikdi;
833: ierr = ISGetIndices(ip,&rip);
834: if (!a->permute){
835: ai = a->i; aj = a->j; aa = a->a;
836: } else {
837: ai = a->inew; aj = a->jnew;
838: PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);
839: PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));
840: PetscMalloc(ai[mbs]*sizeof(int),&r);
841: ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));
843: jmin = ai[0]; jmax = ai[mbs];
844: for (j=jmin; j<jmax; j++){
845: while (r[j] != j){
846: k = r[j]; r[j] = r[k]; r[k] = k;
847: ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
848: }
849: }
850: PetscFree(r);
851: }
852:
853: /* initialization */
854: /* il and jl record the first nonzero element in each row of the accessing
855: window U(0:k, k:mbs-1).
856: jl: list of rows to be added to uneliminated rows
857: i>= k: jl(i) is the first row to be added to row i
858: i< k: jl(i) is the row following row i in some list of rows
859: jl(i) = mbs indicates the end of a list
860: il(i): points to the first nonzero element in columns k,...,mbs-1 of
861: row i of U */
862: PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
863: PetscMalloc(2*mbs*sizeof(int),&il);
864: jl = il + mbs;
865: for (i=0; i<mbs; i++) {
866: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
867: }
869: /* for each row k */
870: for (k = 0; k<mbs; k++){
872: /*initialize k-th row with elements nonzero in row perm(k) of A */
873: jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
874: if (jmin < jmax) {
875: for (j = jmin; j < jmax; j++){
876: vj = rip[aj[j]];
877: rtmp[vj] = aa[j];
878: }
879: }
881: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
882: dk = rtmp[k];
883: i = jl[k]; /* first row to be added to k_th row */
885: while (i < mbs){
886: nexti = jl[i]; /* next row to be added to k_th row */
888: /* compute multiplier, update D(k) and U(i,k) */
889: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
890: uikdi = - ba[ili]*ba[i];
891: dk += uikdi*ba[ili];
892: ba[ili] = uikdi; /* -U(i,k) */
894: /* add multiple of row i to k-th row ... */
895: jmin = ili + 1; jmax = bi[i+1];
896: if (jmin < jmax){
897: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
898: /* ... add i to row list for next nonzero entry */
899: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
900: j = bj[jmin];
901: jl[i] = jl[j]; jl[j] = i; /* update jl */
902: }
903: i = nexti;
904: }
906: /* check for zero pivot and save diagoanl element */
907: if (dk == 0.0){
908: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
909: /*
910: }else if (PetscRealPart(dk) < 0){
911: PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %gn",k,dk);
912: */
913: }
915: /* save nonzero entries in k-th row of U ... */
916: ba[k] = 1.0/dk;
917: jmin = bi[k]; jmax = bi[k+1];
918: if (jmin < jmax) {
919: for (j=jmin; j<jmax; j++){
920: vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
921: }
922: /* ... add k to row list for first nonzero entry in k-th row */
923: il[k] = jmin;
924: i = bj[jmin];
925: jl[k] = jl[i]; jl[i] = k;
926: }
927: }
928:
929: PetscFree(rtmp);
930: PetscFree(il);
931: if (a->permute){
932: PetscFree(aa);
933: }
935: ISRestoreIndices(ip,&rip);
936: C->factor = FACTOR_CHOLESKY;
937: C->assembled = PETSC_TRUE;
938: C->preallocated = PETSC_TRUE;
939: PetscLogFlops(b->mbs);
940: return(0);
941: }
943: /*
944: Version for when blocks are 1 by 1 Using natural ordering
945: */
946: int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
947: {
948: Mat C = *B;
949: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
950: int ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
951: int *ai,*aj;
952: int k,jmin,jmax,*jl,*il,vj,nexti,ili;
953: MatScalar *rtmp,*ba = b->a,*aa,dk,uikdi;
956:
957: /* initialization */
958: /* il and jl record the first nonzero element in each row of the accessing
959: window U(0:k, k:mbs-1).
960: jl: list of rows to be added to uneliminated rows
961: i>= k: jl(i) is the first row to be added to row i
962: i< k: jl(i) is the row following row i in some list of rows
963: jl(i) = mbs indicates the end of a list
964: il(i): points to the first nonzero element in columns k,...,mbs-1 of
965: row i of U */
966: PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
967: PetscMalloc(2*mbs*sizeof(int),&il);
968: jl = il + mbs;
969: for (i=0; i<mbs; i++) {
970: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
971: }
973: ai = a->i; aj = a->j; aa = a->a;
975: /* for each row k */
976: for (k = 0; k<mbs; k++){
978: /*initialize k-th row with elements nonzero in row perm(k) of A */
979: jmin = ai[k]; jmax = ai[k+1];
980: if (jmin < jmax) {
981: for (j = jmin; j < jmax; j++){
982: vj = aj[j];
983: rtmp[vj] = aa[j];
984: }
985: }
987: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
988: dk = rtmp[k];
989: i = jl[k]; /* first row to be added to k_th row */
991: while (i < mbs){
992: nexti = jl[i]; /* next row to be added to k_th row */
994: /* compute multiplier, update D(k) and U(i,k) */
995: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
996: uikdi = - ba[ili]*ba[i];
997: dk += uikdi*ba[ili];
998: ba[ili] = uikdi; /* -U(i,k) */
1000: /* add multiple of row i to k-th row ... */
1001: jmin = ili + 1; jmax = bi[i+1];
1002: if (jmin < jmax){
1003: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1004: /* ... add i to row list for next nonzero entry */
1005: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1006: j = bj[jmin];
1007: jl[i] = jl[j]; jl[j] = i; /* update jl */
1008: }
1009: i = nexti;
1010: }
1012: /* check for zero pivot and save diagoanl element */
1013: if (dk == 0.0){
1014: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1015: /*
1016: }else if (PetscRealPart(dk) < 0){
1017: PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %gn",k,dk);
1018: */
1019: }
1021: /* save nonzero entries in k-th row of U ... */
1022: ba[k] = 1.0/dk;
1023: jmin = bi[k]; jmax = bi[k+1];
1024: if (jmin < jmax) {
1025: for (j=jmin; j<jmax; j++){
1026: vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1027: }
1028: /* ... add k to row list for first nonzero entry in k-th row */
1029: il[k] = jmin;
1030: i = bj[jmin];
1031: jl[k] = jl[i]; jl[i] = k;
1032: }
1033: }
1034:
1035: PetscFree(rtmp);
1036: PetscFree(il);
1037:
1038: C->factor = FACTOR_CHOLESKY;
1039: C->assembled = PETSC_TRUE;
1040: C->preallocated = PETSC_TRUE;
1041: PetscLogFlops(b->mbs);
1042: return(0);
1043: }
1045: int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f)
1046: {
1048: Mat C;
1051: MatCholeskyFactorSymbolic(A,perm,f,&C);
1052: MatCholeskyFactorNumeric(A,&C);
1053: MatHeaderCopy(A,C);
1054: return(0);
1055: }