Actual source code: sbaijfact.c
1: /*$Id: sbaijfact.c,v 1.59 2001/04/10 22:35:39 balay Exp $*/
2: /* Using Modified Sparse Row (MSR) storage.
3: See page 85, "Iterative Methods ..." by Saad. */
5: /*
6: Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
7: */
8: #include sbaij.h
9: #include src/mat/impls/baij/seq/baij.h
10: #include src/vec/vecimpl.h
11: #include src/inline/ilu.h
12: #include include/petscis.h
14: int MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,Mat *B)
15: {
16: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
17: int *rip,ierr,i,mbs = a->mbs,*ai,*aj;
18: int *jutmp,bs = a->bs,bs2=a->bs2;
19: int m,nzi,realloc = 0;
20: int *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
21: PetscTruth perm_identity;
24: if (!perm) {
25: ISCreateStride(PETSC_COMM_SELF,mbs,0,1,&perm);
26: }
29: /* check whether perm is the identity mapping */
30: ISIdentity(perm,&perm_identity);
31: if (!perm_identity) a->permute = PETSC_TRUE;
32:
33: ISGetIndices(perm,&rip);
34:
35: if (perm_identity){ /* without permutation */
36: ai = a->i; aj = a->j;
37: } else { /* non-trivial permutation */
38: MatReorderingSeqSBAIJ(A, perm);
39: ai = a->inew; aj = a->jnew;
40: }
41:
42: /* initialization */
43: /* Don't know how many column pointers are needed so estimate.
44: Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
45: ierr = PetscMalloc((mbs+1)*sizeof(int),&iu);
46: umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
47: ierr = PetscMalloc(umax*sizeof(int),&ju);
48: iu[0] = mbs+1;
49: juptr = mbs;
50: ierr = PetscMalloc(mbs*sizeof(int),&jl);
51: ierr = PetscMalloc(mbs*sizeof(int),&q);
52: for (i=0; i<mbs; i++){
53: jl[i] = mbs; q[i] = 0;
54: }
56: /* for each row k */
57: for (k=0; k<mbs; k++){
58: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
59: q[k] = mbs;
60: /* initialize nonzero structure of k-th row to row rip[k] of A */
61: jmin = ai[rip[k]];
62: jmax = ai[rip[k]+1];
63: for (j=jmin; j<jmax; j++){
64: vj = rip[aj[j]]; /* col. value */
65: if(vj > k){
66: qm = k;
67: do {
68: m = qm; qm = q[m];
69: } while(qm < vj);
70: if (qm == vj) {
71: printf(" error: duplicate entry in An"); break;
72: }
73: nzk++;
74: q[m] = vj;
75: q[vj] = qm;
76: } /* if(vj > k) */
77: } /* for (j=jmin; j<jmax; j++) */
79: /* modify nonzero structure of k-th row by computing fill-in
80: for each row i to be merged in */
81: i = k;
82: i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
83: /* printf(" next pivot row i=%dn",i); */
84: while (i < mbs){
85: /* merge row i into k-th row */
86: nzi = iu[i+1] - (iu[i]+1);
87: jmin = iu[i] + 1; jmax = iu[i] + nzi;
88: qm = k;
89: for (j=jmin; j<jmax+1; j++){
90: vj = ju[j];
91: do {
92: m = qm; qm = q[m];
93: } while (qm < vj);
94: if (qm != vj){
95: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
96: }
97: }
98: i = jl[i]; /* next pivot row */
99: }
100:
101: /* add k to row list for first nonzero element in k-th row */
102: if (nzk > 0){
103: i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
104: jl[k] = jl[i]; jl[i] = k;
105: }
106: iu[k+1] = iu[k] + nzk; /* printf(" iu[%d]=%d, umax=%dn", k+1, iu[k+1],umax);*/
108: /* allocate more space to ju if needed */
109: if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%dn",k+1, iu[k+1],umax);
110: /* estimate how much additional space we will need */
111: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
112: /* just double the memory each time */
113: maxadd = umax;
114: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
115: umax += maxadd;
117: /* allocate a longer ju */
118: PetscMalloc(umax*sizeof(int),&jutmp);
119: ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
120: ierr = PetscFree(ju);
121: ju = jutmp;
122: realloc++; /* count how many times we realloc */
123: }
125: /* save nonzero structure of k-th row in ju */
126: i=k;
127: jumin = juptr + 1; juptr += nzk;
128: for (j=jumin; j<juptr+1; j++){
129: i=q[i];
130: ju[j]=i;
131: }
132: }
134: if (ai[mbs] != 0) {
135: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
136: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %gn",realloc,f,af);
137: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use n",af);
138: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);n",af);
139: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.n");
140: } else {
141: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.n");
142: }
144: ISRestoreIndices(perm,&rip);
145: PetscFree(q);
146: PetscFree(jl);
148: /* put together the new matrix */
149: MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);
150: /* PetscLogObjectParent(*B,iperm); */
151: b = (Mat_SeqSBAIJ*)(*B)->data;
152: PetscFree(b->imax);
153: b->singlemalloc = PETSC_FALSE;
154: /* the next line frees the default space generated by the Create() */
155: PetscFree(b->a);
156: PetscFree(b->ilen);
157: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
158: b->j = ju;
159: b->i = iu;
160: b->diag = 0;
161: b->ilen = 0;
162: b->imax = 0;
163: b->row = perm;
164: ierr = PetscObjectReference((PetscObject)perm);
165: b->icol = perm;
166: ierr = PetscObjectReference((PetscObject)perm);
167: ierr = PetscMalloc((bs*mbs+bs)*sizeof(Scalar),&b->solve_work);
168: /* In b structure: Free imax, ilen, old a, old j.
169: Allocate idnew, solve_work, new a, new j */
170: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
171: b->s_maxnz = b->s_nz = iu[mbs];
172:
173: (*B)->factor = FACTOR_CHOLESKY;
174: (*B)->info.factor_mallocs = realloc;
175: (*B)->info.fill_ratio_given = f;
176: if (ai[mbs] != 0) {
177: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
178: } else {
179: (*B)->info.fill_ratio_needed = 0.0;
180: }
182: if (perm_identity){
183: switch (bs) {
184: case 1:
185: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
186: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
187: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1n");
188: break;
189: case 2:
190: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
191: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
192: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2n");
193: break;
194: case 3:
195: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
196: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
197: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3n");
198: break;
199: case 4:
200: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
201: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
202: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4n");
203: break;
204: case 5:
205: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
206: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
207: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5n");
208: break;
209: case 6:
210: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
211: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
212: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6n");
213: break;
214: case 7:
215: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
216: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
217: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7n");
218: break;
219: default:
220: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
221: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
222: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7n");
223: break;
224: }
225: }
227: return(0);
228: }
231: int MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
232: {
233: Mat C = *B;
234: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
235: IS perm = b->row;
236: int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
237: int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
238: int bs=a->bs,bs2 = a->bs2;
239: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
240: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
241: MatScalar *work;
242: int *pivots;
246: /* initialization */
247: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
248: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
249: PetscMalloc(2*mbs*sizeof(int),&il);
250: jl = il + mbs;
251: for (i=0; i<mbs; i++) {
252: jl[i] = mbs; il[0] = 0;
253: }
254: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
255: uik = dk + bs2;
256: work = uik + bs2;
257: PetscMalloc(bs*sizeof(int),&pivots);
258:
259: ierr = ISGetIndices(perm,&perm_ptr);
260:
261: /* check permutation */
262: if (!a->permute){
263: ai = a->i; aj = a->j; aa = a->a;
264: } else {
265: ai = a->inew; aj = a->jnew;
266: PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);
267: PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));
268: PetscMalloc(ai[mbs]*sizeof(int),&a2anew);
269: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));
271: for (i=0; i<mbs; i++){
272: jmin = ai[i]; jmax = ai[i+1];
273: for (j=jmin; j<jmax; j++){
274: while (a2anew[j] != j){
275: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
276: for (k1=0; k1<bs2; k1++){
277: dk[k1] = aa[k*bs2+k1];
278: aa[k*bs2+k1] = aa[j*bs2+k1];
279: aa[j*bs2+k1] = dk[k1];
280: }
281: }
282: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
283: if (i > aj[j]){
284: /* printf("change orientation, row: %d, col: %dn",i,aj[j]); */
285: ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */
286: for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
287: for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */
288: for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
289: }
290: }
291: }
292: }
293: PetscFree(a2anew);
294: }
295:
296: /* for each row k */
297: for (k = 0; k<mbs; k++){
299: /*initialize k-th row with elements nonzero in row perm(k) of A */
300: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
301: if (jmin < jmax) {
302: ap = aa + jmin*bs2;
303: for (j = jmin; j < jmax; j++){
304: vj = perm_ptr[aj[j]]; /* block col. index */
305: rtmp_ptr = rtmp + vj*bs2;
306: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
307: }
308: }
310: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
311: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
312: i = jl[k]; /* first row to be added to k_th row */
314: while (i < mbs){
315: nexti = jl[i]; /* next row to be added to k_th row */
317: /* compute multiplier */
318: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
320: /* uik = -inv(Di)*U_bar(i,k) */
321: diag = ba + i*bs2;
322: u = ba + ili*bs2;
323: PetscMemzero(uik,bs2*sizeof(MatScalar));
324: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
325:
326: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
327: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
328:
329: /* update -U(i,k) */
330: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
332: /* add multiple of row i to k-th row ... */
333: jmin = ili + 1; jmax = bi[i+1];
334: if (jmin < jmax){
335: for (j=jmin; j<jmax; j++) {
336: /* rtmp += -U(i,k)^T * U_bar(i,j) */
337: rtmp_ptr = rtmp + bj[j]*bs2;
338: u = ba + j*bs2;
339: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
340: }
341:
342: /* ... add i to row list for next nonzero entry */
343: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
344: j = bj[jmin];
345: jl[i] = jl[j]; jl[j] = i; /* update jl */
346: }
347: i = nexti;
348: }
350: /* save nonzero entries in k-th row of U ... */
352: /* invert diagonal block */
353: diag = ba+k*bs2;
354: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
355: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
356:
357: jmin = bi[k]; jmax = bi[k+1];
358: if (jmin < jmax) {
359: for (j=jmin; j<jmax; j++){
360: vj = bj[j]; /* block col. index of U */
361: u = ba + j*bs2;
362: rtmp_ptr = rtmp + vj*bs2;
363: for (k1=0; k1<bs2; k1++){
364: *u++ = *rtmp_ptr;
365: *rtmp_ptr++ = 0.0;
366: }
367: }
368:
369: /* ... add k to row list for first nonzero entry in k-th row */
370: il[k] = jmin;
371: i = bj[jmin];
372: jl[k] = jl[i]; jl[i] = k;
373: }
374: }
376: PetscFree(rtmp);
377: PetscFree(il);
378: PetscFree(dk);
379: PetscFree(pivots);
380: if (a->permute){
381: PetscFree(aa);
382: }
384: ISRestoreIndices(perm,&perm_ptr);
385: C->factor = FACTOR_CHOLESKY;
386: C->assembled = PETSC_TRUE;
387: C->preallocated = PETSC_TRUE;
388: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
389: return(0);
390: }
392: int MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,Mat *B)
393: {
394: Mat C = *B;
395: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
396: int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
397: int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
398: int bs=a->bs,bs2 = a->bs2;
399: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
400: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
401: MatScalar *work;
402: int *pivots;
406: /* initialization */
407:
408: PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
409: PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
410: PetscMalloc(2*mbs*sizeof(int),&il);
411: jl = il + mbs;
412: for (i=0; i<mbs; i++) {
413: jl[i] = mbs; il[0] = 0;
414: }
415: PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
416: uik = dk + bs2;
417: work = uik + bs2;
418: PetscMalloc(bs*sizeof(int),&pivots);
419:
420: ai = a->i; aj = a->j; aa = a->a;
421:
422: /* for each row k */
423: for (k = 0; k<mbs; k++){
425: /*initialize k-th row with elements nonzero in row k of A */
426: jmin = ai[k]; jmax = ai[k+1];
427: if (jmin < jmax) {
428: ap = aa + jmin*bs2;
429: for (j = jmin; j < jmax; j++){
430: vj = aj[j]; /* block col. index */
431: rtmp_ptr = rtmp + vj*bs2;
432: for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
433: }
434: }
436: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
437: PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
438: i = jl[k]; /* first row to be added to k_th row */
440: while (i < mbs){
441: nexti = jl[i]; /* next row to be added to k_th row */
443: /* compute multiplier */
444: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
446: /* uik = -inv(Di)*U_bar(i,k) */
447: diag = ba + i*bs2;
448: u = ba + ili*bs2;
449: PetscMemzero(uik,bs2*sizeof(MatScalar));
450: Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
451:
452: /* update D(k) += -U(i,k)^T * U_bar(i,k) */
453: Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
454:
455: /* update -U(i,k) */
456: PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));
458: /* add multiple of row i to k-th row ... */
459: jmin = ili + 1; jmax = bi[i+1];
460: if (jmin < jmax){
461: for (j=jmin; j<jmax; j++) {
462: /* rtmp += -U(i,k)^T * U_bar(i,j) */
463: rtmp_ptr = rtmp + bj[j]*bs2;
464: u = ba + j*bs2;
465: Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
466: }
467:
468: /* ... add i to row list for next nonzero entry */
469: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
470: j = bj[jmin];
471: jl[i] = jl[j]; jl[j] = i; /* update jl */
472: }
473: i = nexti;
474: }
476: /* save nonzero entries in k-th row of U ... */
478: /* invert diagonal block */
479: diag = ba+k*bs2;
480: PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
481: Kernel_A_gets_inverse_A(bs,diag,pivots,work);
482:
483: jmin = bi[k]; jmax = bi[k+1];
484: if (jmin < jmax) {
485: for (j=jmin; j<jmax; j++){
486: vj = bj[j]; /* block col. index of U */
487: u = ba + j*bs2;
488: rtmp_ptr = rtmp + vj*bs2;
489: for (k1=0; k1<bs2; k1++){
490: *u++ = *rtmp_ptr;
491: *rtmp_ptr++ = 0.0;
492: }
493: }
494:
495: /* ... add k to row list for first nonzero entry in k-th row */
496: il[k] = jmin;
497: i = bj[jmin];
498: jl[k] = jl[i]; jl[i] = k;
499: }
500: }
502: PetscFree(rtmp);
503: PetscFree(il);
504: PetscFree(dk);
505: PetscFree(pivots);
507: C->factor = FACTOR_CHOLESKY;
508: C->assembled = PETSC_TRUE;
509: C->preallocated = PETSC_TRUE;
510: PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
511: return(0);
512: }
514: /*
515: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
516: Version for blocks 2 by 2.
517: */
518: int MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
519: {
520: Mat C = *B;
521: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
522: IS perm = b->row;
523: int *perm_ptr,ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
524: int *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
525: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
526: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
529:
530: /* initialization */
531: /* il and jl record the first nonzero element in each row of the accessing
532: window U(0:k, k:mbs-1).
533: jl: list of rows to be added to uneliminated rows
534: i>= k: jl(i) is the first row to be added to row i
535: i< k: jl(i) is the row following row i in some list of rows
536: jl(i) = mbs indicates the end of a list
537: il(i): points to the first nonzero element in columns k,...,mbs-1 of
538: row i of U */
539: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
540: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
541: PetscMalloc(2*mbs*sizeof(int),&il);
542: jl = il + mbs;
543: for (i=0; i<mbs; i++) {
544: jl[i] = mbs; il[0] = 0;
545: }
546: PetscMalloc(8*sizeof(MatScalar),&dk);
547: uik = dk + 4;
548: ISGetIndices(perm,&perm_ptr);
550: /* check permutation */
551: if (!a->permute){
552: ai = a->i; aj = a->j; aa = a->a;
553: } else {
554: ai = a->inew; aj = a->jnew;
555: PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);
556: PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));
557: PetscMalloc(ai[mbs]*sizeof(int),&a2anew);
558: PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(int));
560: for (i=0; i<mbs; i++){
561: jmin = ai[i]; jmax = ai[i+1];
562: for (j=jmin; j<jmax; j++){
563: while (a2anew[j] != j){
564: k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
565: for (k1=0; k1<4; k1++){
566: dk[k1] = aa[k*4+k1];
567: aa[k*4+k1] = aa[j*4+k1];
568: aa[j*4+k1] = dk[k1];
569: }
570: }
571: /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
572: if (i > aj[j]){
573: /* printf("change orientation, row: %d, col: %dn",i,aj[j]); */
574: ap = aa + j*4; /* ptr to the beginning of the block */
575: dk[1] = ap[1]; /* swap ap[1] and ap[2] */
576: ap[1] = ap[2];
577: ap[2] = dk[1];
578: }
579: }
580: }
581: PetscFree(a2anew);
582: }
584: /* for each row k */
585: for (k = 0; k<mbs; k++){
587: /*initialize k-th row with elements nonzero in row perm(k) of A */
588: jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
589: if (jmin < jmax) {
590: ap = aa + jmin*4;
591: for (j = jmin; j < jmax; j++){
592: vj = perm_ptr[aj[j]]; /* block col. index */
593: rtmp_ptr = rtmp + vj*4;
594: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
595: }
596: }
598: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
599: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
600: i = jl[k]; /* first row to be added to k_th row */
602: while (i < mbs){
603: nexti = jl[i]; /* next row to be added to k_th row */
605: /* compute multiplier */
606: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
608: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
609: diag = ba + i*4;
610: u = ba + ili*4;
611: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
612: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
613: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
614: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
615:
616: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
617: dk[0] += uik[0]*u[0] + uik[1]*u[1];
618: dk[1] += uik[2]*u[0] + uik[3]*u[1];
619: dk[2] += uik[0]*u[2] + uik[1]*u[3];
620: dk[3] += uik[2]*u[2] + uik[3]*u[3];
622: /* update -U(i,k): ba[ili] = uik */
623: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
625: /* add multiple of row i to k-th row ... */
626: jmin = ili + 1; jmax = bi[i+1];
627: if (jmin < jmax){
628: for (j=jmin; j<jmax; j++) {
629: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
630: rtmp_ptr = rtmp + bj[j]*4;
631: u = ba + j*4;
632: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
633: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
634: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
635: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
636: }
637:
638: /* ... add i to row list for next nonzero entry */
639: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
640: j = bj[jmin];
641: jl[i] = jl[j]; jl[j] = i; /* update jl */
642: }
643: i = nexti;
644: }
646: /* save nonzero entries in k-th row of U ... */
648: /* invert diagonal block */
649: diag = ba+k*4;
650: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
651: Kernel_A_gets_inverse_A_2(diag);
652:
653: jmin = bi[k]; jmax = bi[k+1];
654: if (jmin < jmax) {
655: for (j=jmin; j<jmax; j++){
656: vj = bj[j]; /* block col. index of U */
657: u = ba + j*4;
658: rtmp_ptr = rtmp + vj*4;
659: for (k1=0; k1<4; k1++){
660: *u++ = *rtmp_ptr;
661: *rtmp_ptr++ = 0.0;
662: }
663: }
664:
665: /* ... add k to row list for first nonzero entry in k-th row */
666: il[k] = jmin;
667: i = bj[jmin];
668: jl[k] = jl[i]; jl[i] = k;
669: }
670: }
672: PetscFree(rtmp);
673: PetscFree(il);
674: PetscFree(dk);
675: if (a->permute) {
676: PetscFree(aa);
677: }
678: ISRestoreIndices(perm,&perm_ptr);
679: C->factor = FACTOR_CHOLESKY;
680: C->assembled = PETSC_TRUE;
681: C->preallocated = PETSC_TRUE;
682: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
683: return(0);
684: }
686: /*
687: Version for when blocks are 2 by 2 Using natural ordering
688: */
689: int MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
690: {
691: Mat C = *B;
692: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
693: int ierr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
694: int *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
695: MatScalar *ba = b->a,*aa,*ap,*dk,*uik;
696: MatScalar *u,*diag,*rtmp,*rtmp_ptr;
699:
700: /* initialization */
701: /* il and jl record the first nonzero element in each row of the accessing
702: window U(0:k, k:mbs-1).
703: jl: list of rows to be added to uneliminated rows
704: i>= k: jl(i) is the first row to be added to row i
705: i< k: jl(i) is the row following row i in some list of rows
706: jl(i) = mbs indicates the end of a list
707: il(i): points to the first nonzero element in columns k,...,mbs-1 of
708: row i of U */
709: PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
710: PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
711: PetscMalloc(2*mbs*sizeof(int),&il);
712: jl = il + mbs;
713: for (i=0; i<mbs; i++) {
714: jl[i] = mbs; il[0] = 0;
715: }
716: PetscMalloc(8*sizeof(MatScalar),&dk);
717: uik = dk + 4;
718:
719: ai = a->i; aj = a->j; aa = a->a;
721: /* for each row k */
722: for (k = 0; k<mbs; k++){
724: /*initialize k-th row with elements nonzero in row k of A */
725: jmin = ai[k]; jmax = ai[k+1];
726: if (jmin < jmax) {
727: ap = aa + jmin*4;
728: for (j = jmin; j < jmax; j++){
729: vj = aj[j]; /* block col. index */
730: rtmp_ptr = rtmp + vj*4;
731: for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
732: }
733: }
735: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
736: PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
737: i = jl[k]; /* first row to be added to k_th row */
739: while (i < mbs){
740: nexti = jl[i]; /* next row to be added to k_th row */
742: /* compute multiplier */
743: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
745: /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
746: diag = ba + i*4;
747: u = ba + ili*4;
748: uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
749: uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
750: uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
751: uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
752:
753: /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
754: dk[0] += uik[0]*u[0] + uik[1]*u[1];
755: dk[1] += uik[2]*u[0] + uik[3]*u[1];
756: dk[2] += uik[0]*u[2] + uik[1]*u[3];
757: dk[3] += uik[2]*u[2] + uik[3]*u[3];
759: /* update -U(i,k): ba[ili] = uik */
760: PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));
762: /* add multiple of row i to k-th row ... */
763: jmin = ili + 1; jmax = bi[i+1];
764: if (jmin < jmax){
765: for (j=jmin; j<jmax; j++) {
766: /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
767: rtmp_ptr = rtmp + bj[j]*4;
768: u = ba + j*4;
769: rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
770: rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
771: rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
772: rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
773: }
774:
775: /* ... add i to row list for next nonzero entry */
776: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
777: j = bj[jmin];
778: jl[i] = jl[j]; jl[j] = i; /* update jl */
779: }
780: i = nexti;
781: }
783: /* save nonzero entries in k-th row of U ... */
785: /* invert diagonal block */
786: diag = ba+k*4;
787: PetscMemcpy(diag,dk,4*sizeof(MatScalar));
788: Kernel_A_gets_inverse_A_2(diag);
789:
790: jmin = bi[k]; jmax = bi[k+1];
791: if (jmin < jmax) {
792: for (j=jmin; j<jmax; j++){
793: vj = bj[j]; /* block col. index of U */
794: u = ba + j*4;
795: rtmp_ptr = rtmp + vj*4;
796: for (k1=0; k1<4; k1++){
797: *u++ = *rtmp_ptr;
798: *rtmp_ptr++ = 0.0;
799: }
800: }
801:
802: /* ... add k to row list for first nonzero entry in k-th row */
803: il[k] = jmin;
804: i = bj[jmin];
805: jl[k] = jl[i]; jl[i] = k;
806: }
807: }
809: PetscFree(rtmp);
810: PetscFree(il);
811: PetscFree(dk);
813: C->factor = FACTOR_CHOLESKY;
814: C->assembled = PETSC_TRUE;
815: C->preallocated = PETSC_TRUE;
816: PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
817: return(0);
818: }
820: /*
821: Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
822: Version for blocks are 1 by 1.
823: */
824: int MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
825: {
826: Mat C = *B;
827: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
828: IS ip = b->row;
829: int *rip,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
830: int *ai,*aj,*r;
831: int k,jmin,jmax,*jl,*il,vj,nexti,ili;
832: MatScalar *rtmp;
833: MatScalar *ba = b->a,*aa,ak;
834: MatScalar dk,uikdi;
837: ierr = ISGetIndices(ip,&rip);
838: if (!a->permute){
839: ai = a->i; aj = a->j; aa = a->a;
840: } else {
841: ai = a->inew; aj = a->jnew;
842: PetscMalloc(ai[mbs]*sizeof(MatScalar),&aa);
843: PetscMemcpy(aa,a->a,ai[mbs]*sizeof(MatScalar));
844: PetscMalloc(ai[mbs]*sizeof(int),&r);
845: ierr= PetscMemcpy(r,a->a2anew,(ai[mbs])*sizeof(int));
847: jmin = ai[0]; jmax = ai[mbs];
848: for (j=jmin; j<jmax; j++){
849: while (r[j] != j){
850: k = r[j]; r[j] = r[k]; r[k] = k;
851: ak = aa[k]; aa[k] = aa[j]; aa[j] = ak;
852: }
853: }
854: PetscFree(r);
855: }
856:
857: /* initialization */
858: /* il and jl record the first nonzero element in each row of the accessing
859: window U(0:k, k:mbs-1).
860: jl: list of rows to be added to uneliminated rows
861: i>= k: jl(i) is the first row to be added to row i
862: i< k: jl(i) is the row following row i in some list of rows
863: jl(i) = mbs indicates the end of a list
864: il(i): points to the first nonzero element in columns k,...,mbs-1 of
865: row i of U */
866: PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
867: PetscMalloc(2*mbs*sizeof(int),&il);
868: jl = il + mbs;
869: for (i=0; i<mbs; i++) {
870: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
871: }
873: /* for each row k */
874: for (k = 0; k<mbs; k++){
876: /*initialize k-th row with elements nonzero in row perm(k) of A */
877: jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
878: if (jmin < jmax) {
879: for (j = jmin; j < jmax; j++){
880: vj = rip[aj[j]];
881: rtmp[vj] = aa[j];
882: }
883: }
885: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
886: dk = rtmp[k];
887: i = jl[k]; /* first row to be added to k_th row */
889: while (i < mbs){
890: nexti = jl[i]; /* next row to be added to k_th row */
892: /* compute multiplier, update D(k) and U(i,k) */
893: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
894: uikdi = - ba[ili]*ba[i];
895: dk += uikdi*ba[ili];
896: ba[ili] = uikdi; /* -U(i,k) */
898: /* add multiple of row i to k-th row ... */
899: jmin = ili + 1; jmax = bi[i+1];
900: if (jmin < jmax){
901: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
902: /* ... add i to row list for next nonzero entry */
903: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
904: j = bj[jmin];
905: jl[i] = jl[j]; jl[j] = i; /* update jl */
906: }
907: i = nexti;
908: }
910: /* check for zero pivot and save diagoanl element */
911: if (dk == 0.0){
912: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
913: /*
914: }else if (PetscRealPart(dk) < 0){
915: PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %gn",k,dk);
916: */
917: }
919: /* save nonzero entries in k-th row of U ... */
920: ba[k] = 1.0/dk;
921: jmin = bi[k]; jmax = bi[k+1];
922: if (jmin < jmax) {
923: for (j=jmin; j<jmax; j++){
924: vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
925: }
926: /* ... add k to row list for first nonzero entry in k-th row */
927: il[k] = jmin;
928: i = bj[jmin];
929: jl[k] = jl[i]; jl[i] = k;
930: }
931: }
932:
933: PetscFree(rtmp);
934: PetscFree(il);
935: if (a->permute){
936: PetscFree(aa);
937: }
939: ISRestoreIndices(ip,&rip);
940: C->factor = FACTOR_CHOLESKY;
941: C->assembled = PETSC_TRUE;
942: C->preallocated = PETSC_TRUE;
943: PetscLogFlops(b->mbs);
944: return(0);
945: }
947: /*
948: Version for when blocks are 1 by 1 Using natural ordering
949: */
950: int MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat *B)
951: {
952: Mat C = *B;
953: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
954: int ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
955: int *ai,*aj;
956: int k,jmin,jmax,*jl,*il,vj,nexti,ili;
957: MatScalar *rtmp,*ba = b->a,*aa,dk,uikdi;
960:
961: /* initialization */
962: /* il and jl record the first nonzero element in each row of the accessing
963: window U(0:k, k:mbs-1).
964: jl: list of rows to be added to uneliminated rows
965: i>= k: jl(i) is the first row to be added to row i
966: i< k: jl(i) is the row following row i in some list of rows
967: jl(i) = mbs indicates the end of a list
968: il(i): points to the first nonzero element in columns k,...,mbs-1 of
969: row i of U */
970: PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
971: PetscMalloc(2*mbs*sizeof(int),&il);
972: jl = il + mbs;
973: for (i=0; i<mbs; i++) {
974: rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
975: }
977: ai = a->i; aj = a->j; aa = a->a;
979: /* for each row k */
980: for (k = 0; k<mbs; k++){
982: /*initialize k-th row with elements nonzero in row perm(k) of A */
983: jmin = ai[k]; jmax = ai[k+1];
984: if (jmin < jmax) {
985: for (j = jmin; j < jmax; j++){
986: vj = aj[j];
987: rtmp[vj] = aa[j];
988: }
989: }
991: /* modify k-th row by adding in those rows i with U(i,k) != 0 */
992: dk = rtmp[k];
993: i = jl[k]; /* first row to be added to k_th row */
995: while (i < mbs){
996: nexti = jl[i]; /* next row to be added to k_th row */
998: /* compute multiplier, update D(k) and U(i,k) */
999: ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
1000: uikdi = - ba[ili]*ba[i];
1001: dk += uikdi*ba[ili];
1002: ba[ili] = uikdi; /* -U(i,k) */
1004: /* add multiple of row i to k-th row ... */
1005: jmin = ili + 1; jmax = bi[i+1];
1006: if (jmin < jmax){
1007: for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1008: /* ... add i to row list for next nonzero entry */
1009: il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */
1010: j = bj[jmin];
1011: jl[i] = jl[j]; jl[j] = i; /* update jl */
1012: }
1013: i = nexti;
1014: }
1016: /* check for zero pivot and save diagoanl element */
1017: if (dk == 0.0){
1018: SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot");
1019: /*
1020: }else if (PetscRealPart(dk) < 0){
1021: PetscPrintf(PETSC_COMM_SELF,"Negative pivot: d[%d] = %gn",k,dk);
1022: */
1023: }
1025: /* save nonzero entries in k-th row of U ... */
1026: ba[k] = 1.0/dk;
1027: jmin = bi[k]; jmax = bi[k+1];
1028: if (jmin < jmax) {
1029: for (j=jmin; j<jmax; j++){
1030: vj = bj[j]; ba[j] = rtmp[vj]; rtmp[vj] = 0.0;
1031: }
1032: /* ... add k to row list for first nonzero entry in k-th row */
1033: il[k] = jmin;
1034: i = bj[jmin];
1035: jl[k] = jl[i]; jl[i] = k;
1036: }
1037: }
1038:
1039: PetscFree(rtmp);
1040: PetscFree(il);
1041:
1042: C->factor = FACTOR_CHOLESKY;
1043: C->assembled = PETSC_TRUE;
1044: C->preallocated = PETSC_TRUE;
1045: PetscLogFlops(b->mbs);
1046: return(0);
1047: }
1049: int MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,PetscReal f)
1050: {
1052: Mat C;
1055: MatCholeskyFactorSymbolic(A,perm,f,&C);
1056: MatCholeskyFactorNumeric(A,&C);
1057: MatHeaderCopy(A,C);
1058: return(0);
1059: }