Actual source code: sbaijfact2.c
1: /*$Id: sbaijfact2.c,v 1.2.1.41 2001/08/07 03:03:01 balay Exp $*/
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include src/mat/impls/sbaij/seq/sbaij.h
7: #include src/mat/impls/baij/seq/baij.h
8: #include src/vec/vecimpl.h
9: #include src/inline/ilu.h
10: #include src/inline/dot.h
14: int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
15: {
17: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
18: }
22: int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
23: {
25: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
26: }
30: int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
31: {
33: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
34: }
38: int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
39: {
41: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
42: }
46: int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
47: {
49: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
50: }
54: int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
55: {
57: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
58: }
62: int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
63: {
65: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
66: }
70: int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
71: {
73: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
74: }
78: int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
79: {
81: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
82: }
86: int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
87: {
89: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
90: }
94: int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
95: {
97: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
98: }
102: int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
103: {
105: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
106: }
110: int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
111: {
113: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
114: }
118: int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
119: {
121: SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
122: }
126: int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
127: {
128: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
129: IS isrow=a->row;
130: int mbs=a->mbs,*ai=a->i,*aj=a->j;
131: int nz,*vj,k,ierr,*r,idx,k1;
132: int bs=a->bs,bs2 = a->bs2;
133: MatScalar *aa=a->a,*v,*diag;
134: PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t;
137: VecGetArray(bb,&b);
138: VecGetArray(xx,&x);
139: t = a->solve_work;
140: ISGetIndices(isrow,&r);
141: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
143: /* solve U^T * D * y = b by forward substitution */
144: xk = t;
145: for (k=0; k<mbs; k++) { /* t <- perm(b) */
146: idx = bs*r[k];
147: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
148: }
149: for (k=0; k<mbs; k++){
150: v = aa + bs2*ai[k];
151: xk = t + k*bs; /* Dk*xk = k-th block of x */
152: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
153: nz = ai[k+1] - ai[k];
154: vj = aj + ai[k];
155: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
156: while (nz--) {
157: /* x(:) += U(k,:)^T*(Dk*xk) */
158: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
159: vj++; xj = t + (*vj)*bs;
160: v += bs2;
161: }
162: /* xk = inv(Dk)*(Dk*xk) */
163: diag = aa+k*bs2; /* ptr to inv(Dk) */
164: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
165: }
167: /* solve U*x = y by back substitution */
168: for (k=mbs-1; k>=0; k--){
169: v = aa + bs2*ai[k];
170: xk = t + k*bs; /* xk */
171: nz = ai[k+1] - ai[k];
172: vj = aj + ai[k];
173: xj = t + (*vj)*bs;
174: while (nz--) {
175: /* xk += U(k,:)*x(:) */
176: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
177: vj++;
178: v += bs2; xj = t + (*vj)*bs;
179: }
180: idx = bs*r[k];
181: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
182: }
184: PetscFree(xk_tmp);
185: VecRestoreArray(bb,&b);
186: VecRestoreArray(xx,&x);
187: PetscLogFlops(bs2*(2*a->s_nz + mbs));
188: return(0);
189: }
193: int MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
194: {
195: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
196: int mbs=a->mbs,*ai=a->i,*aj=a->j;
197: int nz,*vj,k,ierr;
198: int bs=a->bs,bs2 = a->bs2;
199: MatScalar *aa=a->a,*v,*diag;
200: PetscScalar *x,*xk,*xj,*b,*xk_tmp;
203:
204: VecGetArray(bb,&b);
205: VecGetArray(xx,&x);
207: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
209: /* solve U^T * D * y = b by forward substitution */
210: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
211: for (k=0; k<mbs; k++){
212: v = aa + bs2*ai[k];
213: xk = x + k*bs; /* Dk*xk = k-th block of x */
214: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
215: nz = ai[k+1] - ai[k];
216: vj = aj + ai[k];
217: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
218: while (nz--) {
219: /* x(:) += U(k,:)^T*(Dk*xk) */
220: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
221: vj++; xj = x + (*vj)*bs;
222: v += bs2;
223: }
224: /* xk = inv(Dk)*(Dk*xk) */
225: diag = aa+k*bs2; /* ptr to inv(Dk) */
226: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
227: }
229: /* solve U*x = y by back substitution */
230: for (k=mbs-1; k>=0; k--){
231: v = aa + bs2*ai[k];
232: xk = x + k*bs; /* xk */
233: nz = ai[k+1] - ai[k];
234: vj = aj + ai[k];
235: xj = x + (*vj)*bs;
236: while (nz--) {
237: /* xk += U(k,:)*x(:) */
238: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
239: vj++;
240: v += bs2; xj = x + (*vj)*bs;
241: }
242: }
244: PetscFree(xk_tmp);
245: VecRestoreArray(bb,&b);
246: VecRestoreArray(xx,&x);
247: PetscLogFlops(bs2*(2*a->s_nz + mbs));
248: return(0);
249: }
253: int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
254: {
255: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
256: IS isrow=a->row;
257: int mbs=a->mbs,*ai=a->i,*aj=a->j;
258: int nz,*vj,k,ierr,*r,idx;
259: MatScalar *aa=a->a,*v,*d;
260: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
263: VecGetArray(bb,&b);
264: VecGetArray(xx,&x);
265: t = a->solve_work;
266: ISGetIndices(isrow,&r);
268: /* solve U^T * D * y = b by forward substitution */
269: tp = t;
270: for (k=0; k<mbs; k++) { /* t <- perm(b) */
271: idx = 7*r[k];
272: tp[0] = b[idx];
273: tp[1] = b[idx+1];
274: tp[2] = b[idx+2];
275: tp[3] = b[idx+3];
276: tp[4] = b[idx+4];
277: tp[5] = b[idx+5];
278: tp[6] = b[idx+6];
279: tp += 7;
280: }
281:
282: for (k=0; k<mbs; k++){
283: v = aa + 49*ai[k];
284: vj = aj + ai[k];
285: tp = t + k*7;
286: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
287: nz = ai[k+1] - ai[k];
288: tp = t + (*vj)*7;
289: while (nz--) {
290: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
291: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
292: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
293: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
294: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
295: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
296: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
297: vj++; tp = t + (*vj)*7;
298: v += 49;
299: }
301: /* xk = inv(Dk)*(Dk*xk) */
302: d = aa+k*49; /* ptr to inv(Dk) */
303: tp = t + k*7;
304: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
305: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
306: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
307: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
308: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
309: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
310: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
311: }
313: /* solve U*x = y by back substitution */
314: for (k=mbs-1; k>=0; k--){
315: v = aa + 49*ai[k];
316: vj = aj + ai[k];
317: tp = t + k*7;
318: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
319: nz = ai[k+1] - ai[k];
320:
321: tp = t + (*vj)*7;
322: while (nz--) {
323: /* xk += U(k,:)*x(:) */
324: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
325: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
326: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
327: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
328: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
329: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
330: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
331: vj++; tp = t + (*vj)*7;
332: v += 49;
333: }
334: tp = t + k*7;
335: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
336: idx = 7*r[k];
337: x[idx] = x0;
338: x[idx+1] = x1;
339: x[idx+2] = x2;
340: x[idx+3] = x3;
341: x[idx+4] = x4;
342: x[idx+5] = x5;
343: x[idx+6] = x6;
344: }
346: VecRestoreArray(bb,&b);
347: VecRestoreArray(xx,&x);
348: PetscLogFlops(49*(2*a->s_nz + mbs));
349: return(0);
350: }
354: int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
355: {
356: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
357: int mbs=a->mbs,*ai=a->i,*aj=a->j;
358: MatScalar *aa=a->a,*v,*d;
359: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
360: int nz,*vj,k,ierr;
363:
364: VecGetArray(bb,&b);
365: VecGetArray(xx,&x);
366:
367: /* solve U^T * D * y = b by forward substitution */
368: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
369: for (k=0; k<mbs; k++){
370: v = aa + 49*ai[k];
371: xp = x + k*7;
372: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
373: nz = ai[k+1] - ai[k];
374: vj = aj + ai[k];
375: xp = x + (*vj)*7;
376: while (nz--) {
377: /* x(:) += U(k,:)^T*(Dk*xk) */
378: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
379: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
380: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
381: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
382: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
383: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
384: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
385: vj++; xp = x + (*vj)*7;
386: v += 49;
387: }
388: /* xk = inv(Dk)*(Dk*xk) */
389: d = aa+k*49; /* ptr to inv(Dk) */
390: xp = x + k*7;
391: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
392: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
393: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
394: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
395: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
396: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
397: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
398: }
400: /* solve U*x = y by back substitution */
401: for (k=mbs-1; k>=0; k--){
402: v = aa + 49*ai[k];
403: xp = x + k*7;
404: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
405: nz = ai[k+1] - ai[k];
406: vj = aj + ai[k];
407: xp = x + (*vj)*7;
408: while (nz--) {
409: /* xk += U(k,:)*x(:) */
410: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
411: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
412: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
413: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
414: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
415: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
416: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
417: vj++;
418: v += 49; xp = x + (*vj)*7;
419: }
420: xp = x + k*7;
421: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
422: }
424: VecRestoreArray(bb,&b);
425: VecRestoreArray(xx,&x);
426: PetscLogFlops(49*(2*a->s_nz + mbs));
427: return(0);
428: }
432: int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
433: {
434: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
435: IS isrow=a->row;
436: int mbs=a->mbs,*ai=a->i,*aj=a->j;
437: int nz,*vj,k,ierr,*r,idx;
438: MatScalar *aa=a->a,*v,*d;
439: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
442: VecGetArray(bb,&b);
443: VecGetArray(xx,&x);
444: t = a->solve_work;
445: ISGetIndices(isrow,&r);
447: /* solve U^T * D * y = b by forward substitution */
448: tp = t;
449: for (k=0; k<mbs; k++) { /* t <- perm(b) */
450: idx = 6*r[k];
451: tp[0] = b[idx];
452: tp[1] = b[idx+1];
453: tp[2] = b[idx+2];
454: tp[3] = b[idx+3];
455: tp[4] = b[idx+4];
456: tp[5] = b[idx+5];
457: tp += 6;
458: }
459:
460: for (k=0; k<mbs; k++){
461: v = aa + 36*ai[k];
462: vj = aj + ai[k];
463: tp = t + k*6;
464: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
465: nz = ai[k+1] - ai[k];
466: tp = t + (*vj)*6;
467: while (nz--) {
468: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
469: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
470: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
471: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
472: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
473: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
474: vj++; tp = t + (*vj)*6;
475: v += 36;
476: }
478: /* xk = inv(Dk)*(Dk*xk) */
479: d = aa+k*36; /* ptr to inv(Dk) */
480: tp = t + k*6;
481: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
482: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
483: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
484: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
485: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
486: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
487: }
489: /* solve U*x = y by back substitution */
490: for (k=mbs-1; k>=0; k--){
491: v = aa + 36*ai[k];
492: vj = aj + ai[k];
493: tp = t + k*6;
494: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
495: nz = ai[k+1] - ai[k];
496:
497: tp = t + (*vj)*6;
498: while (nz--) {
499: /* xk += U(k,:)*x(:) */
500: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
501: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
502: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
503: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
504: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
505: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
506: vj++; tp = t + (*vj)*6;
507: v += 36;
508: }
509: tp = t + k*6;
510: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
511: idx = 6*r[k];
512: x[idx] = x0;
513: x[idx+1] = x1;
514: x[idx+2] = x2;
515: x[idx+3] = x3;
516: x[idx+4] = x4;
517: x[idx+5] = x5;
518: }
520: VecRestoreArray(bb,&b);
521: VecRestoreArray(xx,&x);
522: PetscLogFlops(36*(2*a->s_nz + mbs));
523: return(0);
524: }
528: int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
529: {
530: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
531: int mbs=a->mbs,*ai=a->i,*aj=a->j;
532: MatScalar *aa=a->a,*v,*d;
533: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4,x5;
534: int nz,*vj,k,ierr;
537:
538: VecGetArray(bb,&b);
539: VecGetArray(xx,&x);
540:
541: /* solve U^T * D * y = b by forward substitution */
542: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
543: for (k=0; k<mbs; k++){
544: v = aa + 36*ai[k];
545: xp = x + k*6;
546: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
547: nz = ai[k+1] - ai[k];
548: vj = aj + ai[k];
549: xp = x + (*vj)*6;
550: while (nz--) {
551: /* x(:) += U(k,:)^T*(Dk*xk) */
552: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
553: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
554: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
555: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
556: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
557: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
558: vj++; xp = x + (*vj)*6;
559: v += 36;
560: }
561: /* xk = inv(Dk)*(Dk*xk) */
562: d = aa+k*36; /* ptr to inv(Dk) */
563: xp = x + k*6;
564: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
565: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
566: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
567: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
568: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
569: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
570: }
572: /* solve U*x = y by back substitution */
573: for (k=mbs-1; k>=0; k--){
574: v = aa + 36*ai[k];
575: xp = x + k*6;
576: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
577: nz = ai[k+1] - ai[k];
578: vj = aj + ai[k];
579: xp = x + (*vj)*6;
580: while (nz--) {
581: /* xk += U(k,:)*x(:) */
582: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
583: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
584: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
585: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
586: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
587: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
588: vj++;
589: v += 36; xp = x + (*vj)*6;
590: }
591: xp = x + k*6;
592: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
593: }
595: VecRestoreArray(bb,&b);
596: VecRestoreArray(xx,&x);
597: PetscLogFlops(36*(2*a->s_nz + mbs));
598: return(0);
599: }
603: int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
604: {
605: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
606: IS isrow=a->row;
607: int mbs=a->mbs,*ai=a->i,*aj=a->j;
608: int nz,*vj,k,ierr,*r,idx;
609: MatScalar *aa=a->a,*v,*diag;
610: PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp;
613: VecGetArray(bb,&b);
614: VecGetArray(xx,&x);
615: t = a->solve_work;
616: ISGetIndices(isrow,&r);
618: /* solve U^T * D * y = b by forward substitution */
619: tp = t;
620: for (k=0; k<mbs; k++) { /* t <- perm(b) */
621: idx = 5*r[k];
622: tp[0] = b[idx];
623: tp[1] = b[idx+1];
624: tp[2] = b[idx+2];
625: tp[3] = b[idx+3];
626: tp[4] = b[idx+4];
627: tp += 5;
628: }
629:
630: for (k=0; k<mbs; k++){
631: v = aa + 25*ai[k];
632: vj = aj + ai[k];
633: tp = t + k*5;
634: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
635: nz = ai[k+1] - ai[k];
637: tp = t + (*vj)*5;
638: while (nz--) {
639: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
640: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
641: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
642: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
643: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
644: vj++; tp = t + (*vj)*5;
645: v += 25;
646: }
648: /* xk = inv(Dk)*(Dk*xk) */
649: diag = aa+k*25; /* ptr to inv(Dk) */
650: tp = t + k*5;
651: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
652: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
653: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
654: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
655: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
656: }
658: /* solve U*x = y by back substitution */
659: for (k=mbs-1; k>=0; k--){
660: v = aa + 25*ai[k];
661: vj = aj + ai[k];
662: tp = t + k*5;
663: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
664: nz = ai[k+1] - ai[k];
665:
666: tp = t + (*vj)*5;
667: while (nz--) {
668: /* xk += U(k,:)*x(:) */
669: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
670: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
671: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
672: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
673: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
674: vj++; tp = t + (*vj)*5;
675: v += 25;
676: }
677: tp = t + k*5;
678: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
679: idx = 5*r[k];
680: x[idx] = x0;
681: x[idx+1] = x1;
682: x[idx+2] = x2;
683: x[idx+3] = x3;
684: x[idx+4] = x4;
685: }
687: VecRestoreArray(bb,&b);
688: VecRestoreArray(xx,&x);
689: PetscLogFlops(25*(2*a->s_nz + mbs));
690: return(0);
691: }
695: int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
696: {
697: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
698: int mbs=a->mbs,*ai=a->i,*aj=a->j;
699: MatScalar *aa=a->a,*v,*diag;
700: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4;
701: int nz,*vj,k,ierr;
704:
705: VecGetArray(bb,&b);
706: VecGetArray(xx,&x);
708: /* solve U^T * D * y = b by forward substitution */
709: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
710: for (k=0; k<mbs; k++){
711: v = aa + 25*ai[k];
712: xp = x + k*5;
713: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
714: nz = ai[k+1] - ai[k];
715: vj = aj + ai[k];
716: xp = x + (*vj)*5;
717: while (nz--) {
718: /* x(:) += U(k,:)^T*(Dk*xk) */
719: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
720: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
721: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
722: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
723: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
724: vj++; xp = x + (*vj)*5;
725: v += 25;
726: }
727: /* xk = inv(Dk)*(Dk*xk) */
728: diag = aa+k*25; /* ptr to inv(Dk) */
729: xp = x + k*5;
730: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
731: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
732: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
733: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
734: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
735: }
737: /* solve U*x = y by back substitution */
738: for (k=mbs-1; k>=0; k--){
739: v = aa + 25*ai[k];
740: xp = x + k*5;
741: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
742: nz = ai[k+1] - ai[k];
743: vj = aj + ai[k];
744: xp = x + (*vj)*5;
745: while (nz--) {
746: /* xk += U(k,:)*x(:) */
747: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
748: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
749: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
750: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
751: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
752: vj++;
753: v += 25; xp = x + (*vj)*5;
754: }
755: xp = x + k*5;
756: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
757: }
759: VecRestoreArray(bb,&b);
760: VecRestoreArray(xx,&x);
761: PetscLogFlops(25*(2*a->s_nz + mbs));
762: return(0);
763: }
767: int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
768: {
769: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
770: IS isrow=a->row;
771: int mbs=a->mbs,*ai=a->i,*aj=a->j;
772: int nz,*vj,k,ierr,*r,idx;
773: MatScalar *aa=a->a,*v,*diag;
774: PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp;
777: VecGetArray(bb,&b);
778: VecGetArray(xx,&x);
779: t = a->solve_work;
780: ISGetIndices(isrow,&r);
782: /* solve U^T * D * y = b by forward substitution */
783: tp = t;
784: for (k=0; k<mbs; k++) { /* t <- perm(b) */
785: idx = 4*r[k];
786: tp[0] = b[idx];
787: tp[1] = b[idx+1];
788: tp[2] = b[idx+2];
789: tp[3] = b[idx+3];
790: tp += 4;
791: }
792:
793: for (k=0; k<mbs; k++){
794: v = aa + 16*ai[k];
795: vj = aj + ai[k];
796: tp = t + k*4;
797: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
798: nz = ai[k+1] - ai[k];
800: tp = t + (*vj)*4;
801: while (nz--) {
802: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
803: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
804: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
805: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
806: vj++; tp = t + (*vj)*4;
807: v += 16;
808: }
810: /* xk = inv(Dk)*(Dk*xk) */
811: diag = aa+k*16; /* ptr to inv(Dk) */
812: tp = t + k*4;
813: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
814: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
815: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
816: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
817: }
819: /* solve U*x = y by back substitution */
820: for (k=mbs-1; k>=0; k--){
821: v = aa + 16*ai[k];
822: vj = aj + ai[k];
823: tp = t + k*4;
824: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
825: nz = ai[k+1] - ai[k];
826:
827: tp = t + (*vj)*4;
828: while (nz--) {
829: /* xk += U(k,:)*x(:) */
830: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
831: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
832: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
833: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
834: vj++; tp = t + (*vj)*4;
835: v += 16;
836: }
837: tp = t + k*4;
838: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
839: idx = 4*r[k];
840: x[idx] = x0;
841: x[idx+1] = x1;
842: x[idx+2] = x2;
843: x[idx+3] = x3;
844: }
846: VecRestoreArray(bb,&b);
847: VecRestoreArray(xx,&x);
848: PetscLogFlops(16*(2*a->s_nz + mbs));
849: return(0);
850: }
852: /*
853: Special case where the matrix was factored in the natural ordering.
854: This eliminates the need for the column and row permutation.
855: */
858: int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
859: {
860: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
861: int mbs=a->mbs,*ai=a->i,*aj=a->j;
862: MatScalar *aa=a->a,*v,*diag;
863: PetscScalar *x,*xp,*b,x0,x1,x2,x3;
864: int nz,*vj,k,ierr;
867:
868: VecGetArray(bb,&b);
869: VecGetArray(xx,&x);
871: /* solve U^T * D * y = b by forward substitution */
872: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
873: for (k=0; k<mbs; k++){
874: v = aa + 16*ai[k];
875: xp = x + k*4;
876: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
877: nz = ai[k+1] - ai[k];
878: vj = aj + ai[k];
879: xp = x + (*vj)*4;
880: while (nz--) {
881: /* x(:) += U(k,:)^T*(Dk*xk) */
882: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
883: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
884: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
885: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
886: vj++; xp = x + (*vj)*4;
887: v += 16;
888: }
889: /* xk = inv(Dk)*(Dk*xk) */
890: diag = aa+k*16; /* ptr to inv(Dk) */
891: xp = x + k*4;
892: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
893: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
894: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
895: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
896: }
898: /* solve U*x = y by back substitution */
899: for (k=mbs-1; k>=0; k--){
900: v = aa + 16*ai[k];
901: xp = x + k*4;
902: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
903: nz = ai[k+1] - ai[k];
904: vj = aj + ai[k];
905: xp = x + (*vj)*4;
906: while (nz--) {
907: /* xk += U(k,:)*x(:) */
908: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
909: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
910: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
911: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
912: vj++;
913: v += 16; xp = x + (*vj)*4;
914: }
915: xp = x + k*4;
916: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
917: }
919: VecRestoreArray(bb,&b);
920: VecRestoreArray(xx,&x);
921: PetscLogFlops(16*(2*a->s_nz + mbs));
922: return(0);
923: }
927: int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
928: {
929: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
930: IS isrow=a->row;
931: int mbs=a->mbs,*ai=a->i,*aj=a->j;
932: int nz,*vj,k,ierr,*r,idx;
933: MatScalar *aa=a->a,*v,*diag;
934: PetscScalar *x,*b,x0,x1,x2,*t,*tp;
937: VecGetArray(bb,&b);
938: VecGetArray(xx,&x);
939: t = a->solve_work;
940: ISGetIndices(isrow,&r);
942: /* solve U^T * D * y = b by forward substitution */
943: tp = t;
944: for (k=0; k<mbs; k++) { /* t <- perm(b) */
945: idx = 3*r[k];
946: tp[0] = b[idx];
947: tp[1] = b[idx+1];
948: tp[2] = b[idx+2];
949: tp += 3;
950: }
951:
952: for (k=0; k<mbs; k++){
953: v = aa + 9*ai[k];
954: vj = aj + ai[k];
955: tp = t + k*3;
956: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
957: nz = ai[k+1] - ai[k];
959: tp = t + (*vj)*3;
960: while (nz--) {
961: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
962: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
963: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
964: vj++; tp = t + (*vj)*3;
965: v += 9;
966: }
968: /* xk = inv(Dk)*(Dk*xk) */
969: diag = aa+k*9; /* ptr to inv(Dk) */
970: tp = t + k*3;
971: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
972: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
973: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
974: }
976: /* solve U*x = y by back substitution */
977: for (k=mbs-1; k>=0; k--){
978: v = aa + 9*ai[k];
979: vj = aj + ai[k];
980: tp = t + k*3;
981: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
982: nz = ai[k+1] - ai[k];
983:
984: tp = t + (*vj)*3;
985: while (nz--) {
986: /* xk += U(k,:)*x(:) */
987: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
988: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
989: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
990: vj++; tp = t + (*vj)*3;
991: v += 9;
992: }
993: tp = t + k*3;
994: tp[0] = x0; tp[1] = x1; tp[2] = x2;
995: idx = 3*r[k];
996: x[idx] = x0;
997: x[idx+1] = x1;
998: x[idx+2] = x2;
999: }
1001: VecRestoreArray(bb,&b);
1002: VecRestoreArray(xx,&x);
1003: PetscLogFlops(9*(2*a->s_nz + mbs));
1004: return(0);
1005: }
1007: /*
1008: Special case where the matrix was factored in the natural ordering.
1009: This eliminates the need for the column and row permutation.
1010: */
1013: int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1014: {
1015: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1016: int mbs=a->mbs,*ai=a->i,*aj=a->j;
1017: MatScalar *aa=a->a,*v,*diag;
1018: PetscScalar *x,*xp,*b,x0,x1,x2;
1019: int nz,*vj,k,ierr;
1022:
1023: VecGetArray(bb,&b);
1024: VecGetArray(xx,&x);
1026: /* solve U^T * D * y = b by forward substitution */
1027: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1028: for (k=0; k<mbs; k++){
1029: v = aa + 9*ai[k];
1030: xp = x + k*3;
1031: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1032: nz = ai[k+1] - ai[k];
1033: vj = aj + ai[k];
1034: xp = x + (*vj)*3;
1035: while (nz--) {
1036: /* x(:) += U(k,:)^T*(Dk*xk) */
1037: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1038: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1039: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1040: vj++; xp = x + (*vj)*3;
1041: v += 9;
1042: }
1043: /* xk = inv(Dk)*(Dk*xk) */
1044: diag = aa+k*9; /* ptr to inv(Dk) */
1045: xp = x + k*3;
1046: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1047: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1048: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1049: }
1051: /* solve U*x = y by back substitution */
1052: for (k=mbs-1; k>=0; k--){
1053: v = aa + 9*ai[k];
1054: xp = x + k*3;
1055: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1056: nz = ai[k+1] - ai[k];
1057: vj = aj + ai[k];
1058: xp = x + (*vj)*3;
1059: while (nz--) {
1060: /* xk += U(k,:)*x(:) */
1061: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1062: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1063: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1064: vj++;
1065: v += 9; xp = x + (*vj)*3;
1066: }
1067: xp = x + k*3;
1068: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1069: }
1071: VecRestoreArray(bb,&b);
1072: VecRestoreArray(xx,&x);
1073: PetscLogFlops(9*(2*a->s_nz + mbs));
1074: return(0);
1075: }
1079: int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1080: {
1081: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data;
1082: IS isrow=a->row;
1083: int mbs=a->mbs,*ai=a->i,*aj=a->j;
1084: int nz,*vj,k,k2,ierr,*r,idx;
1085: MatScalar *aa=a->a,*v,*diag;
1086: PetscScalar *x,*b,x0,x1,*t;
1089: VecGetArray(bb,&b);
1090: VecGetArray(xx,&x);
1091: t = a->solve_work;
1092: /* printf("called MatSolve_SeqSBAIJ_2\n"); */
1093: ISGetIndices(isrow,&r);
1095: /* solve U^T * D * y = perm(b) by forward substitution */
1096: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1097: idx = 2*r[k];
1098: t[k*2] = b[idx];
1099: t[k*2+1] = b[idx+1];
1100: }
1101: for (k=0; k<mbs; k++){
1102: v = aa + 4*ai[k];
1103: vj = aj + ai[k];
1104: k2 = k*2;
1105: x0 = t[k2]; x1 = t[k2+1];
1106: nz = ai[k+1] - ai[k];
1107: while (nz--) {
1108: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1109: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1110: vj++; v += 4;
1111: }
1112: diag = aa+k*4; /* ptr to inv(Dk) */
1113: t[k2] = diag[0]*x0 + diag[2]*x1;
1114: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1115: }
1117: /* solve U*x = y by back substitution */
1118: for (k=mbs-1; k>=0; k--){
1119: v = aa + 4*ai[k];
1120: vj = aj + ai[k];
1121: k2 = k*2;
1122: x0 = t[k2]; x1 = t[k2+1];
1123: nz = ai[k+1] - ai[k];
1124: while (nz--) {
1125: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1126: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1127: vj++; v += 4;
1128: }
1129: t[k2] = x0;
1130: t[k2+1] = x1;
1131: idx = 2*r[k];
1132: x[idx] = x0;
1133: x[idx+1] = x1;
1134: }
1136: ISRestoreIndices(isrow,&r);
1137: VecRestoreArray(bb,&b);
1138: VecRestoreArray(xx,&x);
1139: PetscLogFlops(4*(2*a->s_nz + mbs));
1140: return(0);
1141: }
1143: /*
1144: Special case where the matrix was factored in the natural ordering.
1145: This eliminates the need for the column and row permutation.
1146: */
1149: int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1150: {
1151: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1152: int mbs=a->mbs,*ai=a->i,*aj=a->j;
1153: MatScalar *aa=a->a,*v,*diag;
1154: PetscScalar *x,*b,x0,x1;
1155: int nz,*vj,k,k2,ierr;
1158:
1159: VecGetArray(bb,&b);
1160: VecGetArray(xx,&x);
1162: /* solve U^T * D * y = b by forward substitution */
1163: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1164: for (k=0; k<mbs; k++){
1165: v = aa + 4*ai[k];
1166: vj = aj + ai[k];
1167: k2 = k*2;
1168: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1169: nz = ai[k+1] - ai[k];
1170:
1171: while (nz--) {
1172: /* x(:) += U(k,:)^T*(Dk*xk) */
1173: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1174: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1175: vj++; v += 4;
1176: }
1177: /* xk = inv(Dk)*(Dk*xk) */
1178: diag = aa+k*4; /* ptr to inv(Dk) */
1179: x[k2] = diag[0]*x0 + diag[2]*x1;
1180: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1181: }
1183: /* solve U*x = y by back substitution */
1184: for (k=mbs-1; k>=0; k--){
1185: v = aa + 4*ai[k];
1186: vj = aj + ai[k];
1187: k2 = k*2;
1188: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1189: nz = ai[k+1] - ai[k];
1190: while (nz--) {
1191: /* xk += U(k,:)*x(:) */
1192: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1193: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1194: vj++; v += 4;
1195: }
1196: x[k2] = x0;
1197: x[k2+1] = x1;
1198: }
1200: VecRestoreArray(bb,&b);
1201: VecRestoreArray(xx,&x);
1202: PetscLogFlops(4*(2*a->s_nz + mbs)); /* bs2*(2*a->s_nz + mbs) */
1203: return(0);
1204: }
1208: int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1209: {
1210: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1211: IS isrow=a->row;
1212: int mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip;
1213: MatScalar *aa=a->a,*v;
1214: PetscScalar *x,*b,xk,*t;
1215: int nz,*vj,k;
1218: if (!mbs) return(0);
1220: VecGetArray(bb,&b);
1221: VecGetArray(xx,&x);
1222: t = a->solve_work;
1224: ISGetIndices(isrow,&rip);
1225:
1226: /* solve U^T*D*y = perm(b) by forward substitution */
1227: for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1228: for (k=0; k<mbs; k++){
1229: v = aa + ai[k];
1230: vj = aj + ai[k];
1231: xk = t[k];
1232: nz = ai[k+1] - ai[k];
1233: while (nz--) t[*vj++] += (*v++) * xk;
1234: t[k] = xk*aa[k]; /* note: aa[k] = 1/D(k) */
1235: }
1237: /* solve U*x = y by back substitution */
1238: for (k=mbs-1; k>=0; k--){
1239: v = aa + ai[k];
1240: vj = aj + ai[k];
1241: xk = t[k];
1242: nz = ai[k+1] - ai[k];
1243: while (nz--) xk += (*v++) * t[*vj++];
1244: t[k] = xk;
1245: x[rip[k]] = xk;
1246: }
1248: ISRestoreIndices(isrow,&rip);
1249: VecRestoreArray(bb,&b);
1250: VecRestoreArray(xx,&x);
1251: PetscLogFlops(4*a->s_nz + A->m);
1252: return(0);
1253: }
1257: int MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1258: {
1259: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1260: int ierr;
1263: if (a->bs == 1) {
1264: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1265: } else {
1266: IS isrow=a->row;
1267: int mbs=a->mbs,*ai=a->i,*aj=a->j,*rip,i;
1268: MatScalar *aa=a->a,*v;
1269: PetscScalar *x,*b,*t;
1270: int nz,*vj,k,n;
1271: if (bb->n > a->solves_work_n) {
1272: if (a->solves_work) {PetscFree(a->solves_work);}
1273: PetscMalloc(bb->n*A->m*sizeof(PetscScalar),&a->solves_work);
1274: a->solves_work_n = bb->n;
1275: }
1276: n = bb->n;
1277: VecGetArray(bb->v,&b);
1278: VecGetArray(xx->v,&x);
1279: t = a->solves_work;
1281: ISGetIndices(isrow,&rip);
1282:
1283: /* solve U^T*D*y = perm(b) by forward substitution */
1284: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rip[k]+i*mbs];} /* values are stored interlaced in t */
1285: for (k=0; k<mbs; k++){
1286: v = aa + ai[k];
1287: vj = aj + ai[k];
1288: nz = ai[k+1] - ai[k];
1289: while (nz--) {
1290: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1291: v++;vj++;
1292: }
1293: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1294: }
1295:
1296: /* solve U*x = y by back substitution */
1297: for (k=mbs-1; k>=0; k--){
1298: v = aa + ai[k];
1299: vj = aj + ai[k];
1300: nz = ai[k+1] - ai[k];
1301: while (nz--) {
1302: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1303: v++;vj++;
1304: }
1305: for (i=0; i<n; i++) x[rip[k]+i*mbs] = t[n*k+i];
1306: }
1308: ISRestoreIndices(isrow,&rip);
1309: VecRestoreArray(bb->v,&b);
1310: VecRestoreArray(xx->v,&x);
1311: PetscLogFlops(bb->n*(4*a->s_nz + A->m));
1312: }
1313: return(0);
1314: }
1316: /*
1317: Special case where the matrix was ILU(0) factored in the natural
1318: ordering. This eliminates the need for the column and row permutation.
1319: */
1322: int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1323: {
1324: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1325: int mbs=a->mbs,*ai=a->i,*aj=a->j,ierr;
1326: MatScalar *aa=a->a,*v;
1327: PetscScalar *x,*b,xk;
1328: int nz,*vj,k;
1331: VecGetArray(bb,&b);
1332: VecGetArray(xx,&x);
1333:
1334: /* solve U^T*D*y = b by forward substitution */
1335: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1336: for (k=0; k<mbs; k++){
1337: v = aa + ai[k] + 1;
1338: vj = aj + ai[k] + 1;
1339: xk = x[k];
1340: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
1341: while (nz--) x[*vj++] += (*v++) * xk;
1342: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
1343: }
1345: /* solve U*x = y by back substitution */
1346: for (k=mbs-2; k>=0; k--){
1347: v = aa + ai[k] + 1;
1348: vj = aj + ai[k] + 1;
1349: xk = x[k];
1350: nz = ai[k+1] - ai[k] - 1;
1351: while (nz--) xk += (*v++) * x[*vj++];
1352: x[k] = xk;
1353: }
1355: VecRestoreArray(bb,&b);
1356: VecRestoreArray(xx,&x);
1357: PetscLogFlops(4*a->s_nz + A->m);
1358: return(0);
1359: }
1361: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1364: int MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
1365: {
1366: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1367: int *rip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1368: int *jutmp,bs = a->bs,bs2=a->bs2;
1369: int m,realloc = 0,*levtmp;
1370: int *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd,*jl;
1371: int incrlev,*lev,shift,prow,nz;
1372: int *il,ili,nextprow;
1373: PetscReal f = info->fill,levels = info->levels;
1374: PetscTruth perm_identity;
1377: /* check whether perm is the identity mapping */
1378: ISIdentity(perm,&perm_identity);
1380: /* special case that simply copies fill pattern */
1381: if (!levels && perm_identity && bs==1) {
1382: MatDuplicate_SeqSBAIJ(A,MAT_DO_NOT_COPY_VALUES,B);
1383: (*B)->factor = FACTOR_CHOLESKY;
1384: b = (Mat_SeqSBAIJ*)(*B)->data;
1385: b->row = perm;
1386: b->icol = perm;
1387: b->factor_damping = info->damping;
1388: b->factor_shift = info->shift;
1389: b->factor_zeropivot = info->zeropivot;
1390: PetscObjectReference((PetscObject)perm);
1391: PetscObjectReference((PetscObject)perm);
1392: PetscMalloc(((*B)->m+1)*sizeof(PetscScalar),&b->solve_work);
1393: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1394: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1395: return(0);
1396: }
1398: /* --- inplace icc(levels), levels>0, ie, *B has same data structure as A --- */
1399: if (levels > 0 && perm_identity && bs==1 ){
1400: if (!perm_identity) a->permute = PETSC_TRUE;
1401:
1402: ISGetIndices(perm,&rip);
1403:
1404: if (perm_identity){ /* without permutation */
1405: ai = a->i; aj = a->j;
1406: } else { /* non-trivial permutation */
1407: MatReorderingSeqSBAIJ(A,perm);
1408: ai = a->inew; aj = a->jnew;
1409: }
1410:
1411: /* initialization */
1412: PetscMalloc((mbs+1)*sizeof(int),&iu);
1413: umax = (int)(f*ai[mbs] + 1);
1414: PetscMalloc(umax*sizeof(int),&lev);
1415: PetscMalloc(umax*sizeof(int),&ju);
1416: iu[0] = 0;
1417: juidx = 0; /* index for ju */
1418: PetscMalloc((4*mbs+1)*sizeof(int),&jl); /* linked list for getting pivot row */
1419: q = jl + mbs; /* linked list for col index of active row */
1420: levtmp = q + mbs;
1421: il = levtmp + mbs;
1422: for (i=0; i<mbs; i++){
1423: jl[i] = mbs;
1424: q[i] = 0;
1425: il[i] = 0;
1426: }
1428: /* for each row k */
1429: for (k=0; k<mbs; k++){
1430: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
1431: q[k] = mbs;
1432: /* initialize nonzero structure of k-th row to row rip[k] of A */
1433: jmin = ai[rip[k]] +1; /* exclude diag[k] */
1434: jmax = ai[rip[k]+1];
1435: for (j=jmin; j<jmax; j++){
1436: vj = rip[aj[j]]; /* col. value */
1437: if(vj > k){
1438: qm = k;
1439: do {
1440: m = qm; qm = q[m];
1441: } while(qm < vj);
1442: if (qm == vj) {
1443: SETERRQ(1," error: duplicate entry in A\n");
1444: }
1445: nzk++;
1446: q[m] = vj;
1447: q[vj] = qm;
1448: levtmp[vj] = 0; /* initialize lev for nonzero element */
1449: } /* if(vj > k) */
1450: } /* for (j=jmin; j<jmax; j++) */
1452: /* modify nonzero structure of k-th row by computing fill-in
1453: for each row i to be merged in */
1454: prow = k;
1455: prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
1456:
1457: while (prow < k){
1458: nextprow = jl[prow];
1459: /* merge row prow into k-th row */
1460: ili = il[prow];
1461: jmin = ili + 1; /* points to 2nd nzero entry in U(prow,k:mbs-1) */
1462: jmax = iu[prow+1];
1463: qm = k;
1464: for (j=jmin; j<jmax; j++){
1465: vj = ju[j];
1466: incrlev = lev[j] + 1;
1467: if (incrlev > levels) continue;
1468: do {
1469: m = qm; qm = q[m];
1470: } while (qm < vj);
1471: if (qm != vj){ /* a fill */
1472: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1473: levtmp[vj] = incrlev;
1474: } else {
1475: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1476: }
1477: }
1478: if (jmin < jmax){ /* update il and jl */
1479: il[prow] = jmin;
1480: j = ju[jmin];
1481: jl[prow] = jl[j]; jl[j] = prow;
1482: }
1483: prow = nextprow;
1484: }
1485:
1486: /* add the first nonzero element in U(k, k+1:mbs-1) to jl */
1487: if (nzk > 0){
1488: i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1489: jl[k] = jl[i]; jl[i] = k;
1490: il[k] = iu[k] + 1;
1491: }
1492: iu[k+1] = iu[k] + nzk + 1; /* include diag[k] */
1494: /* allocate more space to ju if needed */
1495: if (iu[k+1] > umax) {
1496: /* estimate how much additional space we will need */
1497: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1498: /* just double the memory each time */
1499: maxadd = umax;
1500: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1501: umax += maxadd;
1503: /* allocate a longer ju */
1504: PetscMalloc(umax*sizeof(int),&jutmp);
1505: PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1506: PetscFree(ju);
1507: ju = jutmp;
1509: PetscMalloc(umax*sizeof(int),&jutmp);
1510: PetscMemcpy(jutmp,lev,(iu[k])*sizeof(int));
1511: PetscFree(lev);
1512: lev = jutmp;
1513: realloc += 2; /* count how many times we realloc */
1514: }
1516: /* save nonzero structure of k-th row in ju */
1517: ju[juidx] = k; /* diag[k] */
1518: lev[juidx] = 0;
1519: juidx++;
1520: i = k;
1521: while (nzk --) {
1522: i = q[i];
1523: ju[juidx] = i;
1524: lev[juidx] = levtmp[i];
1525: juidx++;
1526: }
1527: } /* end of for (k=0; k<mbs; k++) */
1529: if (ai[mbs] != 0) {
1530: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1531: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1532: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
1533: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
1534: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
1535: } else {
1536: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1537: }
1539: ISRestoreIndices(perm,&rip);
1540: PetscFree(jl);
1541: PetscFree(lev);
1543: /* put together the new matrix */
1544: MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);
1545: /* PetscLogObjectParent(*B,iperm); */
1546: b = (Mat_SeqSBAIJ*)(*B)->data;
1547: PetscFree(b->imax);
1548: b->singlemalloc = PETSC_FALSE;
1549: /* the next line frees the default space generated by the Create() */
1550: PetscFree(b->a);
1551: PetscFree(b->ilen);
1552: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1553: b->j = ju;
1554: b->i = iu;
1555: b->diag = 0;
1556: b->ilen = 0;
1557: b->imax = 0;
1558: b->row = perm;
1559: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1560: PetscObjectReference((PetscObject)perm);
1561: b->icol = perm;
1562: PetscObjectReference((PetscObject)perm);
1563: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1564: /* In b structure: Free imax, ilen, old a, old j.
1565: Allocate idnew, solve_work, new a, new j */
1566: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1567: b->s_maxnz = b->s_nz = iu[mbs];
1568: b->factor_damping = info->damping;
1569: b->factor_shift = info->shift;
1570: b->factor_zeropivot = info->zeropivot;
1572: (*B)->factor = FACTOR_CHOLESKY;
1573: (*B)->info.factor_mallocs = realloc;
1574: (*B)->info.fill_ratio_given = f;
1575: if (ai[mbs] != 0) {
1576: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1577: } else {
1578: (*B)->info.fill_ratio_needed = 0.0;
1579: }
1582: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1583: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1584: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
1585:
1586: return(0);
1587: } /* end of if (levels > 0 && perm_identity && bs==1 ) */
1589: if (!perm_identity) a->permute = PETSC_TRUE;
1590: if (perm_identity){
1591: ai = a->i; aj = a->j;
1592: } else { /* non-trivial permutation */
1593: MatReorderingSeqSBAIJ(A, perm);
1594: ai = a->inew; aj = a->jnew;
1595: }
1596:
1597: /* initialization */
1598: ISGetIndices(perm,&rip);
1599: umax = (int)(f*ai[mbs] + 1);
1600: PetscMalloc(umax*sizeof(int),&lev);
1601: umax += mbs + 1;
1602: shift = mbs + 1;
1603: PetscMalloc((mbs+1)*sizeof(int),&iu);
1604: PetscMalloc(umax*sizeof(int),&ju);
1605: iu[0] = mbs + 1;
1606: juidx = mbs + 1;
1607: /* prowl: linked list for pivot row */
1608: PetscMalloc((3*mbs+1)*sizeof(int),&prowl);
1609: /* q: linked list for col index */
1610: q = prowl + mbs;
1611: levtmp = q + mbs;
1612:
1613: for (i=0; i<mbs; i++){
1614: prowl[i] = mbs;
1615: q[i] = 0;
1616: }
1618: /* for each row k */
1619: for (k=0; k<mbs; k++){
1620: nzk = 0;
1621: q[k] = mbs;
1622: /* copy current row into linked list */
1623: nz = ai[rip[k]+1] - ai[rip[k]];
1624: j = ai[rip[k]];
1625: while (nz--){
1626: vj = rip[aj[j++]];
1627: if (vj > k){
1628: qm = k;
1629: do {
1630: m = qm; qm = q[m];
1631: } while(qm < vj);
1632: if (qm == vj) {
1633: SETERRQ(1," error: duplicate entry in A\n");
1634: }
1635: nzk++;
1636: q[m] = vj;
1637: q[vj] = qm;
1638: levtmp[vj] = 0; /* initialize lev for nonzero element */
1639: }
1640: }
1642: /* modify nonzero structure of k-th row by computing fill-in
1643: for each row prow to be merged in */
1644: prow = k;
1645: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1646:
1647: while (prow < k){
1648: /* merge row prow into k-th row */
1649: jmin = iu[prow] + 1;
1650: jmax = iu[prow+1];
1651: qm = k;
1652: for (j=jmin; j<jmax; j++){
1653: incrlev = lev[j-shift] + 1;
1654: if (incrlev > levels) continue;
1656: vj = ju[j];
1657: do {
1658: m = qm; qm = q[m];
1659: } while (qm < vj);
1660: if (qm != vj){ /* a fill */
1661: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1662: levtmp[vj] = incrlev;
1663: } else {
1664: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1665: }
1666: }
1667: prow = prowl[prow]; /* next pivot row */
1668: }
1669:
1670: /* add k to row list for first nonzero element in k-th row */
1671: if (nzk > 1){
1672: i = q[k]; /* col value of first nonzero element in k_th row of U */
1673: prowl[k] = prowl[i]; prowl[i] = k;
1674: }
1675: iu[k+1] = iu[k] + nzk;
1677: /* allocate more space to ju and lev if needed */
1678: if (iu[k+1] > umax) {
1679: /* estimate how much additional space we will need */
1680: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1681: /* just double the memory each time */
1682: maxadd = umax;
1683: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1684: umax += maxadd;
1686: /* allocate a longer ju */
1687: PetscMalloc(umax*sizeof(int),&jutmp);
1688: PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1689: PetscFree(ju);
1690: ju = jutmp;
1692: PetscMalloc(umax*sizeof(int),&jutmp);
1693: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));
1694: PetscFree(lev);
1695: lev = jutmp;
1696: realloc += 2; /* count how many times we realloc */
1697: }
1699: /* save nonzero structure of k-th row in ju */
1700: i=k;
1701: while (nzk --) {
1702: i = q[i];
1703: ju[juidx] = i;
1704: lev[juidx-shift] = levtmp[i];
1705: juidx++;
1706: }
1707: }
1708:
1709: if (ai[mbs] != 0) {
1710: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1711: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1712: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Run with -pc_icc_fill %g or use \n",af);
1713: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:PCICCSetFill(pc,%g);\n",af);
1714: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:for best performance.\n");
1715: } else {
1716: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1717: }
1719: ISRestoreIndices(perm,&rip);
1720: PetscFree(prowl);
1721: PetscFree(lev);
1723: /* put together the new matrix */
1724: MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);
1725: /* PetscLogObjectParent(*B,iperm); */
1726: b = (Mat_SeqSBAIJ*)(*B)->data;
1727: PetscFree(b->imax);
1728: b->singlemalloc = PETSC_FALSE;
1729: /* the next line frees the default space generated by the Create() */
1730: PetscFree(b->a);
1731: PetscFree(b->ilen);
1732: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1733: b->j = ju;
1734: b->i = iu;
1735: b->diag = 0;
1736: b->ilen = 0;
1737: b->imax = 0;
1738:
1739: if (b->row) {
1740: ISDestroy(b->row);
1741: }
1742: if (b->icol) {
1743: ISDestroy(b->icol);
1744: }
1745: b->row = perm;
1746: b->icol = perm;
1747: PetscObjectReference((PetscObject)perm);
1748: PetscObjectReference((PetscObject)perm);
1749: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1750: /* In b structure: Free imax, ilen, old a, old j.
1751: Allocate idnew, solve_work, new a, new j */
1752: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1753: b->s_maxnz = b->s_nz = iu[mbs];
1754:
1755: (*B)->factor = FACTOR_CHOLESKY;
1756: (*B)->info.factor_mallocs = realloc;
1757: (*B)->info.fill_ratio_given = f;
1758: if (ai[mbs] != 0) {
1759: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1760: } else {
1761: (*B)->info.fill_ratio_needed = 0.0;
1762: }
1764: if (perm_identity){
1765: switch (bs) {
1766: case 1:
1767: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1768: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1769: (*B)->ops->solves = MatSolves_SeqSBAIJ_1;
1770: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJl:Using special in-place natural ordering factor and solve BS=1\n");
1771: break;
1772: case 2:
1773: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1774: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1775: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
1776: break;
1777: case 3:
1778: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1779: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1780: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
1781: break;
1782: case 4:
1783: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1784: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1785: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
1786: break;
1787: case 5:
1788: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1789: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1790: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
1791: break;
1792: case 6:
1793: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1794: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1795: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
1796: break;
1797: case 7:
1798: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1799: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1800: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
1801: break;
1802: default:
1803: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1804: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1805: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
1806: break;
1807: }
1808: }
1810: return(0);
1811: }