Actual source code: sbaijfact2.c
1: /*
2: Factorization code for SBAIJ format.
3: */
5: #include src/mat/impls/sbaij/seq/sbaij.h
6: #include src/mat/impls/baij/seq/baij.h
7: #include src/inline/ilu.h
8: #include src/inline/dot.h
12: PetscErrorCode MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
13: {
14: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
15: IS isrow=a->row;
16: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
17: PetscErrorCode ierr;
18: PetscInt nz,*vj,k,*r,idx,k1;
19: PetscInt bs=A->bs,bs2 = a->bs2;
20: MatScalar *aa=a->a,*v,*diag;
21: PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t;
24: VecGetArray(bb,&b);
25: VecGetArray(xx,&x);
26: t = a->solve_work;
27: ISGetIndices(isrow,&r);
28: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
30: /* solve U^T * D * y = b by forward substitution */
31: xk = t;
32: for (k=0; k<mbs; k++) { /* t <- perm(b) */
33: idx = bs*r[k];
34: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
35: }
36: for (k=0; k<mbs; k++){
37: v = aa + bs2*ai[k];
38: xk = t + k*bs; /* Dk*xk = k-th block of x */
39: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
40: nz = ai[k+1] - ai[k];
41: vj = aj + ai[k];
42: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
43: while (nz--) {
44: /* x(:) += U(k,:)^T*(Dk*xk) */
45: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
46: vj++; xj = t + (*vj)*bs;
47: v += bs2;
48: }
49: /* xk = inv(Dk)*(Dk*xk) */
50: diag = aa+k*bs2; /* ptr to inv(Dk) */
51: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
52: }
54: /* solve U*x = y by back substitution */
55: for (k=mbs-1; k>=0; k--){
56: v = aa + bs2*ai[k];
57: xk = t + k*bs; /* xk */
58: nz = ai[k+1] - ai[k];
59: vj = aj + ai[k];
60: xj = t + (*vj)*bs;
61: while (nz--) {
62: /* xk += U(k,:)*x(:) */
63: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
64: vj++;
65: v += bs2; xj = t + (*vj)*bs;
66: }
67: idx = bs*r[k];
68: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
69: }
71: PetscFree(xk_tmp);
72: ISRestoreIndices(isrow,&r);
73: VecRestoreArray(bb,&b);
74: VecRestoreArray(xx,&x);
75: PetscLogFlops(bs2*(2*a->nz + mbs));
76: return(0);
77: }
81: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
82: {
83: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
85: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
86: PetscInt nz,*vj,k;
87: PetscInt bs=A->bs,bs2 = a->bs2;
88: MatScalar *aa=a->a,*v,*diag;
89: PetscScalar *x,*xk,*xj,*b,*xk_tmp;
92:
93: VecGetArray(bb,&b);
94: VecGetArray(xx,&x);
96: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
98: /* solve U^T * D * y = b by forward substitution */
99: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
100: for (k=0; k<mbs; k++){
101: v = aa + bs2*ai[k];
102: xk = x + k*bs; /* Dk*xk = k-th block of x */
103: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
104: nz = ai[k+1] - ai[k];
105: vj = aj + ai[k];
106: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
107: while (nz--) {
108: /* x(:) += U(k,:)^T*(Dk*xk) */
109: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
110: vj++; xj = x + (*vj)*bs;
111: v += bs2;
112: }
113: /* xk = inv(Dk)*(Dk*xk) */
114: diag = aa+k*bs2; /* ptr to inv(Dk) */
115: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
116: }
118: /* solve U*x = y by back substitution */
119: for (k=mbs-1; k>=0; k--){
120: v = aa + bs2*ai[k];
121: xk = x + k*bs; /* xk */
122: nz = ai[k+1] - ai[k];
123: vj = aj + ai[k];
124: xj = x + (*vj)*bs;
125: while (nz--) {
126: /* xk += U(k,:)*x(:) */
127: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
128: vj++;
129: v += bs2; xj = x + (*vj)*bs;
130: }
131: }
133: PetscFree(xk_tmp);
134: VecRestoreArray(bb,&b);
135: VecRestoreArray(xx,&x);
136: PetscLogFlops(bs2*(2*a->nz + mbs));
137: return(0);
138: }
142: PetscErrorCode MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
143: {
144: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
145: IS isrow=a->row;
146: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
148: PetscInt nz,*vj,k,*r,idx;
149: MatScalar *aa=a->a,*v,*d;
150: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
153: VecGetArray(bb,&b);
154: VecGetArray(xx,&x);
155: t = a->solve_work;
156: ISGetIndices(isrow,&r);
158: /* solve U^T * D * y = b by forward substitution */
159: tp = t;
160: for (k=0; k<mbs; k++) { /* t <- perm(b) */
161: idx = 7*r[k];
162: tp[0] = b[idx];
163: tp[1] = b[idx+1];
164: tp[2] = b[idx+2];
165: tp[3] = b[idx+3];
166: tp[4] = b[idx+4];
167: tp[5] = b[idx+5];
168: tp[6] = b[idx+6];
169: tp += 7;
170: }
171:
172: for (k=0; k<mbs; k++){
173: v = aa + 49*ai[k];
174: vj = aj + ai[k];
175: tp = t + k*7;
176: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
177: nz = ai[k+1] - ai[k];
178: tp = t + (*vj)*7;
179: while (nz--) {
180: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
181: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
182: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
183: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
184: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
185: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
186: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
187: vj++; tp = t + (*vj)*7;
188: v += 49;
189: }
191: /* xk = inv(Dk)*(Dk*xk) */
192: d = aa+k*49; /* ptr to inv(Dk) */
193: tp = t + k*7;
194: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
195: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
196: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
197: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
198: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
199: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
200: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
201: }
203: /* solve U*x = y by back substitution */
204: for (k=mbs-1; k>=0; k--){
205: v = aa + 49*ai[k];
206: vj = aj + ai[k];
207: tp = t + k*7;
208: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
209: nz = ai[k+1] - ai[k];
210:
211: tp = t + (*vj)*7;
212: while (nz--) {
213: /* xk += U(k,:)*x(:) */
214: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
215: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
216: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
217: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
218: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
219: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
220: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
221: vj++; tp = t + (*vj)*7;
222: v += 49;
223: }
224: tp = t + k*7;
225: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
226: idx = 7*r[k];
227: x[idx] = x0;
228: x[idx+1] = x1;
229: x[idx+2] = x2;
230: x[idx+3] = x3;
231: x[idx+4] = x4;
232: x[idx+5] = x5;
233: x[idx+6] = x6;
234: }
236: ISRestoreIndices(isrow,&r);
237: VecRestoreArray(bb,&b);
238: VecRestoreArray(xx,&x);
239: PetscLogFlops(49*(2*a->nz + mbs));
240: return(0);
241: }
245: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
246: {
247: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
249: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
250: MatScalar *aa=a->a,*v,*d;
251: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
252: PetscInt nz,*vj,k;
255: VecGetArray(bb,&b);
256: VecGetArray(xx,&x);
257:
258: /* solve U^T * D * y = b by forward substitution */
259: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
260: for (k=0; k<mbs; k++){
261: v = aa + 49*ai[k];
262: xp = x + k*7;
263: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
264: nz = ai[k+1] - ai[k];
265: vj = aj + ai[k];
266: xp = x + (*vj)*7;
267: while (nz--) {
268: /* x(:) += U(k,:)^T*(Dk*xk) */
269: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
270: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
271: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
272: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
273: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
274: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
275: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
276: vj++; xp = x + (*vj)*7;
277: v += 49;
278: }
279: /* xk = inv(Dk)*(Dk*xk) */
280: d = aa+k*49; /* ptr to inv(Dk) */
281: xp = x + k*7;
282: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
283: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
284: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
285: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
286: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
287: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
288: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
289: }
291: /* solve U*x = y by back substitution */
292: for (k=mbs-1; k>=0; k--){
293: v = aa + 49*ai[k];
294: xp = x + k*7;
295: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
296: nz = ai[k+1] - ai[k];
297: vj = aj + ai[k];
298: xp = x + (*vj)*7;
299: while (nz--) {
300: /* xk += U(k,:)*x(:) */
301: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
302: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
303: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
304: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
305: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
306: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
307: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
308: vj++;
309: v += 49; xp = x + (*vj)*7;
310: }
311: xp = x + k*7;
312: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
313: }
315: VecRestoreArray(bb,&b);
316: VecRestoreArray(xx,&x);
317: PetscLogFlops(49*(2*a->nz + mbs));
318: return(0);
319: }
323: PetscErrorCode MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
324: {
325: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
326: IS isrow=a->row;
327: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
329: PetscInt nz,*vj,k,*r,idx;
330: MatScalar *aa=a->a,*v,*d;
331: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
334: VecGetArray(bb,&b);
335: VecGetArray(xx,&x);
336: t = a->solve_work;
337: ISGetIndices(isrow,&r);
339: /* solve U^T * D * y = b by forward substitution */
340: tp = t;
341: for (k=0; k<mbs; k++) { /* t <- perm(b) */
342: idx = 6*r[k];
343: tp[0] = b[idx];
344: tp[1] = b[idx+1];
345: tp[2] = b[idx+2];
346: tp[3] = b[idx+3];
347: tp[4] = b[idx+4];
348: tp[5] = b[idx+5];
349: tp += 6;
350: }
351:
352: for (k=0; k<mbs; k++){
353: v = aa + 36*ai[k];
354: vj = aj + ai[k];
355: tp = t + k*6;
356: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
357: nz = ai[k+1] - ai[k];
358: tp = t + (*vj)*6;
359: while (nz--) {
360: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
361: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
362: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
363: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
364: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
365: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
366: vj++; tp = t + (*vj)*6;
367: v += 36;
368: }
370: /* xk = inv(Dk)*(Dk*xk) */
371: d = aa+k*36; /* ptr to inv(Dk) */
372: tp = t + k*6;
373: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
374: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
375: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
376: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
377: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
378: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
379: }
381: /* solve U*x = y by back substitution */
382: for (k=mbs-1; k>=0; k--){
383: v = aa + 36*ai[k];
384: vj = aj + ai[k];
385: tp = t + k*6;
386: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
387: nz = ai[k+1] - ai[k];
388:
389: tp = t + (*vj)*6;
390: while (nz--) {
391: /* xk += U(k,:)*x(:) */
392: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
393: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
394: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
395: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
396: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
397: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
398: vj++; tp = t + (*vj)*6;
399: v += 36;
400: }
401: tp = t + k*6;
402: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
403: idx = 6*r[k];
404: x[idx] = x0;
405: x[idx+1] = x1;
406: x[idx+2] = x2;
407: x[idx+3] = x3;
408: x[idx+4] = x4;
409: x[idx+5] = x5;
410: }
412: ISRestoreIndices(isrow,&r);
413: VecRestoreArray(bb,&b);
414: VecRestoreArray(xx,&x);
415: PetscLogFlops(36*(2*a->nz + mbs));
416: return(0);
417: }
421: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
422: {
423: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
424: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
425: MatScalar *aa=a->a,*v,*d;
426: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4,x5;
428: PetscInt nz,*vj,k;
431:
432: VecGetArray(bb,&b);
433: VecGetArray(xx,&x);
434:
435: /* solve U^T * D * y = b by forward substitution */
436: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
437: for (k=0; k<mbs; k++){
438: v = aa + 36*ai[k];
439: xp = x + k*6;
440: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
441: nz = ai[k+1] - ai[k];
442: vj = aj + ai[k];
443: xp = x + (*vj)*6;
444: while (nz--) {
445: /* x(:) += U(k,:)^T*(Dk*xk) */
446: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
447: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
448: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
449: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
450: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
451: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
452: vj++; xp = x + (*vj)*6;
453: v += 36;
454: }
455: /* xk = inv(Dk)*(Dk*xk) */
456: d = aa+k*36; /* ptr to inv(Dk) */
457: xp = x + k*6;
458: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
459: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
460: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
461: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
462: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
463: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
464: }
466: /* solve U*x = y by back substitution */
467: for (k=mbs-1; k>=0; k--){
468: v = aa + 36*ai[k];
469: xp = x + k*6;
470: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
471: nz = ai[k+1] - ai[k];
472: vj = aj + ai[k];
473: xp = x + (*vj)*6;
474: while (nz--) {
475: /* xk += U(k,:)*x(:) */
476: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
477: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
478: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
479: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
480: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
481: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
482: vj++;
483: v += 36; xp = x + (*vj)*6;
484: }
485: xp = x + k*6;
486: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
487: }
489: VecRestoreArray(bb,&b);
490: VecRestoreArray(xx,&x);
491: PetscLogFlops(36*(2*a->nz + mbs));
492: return(0);
493: }
497: PetscErrorCode MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
498: {
499: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
500: IS isrow=a->row;
501: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
503: PetscInt nz,*vj,k,*r,idx;
504: MatScalar *aa=a->a,*v,*diag;
505: PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp;
508: VecGetArray(bb,&b);
509: VecGetArray(xx,&x);
510: t = a->solve_work;
511: ISGetIndices(isrow,&r);
513: /* solve U^T * D * y = b by forward substitution */
514: tp = t;
515: for (k=0; k<mbs; k++) { /* t <- perm(b) */
516: idx = 5*r[k];
517: tp[0] = b[idx];
518: tp[1] = b[idx+1];
519: tp[2] = b[idx+2];
520: tp[3] = b[idx+3];
521: tp[4] = b[idx+4];
522: tp += 5;
523: }
524:
525: for (k=0; k<mbs; k++){
526: v = aa + 25*ai[k];
527: vj = aj + ai[k];
528: tp = t + k*5;
529: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
530: nz = ai[k+1] - ai[k];
532: tp = t + (*vj)*5;
533: while (nz--) {
534: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
535: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
536: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
537: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
538: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
539: vj++; tp = t + (*vj)*5;
540: v += 25;
541: }
543: /* xk = inv(Dk)*(Dk*xk) */
544: diag = aa+k*25; /* ptr to inv(Dk) */
545: tp = t + k*5;
546: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
547: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
548: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
549: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
550: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
551: }
553: /* solve U*x = y by back substitution */
554: for (k=mbs-1; k>=0; k--){
555: v = aa + 25*ai[k];
556: vj = aj + ai[k];
557: tp = t + k*5;
558: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
559: nz = ai[k+1] - ai[k];
560:
561: tp = t + (*vj)*5;
562: while (nz--) {
563: /* xk += U(k,:)*x(:) */
564: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
565: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
566: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
567: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
568: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
569: vj++; tp = t + (*vj)*5;
570: v += 25;
571: }
572: tp = t + k*5;
573: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
574: idx = 5*r[k];
575: x[idx] = x0;
576: x[idx+1] = x1;
577: x[idx+2] = x2;
578: x[idx+3] = x3;
579: x[idx+4] = x4;
580: }
582: ISRestoreIndices(isrow,&r);
583: VecRestoreArray(bb,&b);
584: VecRestoreArray(xx,&x);
585: PetscLogFlops(25*(2*a->nz + mbs));
586: return(0);
587: }
591: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
592: {
593: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
594: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
595: MatScalar *aa=a->a,*v,*diag;
596: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4;
598: PetscInt nz,*vj,k;
601:
602: VecGetArray(bb,&b);
603: VecGetArray(xx,&x);
605: /* solve U^T * D * y = b by forward substitution */
606: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
607: for (k=0; k<mbs; k++){
608: v = aa + 25*ai[k];
609: xp = x + k*5;
610: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
611: nz = ai[k+1] - ai[k];
612: vj = aj + ai[k];
613: xp = x + (*vj)*5;
614: while (nz--) {
615: /* x(:) += U(k,:)^T*(Dk*xk) */
616: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
617: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
618: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
619: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
620: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
621: vj++; xp = x + (*vj)*5;
622: v += 25;
623: }
624: /* xk = inv(Dk)*(Dk*xk) */
625: diag = aa+k*25; /* ptr to inv(Dk) */
626: xp = x + k*5;
627: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
628: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
629: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
630: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
631: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
632: }
634: /* solve U*x = y by back substitution */
635: for (k=mbs-1; k>=0; k--){
636: v = aa + 25*ai[k];
637: xp = x + k*5;
638: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
639: nz = ai[k+1] - ai[k];
640: vj = aj + ai[k];
641: xp = x + (*vj)*5;
642: while (nz--) {
643: /* xk += U(k,:)*x(:) */
644: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
645: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
646: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
647: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
648: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
649: vj++;
650: v += 25; xp = x + (*vj)*5;
651: }
652: xp = x + k*5;
653: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
654: }
656: VecRestoreArray(bb,&b);
657: VecRestoreArray(xx,&x);
658: PetscLogFlops(25*(2*a->nz + mbs));
659: return(0);
660: }
664: PetscErrorCode MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
665: {
666: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
667: IS isrow=a->row;
668: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
670: PetscInt nz,*vj,k,*r,idx;
671: MatScalar *aa=a->a,*v,*diag;
672: PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp;
675: VecGetArray(bb,&b);
676: VecGetArray(xx,&x);
677: t = a->solve_work;
678: ISGetIndices(isrow,&r);
680: /* solve U^T * D * y = b by forward substitution */
681: tp = t;
682: for (k=0; k<mbs; k++) { /* t <- perm(b) */
683: idx = 4*r[k];
684: tp[0] = b[idx];
685: tp[1] = b[idx+1];
686: tp[2] = b[idx+2];
687: tp[3] = b[idx+3];
688: tp += 4;
689: }
690:
691: for (k=0; k<mbs; k++){
692: v = aa + 16*ai[k];
693: vj = aj + ai[k];
694: tp = t + k*4;
695: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
696: nz = ai[k+1] - ai[k];
698: tp = t + (*vj)*4;
699: while (nz--) {
700: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
701: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
702: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
703: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
704: vj++; tp = t + (*vj)*4;
705: v += 16;
706: }
708: /* xk = inv(Dk)*(Dk*xk) */
709: diag = aa+k*16; /* ptr to inv(Dk) */
710: tp = t + k*4;
711: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
712: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
713: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
714: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
715: }
717: /* solve U*x = y by back substitution */
718: for (k=mbs-1; k>=0; k--){
719: v = aa + 16*ai[k];
720: vj = aj + ai[k];
721: tp = t + k*4;
722: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
723: nz = ai[k+1] - ai[k];
724:
725: tp = t + (*vj)*4;
726: while (nz--) {
727: /* xk += U(k,:)*x(:) */
728: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
729: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
730: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
731: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
732: vj++; tp = t + (*vj)*4;
733: v += 16;
734: }
735: tp = t + k*4;
736: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
737: idx = 4*r[k];
738: x[idx] = x0;
739: x[idx+1] = x1;
740: x[idx+2] = x2;
741: x[idx+3] = x3;
742: }
744: ISRestoreIndices(isrow,&r);
745: VecRestoreArray(bb,&b);
746: VecRestoreArray(xx,&x);
747: PetscLogFlops(16*(2*a->nz + mbs));
748: return(0);
749: }
751: /*
752: Special case where the matrix was factored in the natural ordering.
753: This eliminates the need for the column and row permutation.
754: */
757: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
758: {
759: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
760: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
761: MatScalar *aa=a->a,*v,*diag;
762: PetscScalar *x,*xp,*b,x0,x1,x2,x3;
764: PetscInt nz,*vj,k;
767:
768: VecGetArray(bb,&b);
769: VecGetArray(xx,&x);
771: /* solve U^T * D * y = b by forward substitution */
772: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
773: for (k=0; k<mbs; k++){
774: v = aa + 16*ai[k];
775: xp = x + k*4;
776: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
777: nz = ai[k+1] - ai[k];
778: vj = aj + ai[k];
779: xp = x + (*vj)*4;
780: while (nz--) {
781: /* x(:) += U(k,:)^T*(Dk*xk) */
782: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
783: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
784: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
785: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
786: vj++; xp = x + (*vj)*4;
787: v += 16;
788: }
789: /* xk = inv(Dk)*(Dk*xk) */
790: diag = aa+k*16; /* ptr to inv(Dk) */
791: xp = x + k*4;
792: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
793: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
794: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
795: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
796: }
798: /* solve U*x = y by back substitution */
799: for (k=mbs-1; k>=0; k--){
800: v = aa + 16*ai[k];
801: xp = x + k*4;
802: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
803: nz = ai[k+1] - ai[k];
804: vj = aj + ai[k];
805: xp = x + (*vj)*4;
806: while (nz--) {
807: /* xk += U(k,:)*x(:) */
808: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
809: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
810: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
811: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
812: vj++;
813: v += 16; xp = x + (*vj)*4;
814: }
815: xp = x + k*4;
816: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
817: }
819: VecRestoreArray(bb,&b);
820: VecRestoreArray(xx,&x);
821: PetscLogFlops(16*(2*a->nz + mbs));
822: return(0);
823: }
827: PetscErrorCode MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
828: {
829: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
830: IS isrow=a->row;
831: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
833: PetscInt nz,*vj,k,*r,idx;
834: MatScalar *aa=a->a,*v,*diag;
835: PetscScalar *x,*b,x0,x1,x2,*t,*tp;
838: VecGetArray(bb,&b);
839: VecGetArray(xx,&x);
840: t = a->solve_work;
841: ISGetIndices(isrow,&r);
843: /* solve U^T * D * y = b by forward substitution */
844: tp = t;
845: for (k=0; k<mbs; k++) { /* t <- perm(b) */
846: idx = 3*r[k];
847: tp[0] = b[idx];
848: tp[1] = b[idx+1];
849: tp[2] = b[idx+2];
850: tp += 3;
851: }
852:
853: for (k=0; k<mbs; k++){
854: v = aa + 9*ai[k];
855: vj = aj + ai[k];
856: tp = t + k*3;
857: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
858: nz = ai[k+1] - ai[k];
860: tp = t + (*vj)*3;
861: while (nz--) {
862: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
863: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
864: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
865: vj++; tp = t + (*vj)*3;
866: v += 9;
867: }
869: /* xk = inv(Dk)*(Dk*xk) */
870: diag = aa+k*9; /* ptr to inv(Dk) */
871: tp = t + k*3;
872: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
873: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
874: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
875: }
877: /* solve U*x = y by back substitution */
878: for (k=mbs-1; k>=0; k--){
879: v = aa + 9*ai[k];
880: vj = aj + ai[k];
881: tp = t + k*3;
882: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
883: nz = ai[k+1] - ai[k];
884:
885: tp = t + (*vj)*3;
886: while (nz--) {
887: /* xk += U(k,:)*x(:) */
888: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
889: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
890: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
891: vj++; tp = t + (*vj)*3;
892: v += 9;
893: }
894: tp = t + k*3;
895: tp[0] = x0; tp[1] = x1; tp[2] = x2;
896: idx = 3*r[k];
897: x[idx] = x0;
898: x[idx+1] = x1;
899: x[idx+2] = x2;
900: }
902: ISRestoreIndices(isrow,&r);
903: VecRestoreArray(bb,&b);
904: VecRestoreArray(xx,&x);
905: PetscLogFlops(9*(2*a->nz + mbs));
906: return(0);
907: }
909: /*
910: Special case where the matrix was factored in the natural ordering.
911: This eliminates the need for the column and row permutation.
912: */
915: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
916: {
917: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
918: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
919: MatScalar *aa=a->a,*v,*diag;
920: PetscScalar *x,*xp,*b,x0,x1,x2;
922: PetscInt nz,*vj,k;
925:
926: VecGetArray(bb,&b);
927: VecGetArray(xx,&x);
929: /* solve U^T * D * y = b by forward substitution */
930: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
931: for (k=0; k<mbs; k++){
932: v = aa + 9*ai[k];
933: xp = x + k*3;
934: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
935: nz = ai[k+1] - ai[k];
936: vj = aj + ai[k];
937: xp = x + (*vj)*3;
938: while (nz--) {
939: /* x(:) += U(k,:)^T*(Dk*xk) */
940: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
941: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
942: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
943: vj++; xp = x + (*vj)*3;
944: v += 9;
945: }
946: /* xk = inv(Dk)*(Dk*xk) */
947: diag = aa+k*9; /* ptr to inv(Dk) */
948: xp = x + k*3;
949: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
950: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
951: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
952: }
954: /* solve U*x = y by back substitution */
955: for (k=mbs-1; k>=0; k--){
956: v = aa + 9*ai[k];
957: xp = x + k*3;
958: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
959: nz = ai[k+1] - ai[k];
960: vj = aj + ai[k];
961: xp = x + (*vj)*3;
962: while (nz--) {
963: /* xk += U(k,:)*x(:) */
964: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
965: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
966: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
967: vj++;
968: v += 9; xp = x + (*vj)*3;
969: }
970: xp = x + k*3;
971: xp[0] = x0; xp[1] = x1; xp[2] = x2;
972: }
974: VecRestoreArray(bb,&b);
975: VecRestoreArray(xx,&x);
976: PetscLogFlops(9*(2*a->nz + mbs));
977: return(0);
978: }
982: PetscErrorCode MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
983: {
984: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data;
985: IS isrow=a->row;
986: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
988: PetscInt nz,*vj,k,k2,*r,idx;
989: MatScalar *aa=a->a,*v,*diag;
990: PetscScalar *x,*b,x0,x1,*t;
993: VecGetArray(bb,&b);
994: VecGetArray(xx,&x);
995: t = a->solve_work;
996: /* printf("called MatSolve_SeqSBAIJ_2\n"); */
997: ISGetIndices(isrow,&r);
999: /* solve U^T * D * y = perm(b) by forward substitution */
1000: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1001: idx = 2*r[k];
1002: t[k*2] = b[idx];
1003: t[k*2+1] = b[idx+1];
1004: }
1005: for (k=0; k<mbs; k++){
1006: v = aa + 4*ai[k];
1007: vj = aj + ai[k];
1008: k2 = k*2;
1009: x0 = t[k2]; x1 = t[k2+1];
1010: nz = ai[k+1] - ai[k];
1011: while (nz--) {
1012: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1013: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1014: vj++; v += 4;
1015: }
1016: diag = aa+k*4; /* ptr to inv(Dk) */
1017: t[k2] = diag[0]*x0 + diag[2]*x1;
1018: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1019: }
1021: /* solve U*x = y by back substitution */
1022: for (k=mbs-1; k>=0; k--){
1023: v = aa + 4*ai[k];
1024: vj = aj + ai[k];
1025: k2 = k*2;
1026: x0 = t[k2]; x1 = t[k2+1];
1027: nz = ai[k+1] - ai[k];
1028: while (nz--) {
1029: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1030: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1031: vj++; v += 4;
1032: }
1033: t[k2] = x0;
1034: t[k2+1] = x1;
1035: idx = 2*r[k];
1036: x[idx] = x0;
1037: x[idx+1] = x1;
1038: }
1040: ISRestoreIndices(isrow,&r);
1041: VecRestoreArray(bb,&b);
1042: VecRestoreArray(xx,&x);
1043: PetscLogFlops(4*(2*a->nz + mbs));
1044: return(0);
1045: }
1047: /*
1048: Special case where the matrix was factored in the natural ordering.
1049: This eliminates the need for the column and row permutation.
1050: */
1053: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1054: {
1055: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1056: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1057: MatScalar *aa=a->a,*v,*diag;
1058: PetscScalar *x,*b,x0,x1;
1060: PetscInt nz,*vj,k,k2;
1063:
1064: VecGetArray(bb,&b);
1065: VecGetArray(xx,&x);
1067: /* solve U^T * D * y = b by forward substitution */
1068: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1069: for (k=0; k<mbs; k++){
1070: v = aa + 4*ai[k];
1071: vj = aj + ai[k];
1072: k2 = k*2;
1073: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1074: nz = ai[k+1] - ai[k];
1075:
1076: while (nz--) {
1077: /* x(:) += U(k,:)^T*(Dk*xk) */
1078: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1079: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1080: vj++; v += 4;
1081: }
1082: /* xk = inv(Dk)*(Dk*xk) */
1083: diag = aa+k*4; /* ptr to inv(Dk) */
1084: x[k2] = diag[0]*x0 + diag[2]*x1;
1085: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1086: }
1088: /* solve U*x = y by back substitution */
1089: for (k=mbs-1; k>=0; k--){
1090: v = aa + 4*ai[k];
1091: vj = aj + ai[k];
1092: k2 = k*2;
1093: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1094: nz = ai[k+1] - ai[k];
1095: while (nz--) {
1096: /* xk += U(k,:)*x(:) */
1097: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1098: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1099: vj++; v += 4;
1100: }
1101: x[k2] = x0;
1102: x[k2+1] = x1;
1103: }
1105: VecRestoreArray(bb,&b);
1106: VecRestoreArray(xx,&x);
1107: PetscLogFlops(4*(2*a->nz + mbs)); /* bs2*(2*a->nz + mbs) */
1108: return(0);
1109: }
1113: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1114: {
1115: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1116: IS isrow=a->row;
1118: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rip;
1119: MatScalar *aa=a->a,*v;
1120: PetscScalar *x,*b,xk,*t;
1121: PetscInt nz,*vj,k;
1124: if (!mbs) return(0);
1126: VecGetArray(bb,&b);
1127: VecGetArray(xx,&x);
1128: t = a->solve_work;
1130: ISGetIndices(isrow,&rip);
1131:
1132: /* solve U^T*D*y = perm(b) by forward substitution */
1133: for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1134: for (k=0; k<mbs; k++){
1135: v = aa + ai[k];
1136: vj = aj + ai[k];
1137: xk = t[k];
1138: nz = ai[k+1] - ai[k];
1139: while (nz--) t[*vj++] += (*v++) * xk;
1140: t[k] = xk*aa[k]; /* note: aa[k] = 1/D(k) */
1141: }
1143: /* solve U*x = y by back substitution */
1144: for (k=mbs-1; k>=0; k--){
1145: v = aa + ai[k];
1146: vj = aj + ai[k];
1147: xk = t[k];
1148: nz = ai[k+1] - ai[k];
1149: while (nz--) xk += (*v++) * t[*vj++];
1150: t[k] = xk;
1151: x[rip[k]] = xk;
1152: }
1154: ISRestoreIndices(isrow,&rip);
1155: VecRestoreArray(bb,&b);
1156: VecRestoreArray(xx,&x);
1157: PetscLogFlops(4*a->nz + A->m);
1158: return(0);
1159: }
1163: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1164: {
1165: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1169: if (A->bs == 1) {
1170: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1171: } else {
1172: IS isrow=a->row;
1173: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rip,i;
1174: MatScalar *aa=a->a,*v;
1175: PetscScalar *x,*b,*t;
1176: PetscInt nz,*vj,k,n;
1177: if (bb->n > a->solves_work_n) {
1178: if (a->solves_work) {PetscFree(a->solves_work);}
1179: PetscMalloc(bb->n*A->m*sizeof(PetscScalar),&a->solves_work);
1180: a->solves_work_n = bb->n;
1181: }
1182: n = bb->n;
1183: VecGetArray(bb->v,&b);
1184: VecGetArray(xx->v,&x);
1185: t = a->solves_work;
1187: ISGetIndices(isrow,&rip);
1188:
1189: /* solve U^T*D*y = perm(b) by forward substitution */
1190: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rip[k]+i*mbs];} /* values are stored interlaced in t */
1191: for (k=0; k<mbs; k++){
1192: v = aa + ai[k];
1193: vj = aj + ai[k];
1194: nz = ai[k+1] - ai[k];
1195: while (nz--) {
1196: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1197: v++;vj++;
1198: }
1199: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1200: }
1201:
1202: /* solve U*x = y by back substitution */
1203: for (k=mbs-1; k>=0; k--){
1204: v = aa + ai[k];
1205: vj = aj + ai[k];
1206: nz = ai[k+1] - ai[k];
1207: while (nz--) {
1208: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1209: v++;vj++;
1210: }
1211: for (i=0; i<n; i++) x[rip[k]+i*mbs] = t[n*k+i];
1212: }
1214: ISRestoreIndices(isrow,&rip);
1215: VecRestoreArray(bb->v,&b);
1216: VecRestoreArray(xx->v,&x);
1217: PetscLogFlops(bb->n*(4*a->nz + A->m));
1218: }
1219: return(0);
1220: }
1222: /*
1223: Special case where the matrix was ILU(0) factored in the natural
1224: ordering. This eliminates the need for the column and row permutation.
1225: */
1228: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1229: {
1230: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1232: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1233: MatScalar *aa=a->a,*v;
1234: PetscScalar *x,*b,xk;
1235: PetscInt nz,*vj,k;
1238: VecGetArray(bb,&b);
1239: VecGetArray(xx,&x);
1240:
1241: /* solve U^T*D*y = b by forward substitution */
1242: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1243: for (k=0; k<mbs; k++){
1244: v = aa + ai[k] + 1;
1245: vj = aj + ai[k] + 1;
1246: xk = x[k];
1247: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
1248: while (nz--) x[*vj++] += (*v++) * xk;
1249: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
1250: }
1252: /* solve U*x = y by back substitution */
1253: for (k=mbs-2; k>=0; k--){
1254: v = aa + ai[k] + 1;
1255: vj = aj + ai[k] + 1;
1256: xk = x[k];
1257: nz = ai[k+1] - ai[k] - 1;
1258: while (nz--) xk += (*v++) * x[*vj++];
1259: x[k] = xk;
1260: }
1262: VecRestoreArray(bb,&b);
1263: VecRestoreArray(xx,&x);
1264: PetscLogFlops(4*a->nz + A->m);
1265: return(0);
1266: }
1268: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1271: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
1272: {
1273: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1275: PetscInt *rip,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1276: PetscInt *jutmp,bs = A->bs,bs2=a->bs2;
1277: PetscInt m,reallocs = 0,*levtmp;
1278: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd,*jl;
1279: PetscInt incrlev,*lev,shift,prow,nz;
1280: PetscInt *il,ili,nextprow;
1281: PetscReal f = info->fill,levels = info->levels;
1282: PetscTruth perm_identity;
1285: /* check whether perm is the identity mapping */
1286: ISIdentity(perm,&perm_identity);
1288: /* special case that simply copies fill pattern */
1289: if (!levels && perm_identity && bs==1) {
1290: MatDuplicate_SeqSBAIJ(A,MAT_DO_NOT_COPY_VALUES,B);
1291: (*B)->factor = FACTOR_CHOLESKY;
1292: b = (Mat_SeqSBAIJ*)(*B)->data;
1293: b->row = perm;
1294: b->icol = perm;
1295: b->factor_damping = info->damping;
1296: b->factor_shift = info->shift;
1297: b->factor_zeropivot = info->zeropivot;
1298: PetscObjectReference((PetscObject)perm);
1299: PetscObjectReference((PetscObject)perm);
1300: PetscMalloc(((*B)->m+1)*sizeof(PetscScalar),&b->solve_work);
1301: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1302: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1303: return(0);
1304: }
1306: /* --- inplace icc(levels), levels>0, ie, *B has same data structure as A --- */
1307: if (levels > 0 && perm_identity && bs==1 ){
1308: if (!perm_identity) a->permute = PETSC_TRUE;
1309:
1310: ISGetIndices(perm,&rip);
1311:
1312: if (perm_identity){ /* without permutation */
1313: ai = a->i; aj = a->j;
1314: } else { /* non-trivial permutation */
1315: MatReorderingSeqSBAIJ(A,perm);
1316: ai = a->inew; aj = a->jnew;
1317: }
1318:
1319: /* initialization */
1320: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
1321: umax = (PetscInt)(f*ai[mbs] + 1);
1322: PetscMalloc(umax*sizeof(PetscInt),&lev);
1323: PetscMalloc(umax*sizeof(PetscInt),&ju);
1324: iu[0] = 0;
1325: juidx = 0; /* index for ju */
1326: PetscMalloc((4*mbs+1)*sizeof(PetscInt),&jl); /* linked list for getting pivot row */
1327: q = jl + mbs; /* linked list for col index of active row */
1328: levtmp = q + mbs;
1329: il = levtmp + mbs;
1330: for (i=0; i<mbs; i++){
1331: jl[i] = mbs;
1332: q[i] = 0;
1333: il[i] = 0;
1334: }
1336: /* for each row k */
1337: for (k=0; k<mbs; k++){
1338: nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
1339: q[k] = mbs;
1340: /* initialize nonzero structure of k-th row to row rip[k] of A */
1341: jmin = ai[rip[k]] +1; /* exclude diag[k] */
1342: jmax = ai[rip[k]+1];
1343: for (j=jmin; j<jmax; j++){
1344: vj = rip[aj[j]]; /* col. value */
1345: if(vj > k){
1346: qm = k;
1347: do {
1348: m = qm; qm = q[m];
1349: } while(qm < vj);
1350: if (qm == vj) {
1351: SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
1352: }
1353: nzk++;
1354: q[m] = vj;
1355: q[vj] = qm;
1356: levtmp[vj] = 0; /* initialize lev for nonzero element */
1357: } /* if(vj > k) */
1358: } /* for (j=jmin; j<jmax; j++) */
1360: /* modify nonzero structure of k-th row by computing fill-in
1361: for each row i to be merged in */
1362: prow = k;
1363: prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
1364:
1365: while (prow < k){
1366: nextprow = jl[prow];
1367: /* merge row prow into k-th row */
1368: ili = il[prow];
1369: jmin = ili + 1; /* points to 2nd nzero entry in U(prow,k:mbs-1) */
1370: jmax = iu[prow+1];
1371: qm = k;
1372: for (j=jmin; j<jmax; j++){
1373: vj = ju[j];
1374: incrlev = lev[j] + 1;
1375: if (incrlev > levels) continue;
1376: do {
1377: m = qm; qm = q[m];
1378: } while (qm < vj);
1379: if (qm != vj){ /* a fill */
1380: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1381: levtmp[vj] = incrlev;
1382: } else {
1383: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1384: }
1385: }
1386: if (jmin < jmax){ /* update il and jl */
1387: il[prow] = jmin;
1388: j = ju[jmin];
1389: jl[prow] = jl[j]; jl[j] = prow;
1390: }
1391: prow = nextprow;
1392: }
1393:
1394: /* add the first nonzero element in U(k, k+1:mbs-1) to jl */
1395: if (nzk > 0){
1396: i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1397: jl[k] = jl[i]; jl[i] = k;
1398: il[k] = iu[k] + 1;
1399: }
1400: iu[k+1] = iu[k] + nzk + 1; /* include diag[k] */
1402: /* allocate more space to ju if needed */
1403: if (iu[k+1] > umax) {
1404: /* estimate how much additional space we will need */
1405: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1406: /* just double the memory each time */
1407: maxadd = umax;
1408: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1409: umax += maxadd;
1411: /* allocate a longer ju */
1412: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1413: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
1414: PetscFree(ju);
1415: ju = jutmp;
1417: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1418: PetscMemcpy(jutmp,lev,(iu[k])*sizeof(PetscInt));
1419: PetscFree(lev);
1420: lev = jutmp;
1421: reallocs += 2; /* count how many times we realloc */
1422: }
1424: /* save nonzero structure of k-th row in ju */
1425: ju[juidx] = k; /* diag[k] */
1426: lev[juidx] = 0;
1427: juidx++;
1428: i = k;
1429: while (nzk --) {
1430: i = q[i];
1431: ju[juidx] = i;
1432: lev[juidx] = levtmp[i];
1433: juidx++;
1434: }
1435: } /* end of for (k=0; k<mbs; k++) */
1437: if (ai[mbs] != 0) {
1438: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1439: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
1440: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
1441: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
1442: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
1443: } else {
1444: PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1445: }
1447: ISRestoreIndices(perm,&rip);
1448: PetscFree(jl);
1449: PetscFree(lev);
1451: /* put together the new matrix */
1452: MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);
1453: MatSetType(*B,A->type_name);
1454: MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);
1456: /* PetscLogObjectParent(*B,iperm); */
1457: b = (Mat_SeqSBAIJ*)(*B)->data;
1458: PetscFree(b->imax);
1459: b->singlemalloc = PETSC_FALSE;
1460: /* the next line frees the default space generated by the Create() */
1461: PetscFree(b->a);
1462: PetscFree(b->ilen);
1463: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1464: b->j = ju;
1465: b->i = iu;
1466: b->diag = 0;
1467: b->ilen = 0;
1468: b->imax = 0;
1469: b->row = perm;
1470: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1471: PetscObjectReference((PetscObject)perm);
1472: b->icol = perm;
1473: PetscObjectReference((PetscObject)perm);
1474: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1475: /* In b structure: Free imax, ilen, old a, old j.
1476: Allocate idnew, solve_work, new a, new j */
1477: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
1478: b->maxnz = b->nz = iu[mbs];
1479: b->factor_damping = info->damping;
1480: b->factor_shift = info->shift;
1481: b->factor_zeropivot = info->zeropivot;
1483: (*B)->factor = FACTOR_CHOLESKY;
1484: (*B)->info.factor_mallocs = reallocs;
1485: (*B)->info.fill_ratio_given = f;
1486: if (ai[mbs] != 0) {
1487: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1488: } else {
1489: (*B)->info.fill_ratio_needed = 0.0;
1490: }
1493: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1494: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1495: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
1496:
1497: return(0);
1498: } /* end of if (levels > 0 && perm_identity && bs==1 ) */
1500: if (!perm_identity) a->permute = PETSC_TRUE;
1501: if (perm_identity){
1502: ai = a->i; aj = a->j;
1503: } else { /* non-trivial permutation */
1504: MatReorderingSeqSBAIJ(A, perm);
1505: ai = a->inew; aj = a->jnew;
1506: }
1507:
1508: /* initialization */
1509: ISGetIndices(perm,&rip);
1510: umax = (PetscInt)(f*ai[mbs] + 1);
1511: PetscMalloc(umax*sizeof(PetscInt),&lev);
1512: umax += mbs + 1;
1513: shift = mbs + 1;
1514: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
1515: PetscMalloc(umax*sizeof(PetscInt),&ju);
1516: iu[0] = mbs + 1;
1517: juidx = mbs + 1;
1518: /* prowl: linked list for pivot row */
1519: PetscMalloc((3*mbs+1)*sizeof(PetscInt),&prowl);
1520: /* q: linked list for col index */
1521: q = prowl + mbs;
1522: levtmp = q + mbs;
1523:
1524: for (i=0; i<mbs; i++){
1525: prowl[i] = mbs;
1526: q[i] = 0;
1527: }
1529: /* for each row k */
1530: for (k=0; k<mbs; k++){
1531: nzk = 0;
1532: q[k] = mbs;
1533: /* copy current row into linked list */
1534: nz = ai[rip[k]+1] - ai[rip[k]];
1535: j = ai[rip[k]];
1536: while (nz--){
1537: vj = rip[aj[j++]];
1538: if (vj > k){
1539: qm = k;
1540: do {
1541: m = qm; qm = q[m];
1542: } while(qm < vj);
1543: if (qm == vj) {
1544: SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
1545: }
1546: nzk++;
1547: q[m] = vj;
1548: q[vj] = qm;
1549: levtmp[vj] = 0; /* initialize lev for nonzero element */
1550: }
1551: }
1553: /* modify nonzero structure of k-th row by computing fill-in
1554: for each row prow to be merged in */
1555: prow = k;
1556: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1557:
1558: while (prow < k){
1559: /* merge row prow into k-th row */
1560: jmin = iu[prow] + 1;
1561: jmax = iu[prow+1];
1562: qm = k;
1563: for (j=jmin; j<jmax; j++){
1564: incrlev = lev[j-shift] + 1;
1565: if (incrlev > levels) continue;
1567: vj = ju[j];
1568: do {
1569: m = qm; qm = q[m];
1570: } while (qm < vj);
1571: if (qm != vj){ /* a fill */
1572: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1573: levtmp[vj] = incrlev;
1574: } else {
1575: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1576: }
1577: }
1578: prow = prowl[prow]; /* next pivot row */
1579: }
1580:
1581: /* add k to row list for first nonzero element in k-th row */
1582: if (nzk > 1){
1583: i = q[k]; /* col value of first nonzero element in k_th row of U */
1584: prowl[k] = prowl[i]; prowl[i] = k;
1585: }
1586: iu[k+1] = iu[k] + nzk;
1588: /* allocate more space to ju and lev if needed */
1589: if (iu[k+1] > umax) {
1590: /* estimate how much additional space we will need */
1591: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1592: /* just double the memory each time */
1593: maxadd = umax;
1594: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1595: umax += maxadd;
1597: /* allocate a longer ju */
1598: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1599: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
1600: PetscFree(ju);
1601: ju = jutmp;
1603: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1604: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
1605: PetscFree(lev);
1606: lev = jutmp;
1607: reallocs += 2; /* count how many times we realloc */
1608: }
1610: /* save nonzero structure of k-th row in ju */
1611: i=k;
1612: while (nzk --) {
1613: i = q[i];
1614: ju[juidx] = i;
1615: lev[juidx-shift] = levtmp[i];
1616: juidx++;
1617: }
1618: }
1619:
1620: if (ai[mbs] != 0) {
1621: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1622: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
1623: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Run with -pc_icc_fill %g or use \n",af);
1624: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:PCICCSetFill(pc,%g);\n",af);
1625: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:for best performance.\n");
1626: } else {
1627: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1628: }
1630: ISRestoreIndices(perm,&rip);
1631: PetscFree(prowl);
1632: PetscFree(lev);
1634: /* put together the new matrix */
1635: MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);
1636: MatSetType(*B,A->type_name);
1637: MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);
1639: /* PetscLogObjectParent(*B,iperm); */
1640: b = (Mat_SeqSBAIJ*)(*B)->data;
1641: PetscFree(b->imax);
1642: b->singlemalloc = PETSC_FALSE;
1643: /* the next line frees the default space generated by the Create() */
1644: PetscFree(b->a);
1645: PetscFree(b->ilen);
1646: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1647: b->j = ju;
1648: b->i = iu;
1649: b->diag = 0;
1650: b->ilen = 0;
1651: b->imax = 0;
1652:
1653: if (b->row) {
1654: ISDestroy(b->row);
1655: }
1656: if (b->icol) {
1657: ISDestroy(b->icol);
1658: }
1659: b->row = perm;
1660: b->icol = perm;
1661: PetscObjectReference((PetscObject)perm);
1662: PetscObjectReference((PetscObject)perm);
1663: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1664: /* In b structure: Free imax, ilen, old a, old j.
1665: Allocate idnew, solve_work, new a, new j */
1666: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
1667: b->maxnz = b->nz = iu[mbs];
1668:
1669: (*B)->factor = FACTOR_CHOLESKY;
1670: (*B)->info.factor_mallocs = reallocs;
1671: (*B)->info.fill_ratio_given = f;
1672: if (ai[mbs] != 0) {
1673: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1674: } else {
1675: (*B)->info.fill_ratio_needed = 0.0;
1676: }
1678: if (perm_identity){
1679: switch (bs) {
1680: case 1:
1681: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1682: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1683: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1684: (*B)->ops->solves = MatSolves_SeqSBAIJ_1;
1685: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJl:Using special in-place natural ordering factor and solve BS=1\n");
1686: break;
1687: case 2:
1688: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1689: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1690: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1691: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
1692: break;
1693: case 3:
1694: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1695: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1696: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1697: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
1698: break;
1699: case 4:
1700: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1701: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1702: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1703: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
1704: break;
1705: case 5:
1706: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1707: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1708: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1709: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
1710: break;
1711: case 6:
1712: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1713: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1714: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1715: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
1716: break;
1717: case 7:
1718: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1719: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1720: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1721: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
1722: break;
1723: default:
1724: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1725: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1726: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1727: PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
1728: break;
1729: }
1730: }
1732: return(0);
1733: }