Actual source code: sbaijfact2.c

  1: /*
  2:     Factorization code for SBAIJ format. 
  3: */

 5:  #include src/mat/impls/sbaij/seq/sbaij.h
 6:  #include src/mat/impls/baij/seq/baij.h
 7:  #include src/inline/ilu.h
 8:  #include src/inline/dot.h

 12: PetscErrorCode MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
 13: {
 14:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
 15:   IS              isrow=a->row;
 16:   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
 17:   PetscErrorCode  ierr;
 18:   PetscInt        nz,*vj,k,*r,idx,k1;
 19:   PetscInt        bs=A->bs,bs2 = a->bs2;
 20:   MatScalar       *aa=a->a,*v,*diag;
 21:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;

 24:   VecGetArray(bb,&b);
 25:   VecGetArray(xx,&x);
 26:   t  = a->solve_work;
 27:   ISGetIndices(isrow,&r);
 28:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

 30:   /* solve U^T * D * y = b by forward substitution */
 31:   xk = t;
 32:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 33:     idx   = bs*r[k];
 34:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 35:   }
 36:   for (k=0; k<mbs; k++){
 37:     v  = aa + bs2*ai[k];
 38:     xk = t + k*bs;      /* Dk*xk = k-th block of x */
 39:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
 40:     nz = ai[k+1] - ai[k];
 41:     vj = aj + ai[k];
 42:     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
 43:     while (nz--) {
 44:       /* x(:) += U(k,:)^T*(Dk*xk) */
 45:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 46:       vj++; xj = t + (*vj)*bs;
 47:       v += bs2;
 48:     }
 49:     /* xk = inv(Dk)*(Dk*xk) */
 50:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 51:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 52:   }

 54:   /* solve U*x = y by back substitution */
 55:   for (k=mbs-1; k>=0; k--){
 56:     v  = aa + bs2*ai[k];
 57:     xk = t + k*bs;        /* xk */
 58:     nz = ai[k+1] - ai[k];
 59:     vj = aj + ai[k];
 60:     xj = t + (*vj)*bs;
 61:     while (nz--) {
 62:       /* xk += U(k,:)*x(:) */
 63:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 64:       vj++;
 65:       v += bs2; xj = t + (*vj)*bs;
 66:     }
 67:     idx   = bs*r[k];
 68:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 69:   }

 71:   PetscFree(xk_tmp);
 72:   ISRestoreIndices(isrow,&r);
 73:   VecRestoreArray(bb,&b);
 74:   VecRestoreArray(xx,&x);
 75:   PetscLogFlops(bs2*(2*a->nz + mbs));
 76:   return(0);
 77: }

 81: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
 82: {
 83:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
 85:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
 86:   PetscInt       nz,*vj,k;
 87:   PetscInt       bs=A->bs,bs2 = a->bs2;
 88:   MatScalar      *aa=a->a,*v,*diag;
 89:   PetscScalar    *x,*xk,*xj,*b,*xk_tmp;

 92: 
 93:   VecGetArray(bb,&b);
 94:   VecGetArray(xx,&x);

 96:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

 98:   /* solve U^T * D * y = b by forward substitution */
 99:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
100:   for (k=0; k<mbs; k++){
101:     v  = aa + bs2*ai[k];
102:     xk = x + k*bs;      /* Dk*xk = k-th block of x */
103:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
104:     nz = ai[k+1] - ai[k];
105:     vj = aj + ai[k];
106:     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
107:     while (nz--) {
108:       /* x(:) += U(k,:)^T*(Dk*xk) */
109:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
110:       vj++; xj = x + (*vj)*bs;
111:       v += bs2;
112:     }
113:     /* xk = inv(Dk)*(Dk*xk) */
114:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
115:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
116:   }

118:   /* solve U*x = y by back substitution */
119:   for (k=mbs-1; k>=0; k--){
120:     v  = aa + bs2*ai[k];
121:     xk = x + k*bs;        /* xk */
122:     nz = ai[k+1] - ai[k];
123:     vj = aj + ai[k];
124:     xj = x + (*vj)*bs;
125:     while (nz--) {
126:       /* xk += U(k,:)*x(:) */
127:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
128:       vj++;
129:       v += bs2; xj = x + (*vj)*bs;
130:     }
131:   }

133:   PetscFree(xk_tmp);
134:   VecRestoreArray(bb,&b);
135:   VecRestoreArray(xx,&x);
136:   PetscLogFlops(bs2*(2*a->nz + mbs));
137:   return(0);
138: }

142: PetscErrorCode MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
143: {
144:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
145:   IS             isrow=a->row;
146:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
148:   PetscInt       nz,*vj,k,*r,idx;
149:   MatScalar      *aa=a->a,*v,*d;
150:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

153:   VecGetArray(bb,&b);
154:   VecGetArray(xx,&x);
155:   t  = a->solve_work;
156:   ISGetIndices(isrow,&r);

158:   /* solve U^T * D * y = b by forward substitution */
159:   tp = t;
160:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
161:     idx   = 7*r[k];
162:     tp[0] = b[idx];
163:     tp[1] = b[idx+1];
164:     tp[2] = b[idx+2];
165:     tp[3] = b[idx+3];
166:     tp[4] = b[idx+4];
167:     tp[5] = b[idx+5];
168:     tp[6] = b[idx+6];
169:     tp += 7;
170:   }
171: 
172:   for (k=0; k<mbs; k++){
173:     v  = aa + 49*ai[k];
174:     vj = aj + ai[k];
175:     tp = t + k*7;
176:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
177:     nz = ai[k+1] - ai[k];
178:     tp = t + (*vj)*7;
179:     while (nz--) {
180:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
181:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
182:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
183:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
184:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
185:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
186:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
187:       vj++; tp = t + (*vj)*7;
188:       v += 49;
189:     }

191:     /* xk = inv(Dk)*(Dk*xk) */
192:     d  = aa+k*49;          /* ptr to inv(Dk) */
193:     tp    = t + k*7;
194:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
195:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
196:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
197:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
198:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
199:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
200:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
201:   }

203:   /* solve U*x = y by back substitution */
204:   for (k=mbs-1; k>=0; k--){
205:     v  = aa + 49*ai[k];
206:     vj = aj + ai[k];
207:     tp    = t + k*7;
208:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
209:     nz = ai[k+1] - ai[k];
210: 
211:     tp = t + (*vj)*7;
212:     while (nz--) {
213:       /* xk += U(k,:)*x(:) */
214:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
215:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
216:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
217:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
218:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
219:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
220:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
221:       vj++; tp = t + (*vj)*7;
222:       v += 49;
223:     }
224:     tp    = t + k*7;
225:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
226:     idx   = 7*r[k];
227:     x[idx]     = x0;
228:     x[idx+1]   = x1;
229:     x[idx+2]   = x2;
230:     x[idx+3]   = x3;
231:     x[idx+4]   = x4;
232:     x[idx+5]   = x5;
233:     x[idx+6]   = x6;
234:   }

236:   ISRestoreIndices(isrow,&r);
237:   VecRestoreArray(bb,&b);
238:   VecRestoreArray(xx,&x);
239:   PetscLogFlops(49*(2*a->nz + mbs));
240:   return(0);
241: }

245: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
246: {
247:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
249:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
250:   MatScalar      *aa=a->a,*v,*d;
251:   PetscScalar    *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
252:   PetscInt       nz,*vj,k;

255:   VecGetArray(bb,&b);
256:   VecGetArray(xx,&x);
257: 
258:   /* solve U^T * D * y = b by forward substitution */
259:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
260:   for (k=0; k<mbs; k++){
261:     v  = aa + 49*ai[k];
262:     xp = x + k*7;
263:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
264:     nz = ai[k+1] - ai[k];
265:     vj = aj + ai[k];
266:     xp = x + (*vj)*7;
267:     while (nz--) {
268:       /* x(:) += U(k,:)^T*(Dk*xk) */
269:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
270:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
271:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
272:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
273:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
274:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
275:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
276:       vj++; xp = x + (*vj)*7;
277:       v += 49;
278:     }
279:     /* xk = inv(Dk)*(Dk*xk) */
280:     d  = aa+k*49;          /* ptr to inv(Dk) */
281:     xp = x + k*7;
282:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
283:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
284:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
285:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
286:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
287:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
288:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
289:   }

291:   /* solve U*x = y by back substitution */
292:   for (k=mbs-1; k>=0; k--){
293:     v  = aa + 49*ai[k];
294:     xp = x + k*7;
295:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
296:     nz = ai[k+1] - ai[k];
297:     vj = aj + ai[k];
298:     xp = x + (*vj)*7;
299:     while (nz--) {
300:       /* xk += U(k,:)*x(:) */
301:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
302:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
303:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
304:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
305:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
306:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
307:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
308:       vj++;
309:       v += 49; xp = x + (*vj)*7;
310:     }
311:     xp = x + k*7;
312:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
313:   }

315:   VecRestoreArray(bb,&b);
316:   VecRestoreArray(xx,&x);
317:   PetscLogFlops(49*(2*a->nz + mbs));
318:   return(0);
319: }

323: PetscErrorCode MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
324: {
325:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
326:   IS             isrow=a->row;
327:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
329:   PetscInt       nz,*vj,k,*r,idx;
330:   MatScalar      *aa=a->a,*v,*d;
331:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

334:   VecGetArray(bb,&b);
335:   VecGetArray(xx,&x);
336:   t  = a->solve_work;
337:   ISGetIndices(isrow,&r);

339:   /* solve U^T * D * y = b by forward substitution */
340:   tp = t;
341:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
342:     idx   = 6*r[k];
343:     tp[0] = b[idx];
344:     tp[1] = b[idx+1];
345:     tp[2] = b[idx+2];
346:     tp[3] = b[idx+3];
347:     tp[4] = b[idx+4];
348:     tp[5] = b[idx+5];
349:     tp += 6;
350:   }
351: 
352:   for (k=0; k<mbs; k++){
353:     v  = aa + 36*ai[k];
354:     vj = aj + ai[k];
355:     tp = t + k*6;
356:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
357:     nz = ai[k+1] - ai[k];
358:     tp = t + (*vj)*6;
359:     while (nz--) {
360:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
361:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
362:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
363:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
364:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
365:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
366:       vj++; tp = t + (*vj)*6;
367:       v += 36;
368:     }

370:     /* xk = inv(Dk)*(Dk*xk) */
371:     d  = aa+k*36;          /* ptr to inv(Dk) */
372:     tp    = t + k*6;
373:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
374:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
375:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
376:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
377:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
378:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
379:   }

381:   /* solve U*x = y by back substitution */
382:   for (k=mbs-1; k>=0; k--){
383:     v  = aa + 36*ai[k];
384:     vj = aj + ai[k];
385:     tp    = t + k*6;
386:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
387:     nz = ai[k+1] - ai[k];
388: 
389:     tp = t + (*vj)*6;
390:     while (nz--) {
391:       /* xk += U(k,:)*x(:) */
392:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
393:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
394:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
395:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
396:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
397:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
398:       vj++; tp = t + (*vj)*6;
399:       v += 36;
400:     }
401:     tp    = t + k*6;
402:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
403:     idx   = 6*r[k];
404:     x[idx]     = x0;
405:     x[idx+1]   = x1;
406:     x[idx+2]   = x2;
407:     x[idx+3]   = x3;
408:     x[idx+4]   = x4;
409:     x[idx+5]   = x5;
410:   }

412:   ISRestoreIndices(isrow,&r);
413:   VecRestoreArray(bb,&b);
414:   VecRestoreArray(xx,&x);
415:   PetscLogFlops(36*(2*a->nz + mbs));
416:   return(0);
417: }

421: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
422: {
423:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
424:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
425:   MatScalar      *aa=a->a,*v,*d;
426:   PetscScalar    *x,*xp,*b,x0,x1,x2,x3,x4,x5;
428:   PetscInt       nz,*vj,k;

431: 
432:   VecGetArray(bb,&b);
433:   VecGetArray(xx,&x);
434: 
435:   /* solve U^T * D * y = b by forward substitution */
436:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
437:   for (k=0; k<mbs; k++){
438:     v  = aa + 36*ai[k];
439:     xp = x + k*6;
440:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
441:     nz = ai[k+1] - ai[k];
442:     vj = aj + ai[k];
443:     xp = x + (*vj)*6;
444:     while (nz--) {
445:       /* x(:) += U(k,:)^T*(Dk*xk) */
446:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
447:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
448:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
449:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
450:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
451:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
452:       vj++; xp = x + (*vj)*6;
453:       v += 36;
454:     }
455:     /* xk = inv(Dk)*(Dk*xk) */
456:     d  = aa+k*36;          /* ptr to inv(Dk) */
457:     xp = x + k*6;
458:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
459:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
460:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
461:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
462:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
463:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
464:   }

466:   /* solve U*x = y by back substitution */
467:   for (k=mbs-1; k>=0; k--){
468:     v  = aa + 36*ai[k];
469:     xp = x + k*6;
470:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
471:     nz = ai[k+1] - ai[k];
472:     vj = aj + ai[k];
473:     xp = x + (*vj)*6;
474:     while (nz--) {
475:       /* xk += U(k,:)*x(:) */
476:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
477:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
478:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
479:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
480:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
481:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
482:       vj++;
483:       v += 36; xp = x + (*vj)*6;
484:     }
485:     xp = x + k*6;
486:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
487:   }

489:   VecRestoreArray(bb,&b);
490:   VecRestoreArray(xx,&x);
491:   PetscLogFlops(36*(2*a->nz + mbs));
492:   return(0);
493: }

497: PetscErrorCode MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
498: {
499:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
500:   IS             isrow=a->row;
501:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
503:   PetscInt       nz,*vj,k,*r,idx;
504:   MatScalar      *aa=a->a,*v,*diag;
505:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;

508:   VecGetArray(bb,&b);
509:   VecGetArray(xx,&x);
510:   t  = a->solve_work;
511:   ISGetIndices(isrow,&r);

513:   /* solve U^T * D * y = b by forward substitution */
514:   tp = t;
515:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
516:     idx   = 5*r[k];
517:     tp[0] = b[idx];
518:     tp[1] = b[idx+1];
519:     tp[2] = b[idx+2];
520:     tp[3] = b[idx+3];
521:     tp[4] = b[idx+4];
522:     tp += 5;
523:   }
524: 
525:   for (k=0; k<mbs; k++){
526:     v  = aa + 25*ai[k];
527:     vj = aj + ai[k];
528:     tp = t + k*5;
529:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
530:     nz = ai[k+1] - ai[k];

532:     tp = t + (*vj)*5;
533:     while (nz--) {
534:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
535:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
536:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
537:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
538:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
539:       vj++; tp = t + (*vj)*5;
540:       v += 25;
541:     }

543:     /* xk = inv(Dk)*(Dk*xk) */
544:     diag  = aa+k*25;          /* ptr to inv(Dk) */
545:     tp    = t + k*5;
546:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
547:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
548:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
549:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
550:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
551:   }

553:   /* solve U*x = y by back substitution */
554:   for (k=mbs-1; k>=0; k--){
555:     v  = aa + 25*ai[k];
556:     vj = aj + ai[k];
557:     tp    = t + k*5;
558:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
559:     nz = ai[k+1] - ai[k];
560: 
561:     tp = t + (*vj)*5;
562:     while (nz--) {
563:       /* xk += U(k,:)*x(:) */
564:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
565:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
566:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
567:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
568:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
569:       vj++; tp = t + (*vj)*5;
570:       v += 25;
571:     }
572:     tp    = t + k*5;
573:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
574:     idx   = 5*r[k];
575:     x[idx]     = x0;
576:     x[idx+1]   = x1;
577:     x[idx+2]   = x2;
578:     x[idx+3]   = x3;
579:     x[idx+4]   = x4;
580:   }

582:   ISRestoreIndices(isrow,&r);
583:   VecRestoreArray(bb,&b);
584:   VecRestoreArray(xx,&x);
585:   PetscLogFlops(25*(2*a->nz + mbs));
586:   return(0);
587: }

591: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
592: {
593:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
594:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
595:   MatScalar      *aa=a->a,*v,*diag;
596:   PetscScalar    *x,*xp,*b,x0,x1,x2,x3,x4;
598:   PetscInt       nz,*vj,k;

601: 
602:   VecGetArray(bb,&b);
603:   VecGetArray(xx,&x);

605:   /* solve U^T * D * y = b by forward substitution */
606:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
607:   for (k=0; k<mbs; k++){
608:     v  = aa + 25*ai[k];
609:     xp = x + k*5;
610:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
611:     nz = ai[k+1] - ai[k];
612:     vj = aj + ai[k];
613:     xp = x + (*vj)*5;
614:     while (nz--) {
615:       /* x(:) += U(k,:)^T*(Dk*xk) */
616:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
617:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
618:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
619:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
620:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
621:       vj++; xp = x + (*vj)*5;
622:       v += 25;
623:     }
624:     /* xk = inv(Dk)*(Dk*xk) */
625:     diag = aa+k*25;          /* ptr to inv(Dk) */
626:     xp   = x + k*5;
627:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
628:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
629:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
630:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
631:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
632:   }

634:   /* solve U*x = y by back substitution */
635:   for (k=mbs-1; k>=0; k--){
636:     v  = aa + 25*ai[k];
637:     xp = x + k*5;
638:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
639:     nz = ai[k+1] - ai[k];
640:     vj = aj + ai[k];
641:     xp = x + (*vj)*5;
642:     while (nz--) {
643:       /* xk += U(k,:)*x(:) */
644:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
645:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
646:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
647:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
648:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
649:       vj++;
650:       v += 25; xp = x + (*vj)*5;
651:     }
652:     xp = x + k*5;
653:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
654:   }

656:   VecRestoreArray(bb,&b);
657:   VecRestoreArray(xx,&x);
658:   PetscLogFlops(25*(2*a->nz + mbs));
659:   return(0);
660: }

664: PetscErrorCode MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
665: {
666:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
667:   IS             isrow=a->row;
668:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
670:   PetscInt       nz,*vj,k,*r,idx;
671:   MatScalar      *aa=a->a,*v,*diag;
672:   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;

675:   VecGetArray(bb,&b);
676:   VecGetArray(xx,&x);
677:   t  = a->solve_work;
678:   ISGetIndices(isrow,&r);

680:   /* solve U^T * D * y = b by forward substitution */
681:   tp = t;
682:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
683:     idx   = 4*r[k];
684:     tp[0] = b[idx];
685:     tp[1] = b[idx+1];
686:     tp[2] = b[idx+2];
687:     tp[3] = b[idx+3];
688:     tp += 4;
689:   }
690: 
691:   for (k=0; k<mbs; k++){
692:     v  = aa + 16*ai[k];
693:     vj = aj + ai[k];
694:     tp = t + k*4;
695:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
696:     nz = ai[k+1] - ai[k];

698:     tp = t + (*vj)*4;
699:     while (nz--) {
700:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
701:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
702:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
703:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
704:       vj++; tp = t + (*vj)*4;
705:       v += 16;
706:     }

708:     /* xk = inv(Dk)*(Dk*xk) */
709:     diag  = aa+k*16;          /* ptr to inv(Dk) */
710:     tp    = t + k*4;
711:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
712:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
713:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
714:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
715:   }

717:   /* solve U*x = y by back substitution */
718:   for (k=mbs-1; k>=0; k--){
719:     v  = aa + 16*ai[k];
720:     vj = aj + ai[k];
721:     tp    = t + k*4;
722:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
723:     nz = ai[k+1] - ai[k];
724: 
725:     tp = t + (*vj)*4;
726:     while (nz--) {
727:       /* xk += U(k,:)*x(:) */
728:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
729:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
730:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
731:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
732:       vj++; tp = t + (*vj)*4;
733:       v += 16;
734:     }
735:     tp    = t + k*4;
736:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
737:     idx        = 4*r[k];
738:     x[idx]     = x0;
739:     x[idx+1]   = x1;
740:     x[idx+2]   = x2;
741:     x[idx+3]   = x3;
742:   }

744:   ISRestoreIndices(isrow,&r);
745:   VecRestoreArray(bb,&b);
746:   VecRestoreArray(xx,&x);
747:   PetscLogFlops(16*(2*a->nz + mbs));
748:   return(0);
749: }

751: /*
752:    Special case where the matrix was factored in the natural ordering. 
753:    This eliminates the need for the column and row permutation.
754: */
757: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
758: {
759:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
760:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
761:   MatScalar      *aa=a->a,*v,*diag;
762:   PetscScalar    *x,*xp,*b,x0,x1,x2,x3;
764:   PetscInt       nz,*vj,k;

767: 
768:   VecGetArray(bb,&b);
769:   VecGetArray(xx,&x);

771:   /* solve U^T * D * y = b by forward substitution */
772:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
773:   for (k=0; k<mbs; k++){
774:     v  = aa + 16*ai[k];
775:     xp = x + k*4;
776:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
777:     nz = ai[k+1] - ai[k];
778:     vj = aj + ai[k];
779:     xp = x + (*vj)*4;
780:     while (nz--) {
781:       /* x(:) += U(k,:)^T*(Dk*xk) */
782:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
783:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
784:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
785:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
786:       vj++; xp = x + (*vj)*4;
787:       v += 16;
788:     }
789:     /* xk = inv(Dk)*(Dk*xk) */
790:     diag = aa+k*16;          /* ptr to inv(Dk) */
791:     xp   = x + k*4;
792:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
793:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
794:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
795:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
796:   }

798:   /* solve U*x = y by back substitution */
799:   for (k=mbs-1; k>=0; k--){
800:     v  = aa + 16*ai[k];
801:     xp = x + k*4;
802:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
803:     nz = ai[k+1] - ai[k];
804:     vj = aj + ai[k];
805:     xp = x + (*vj)*4;
806:     while (nz--) {
807:       /* xk += U(k,:)*x(:) */
808:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
809:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
810:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
811:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
812:       vj++;
813:       v += 16; xp = x + (*vj)*4;
814:     }
815:     xp = x + k*4;
816:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
817:   }

819:   VecRestoreArray(bb,&b);
820:   VecRestoreArray(xx,&x);
821:   PetscLogFlops(16*(2*a->nz + mbs));
822:   return(0);
823: }

827: PetscErrorCode MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
828: {
829:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
830:   IS             isrow=a->row;
831:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
833:   PetscInt       nz,*vj,k,*r,idx;
834:   MatScalar      *aa=a->a,*v,*diag;
835:   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;

838:   VecGetArray(bb,&b);
839:   VecGetArray(xx,&x);
840:   t  = a->solve_work;
841:   ISGetIndices(isrow,&r);

843:   /* solve U^T * D * y = b by forward substitution */
844:   tp = t;
845:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
846:     idx   = 3*r[k];
847:     tp[0] = b[idx];
848:     tp[1] = b[idx+1];
849:     tp[2] = b[idx+2];
850:     tp += 3;
851:   }
852: 
853:   for (k=0; k<mbs; k++){
854:     v  = aa + 9*ai[k];
855:     vj = aj + ai[k];
856:     tp = t + k*3;
857:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
858:     nz = ai[k+1] - ai[k];

860:     tp = t + (*vj)*3;
861:     while (nz--) {
862:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
863:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
864:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
865:       vj++; tp = t + (*vj)*3;
866:       v += 9;
867:     }

869:     /* xk = inv(Dk)*(Dk*xk) */
870:     diag  = aa+k*9;          /* ptr to inv(Dk) */
871:     tp    = t + k*3;
872:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
873:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
874:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
875:   }

877:   /* solve U*x = y by back substitution */
878:   for (k=mbs-1; k>=0; k--){
879:     v  = aa + 9*ai[k];
880:     vj = aj + ai[k];
881:     tp    = t + k*3;
882:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
883:     nz = ai[k+1] - ai[k];
884: 
885:     tp = t + (*vj)*3;
886:     while (nz--) {
887:       /* xk += U(k,:)*x(:) */
888:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
889:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
890:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
891:       vj++; tp = t + (*vj)*3;
892:       v += 9;
893:     }
894:     tp    = t + k*3;
895:     tp[0] = x0; tp[1] = x1; tp[2] = x2;
896:     idx      = 3*r[k];
897:     x[idx]   = x0;
898:     x[idx+1] = x1;
899:     x[idx+2] = x2;
900:   }

902:   ISRestoreIndices(isrow,&r);
903:   VecRestoreArray(bb,&b);
904:   VecRestoreArray(xx,&x);
905:   PetscLogFlops(9*(2*a->nz + mbs));
906:   return(0);
907: }

909: /*
910:    Special case where the matrix was factored in the natural ordering. 
911:    This eliminates the need for the column and row permutation.
912: */
915: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
916: {
917:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
918:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
919:   MatScalar      *aa=a->a,*v,*diag;
920:   PetscScalar    *x,*xp,*b,x0,x1,x2;
922:   PetscInt       nz,*vj,k;

925: 
926:   VecGetArray(bb,&b);
927:   VecGetArray(xx,&x);

929:   /* solve U^T * D * y = b by forward substitution */
930:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
931:   for (k=0; k<mbs; k++){
932:     v  = aa + 9*ai[k];
933:     xp = x + k*3;
934:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
935:     nz = ai[k+1] - ai[k];
936:     vj = aj + ai[k];
937:     xp = x + (*vj)*3;
938:     while (nz--) {
939:       /* x(:) += U(k,:)^T*(Dk*xk) */
940:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
941:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
942:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
943:       vj++; xp = x + (*vj)*3;
944:       v += 9;
945:     }
946:     /* xk = inv(Dk)*(Dk*xk) */
947:     diag = aa+k*9;          /* ptr to inv(Dk) */
948:     xp   = x + k*3;
949:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
950:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
951:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
952:   }

954:   /* solve U*x = y by back substitution */
955:   for (k=mbs-1; k>=0; k--){
956:     v  = aa + 9*ai[k];
957:     xp = x + k*3;
958:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
959:     nz = ai[k+1] - ai[k];
960:     vj = aj + ai[k];
961:     xp = x + (*vj)*3;
962:     while (nz--) {
963:       /* xk += U(k,:)*x(:) */
964:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
965:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
966:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
967:       vj++;
968:       v += 9; xp = x + (*vj)*3;
969:     }
970:     xp = x + k*3;
971:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
972:   }

974:   VecRestoreArray(bb,&b);
975:   VecRestoreArray(xx,&x);
976:   PetscLogFlops(9*(2*a->nz + mbs));
977:   return(0);
978: }

982: PetscErrorCode MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
983: {
984:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
985:   IS             isrow=a->row;
986:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
988:   PetscInt       nz,*vj,k,k2,*r,idx;
989:   MatScalar      *aa=a->a,*v,*diag;
990:   PetscScalar    *x,*b,x0,x1,*t;

993:   VecGetArray(bb,&b);
994:   VecGetArray(xx,&x);
995:   t  = a->solve_work;
996:   /* printf("called MatSolve_SeqSBAIJ_2\n"); */
997:   ISGetIndices(isrow,&r);

999:   /* solve U^T * D * y = perm(b) by forward substitution */
1000:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1001:     idx = 2*r[k];
1002:     t[k*2]   = b[idx];
1003:     t[k*2+1] = b[idx+1];
1004:   }
1005:   for (k=0; k<mbs; k++){
1006:     v  = aa + 4*ai[k];
1007:     vj = aj + ai[k];
1008:     k2 = k*2;
1009:     x0 = t[k2]; x1 = t[k2+1];
1010:     nz = ai[k+1] - ai[k];
1011:     while (nz--) {
1012:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1013:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1014:       vj++; v += 4;
1015:     }
1016:     diag = aa+k*4;  /* ptr to inv(Dk) */
1017:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1018:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1019:   }

1021:   /* solve U*x = y by back substitution */
1022:   for (k=mbs-1; k>=0; k--){
1023:     v  = aa + 4*ai[k];
1024:     vj = aj + ai[k];
1025:     k2 = k*2;
1026:     x0 = t[k2]; x1 = t[k2+1];
1027:     nz = ai[k+1] - ai[k];
1028:     while (nz--) {
1029:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1030:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1031:       vj++; v += 4;
1032:     }
1033:     t[k2]    = x0;
1034:     t[k2+1]  = x1;
1035:     idx      = 2*r[k];
1036:     x[idx]   = x0;
1037:     x[idx+1] = x1;
1038:   }

1040:   ISRestoreIndices(isrow,&r);
1041:   VecRestoreArray(bb,&b);
1042:   VecRestoreArray(xx,&x);
1043:   PetscLogFlops(4*(2*a->nz + mbs));
1044:   return(0);
1045: }

1047: /*
1048:    Special case where the matrix was factored in the natural ordering. 
1049:    This eliminates the need for the column and row permutation.
1050: */
1053: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1054: {
1055:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1056:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1057:   MatScalar      *aa=a->a,*v,*diag;
1058:   PetscScalar    *x,*b,x0,x1;
1060:   PetscInt       nz,*vj,k,k2;

1063: 
1064:   VecGetArray(bb,&b);
1065:   VecGetArray(xx,&x);

1067:   /* solve U^T * D * y = b by forward substitution */
1068:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1069:   for (k=0; k<mbs; k++){
1070:     v  = aa + 4*ai[k];
1071:     vj = aj + ai[k];
1072:     k2 = k*2;
1073:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1074:     nz = ai[k+1] - ai[k];
1075: 
1076:     while (nz--) {
1077:       /* x(:) += U(k,:)^T*(Dk*xk) */
1078:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1079:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1080:       vj++; v += 4;
1081:     }
1082:     /* xk = inv(Dk)*(Dk*xk) */
1083:     diag = aa+k*4;          /* ptr to inv(Dk) */
1084:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1085:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1086:   }

1088:   /* solve U*x = y by back substitution */
1089:   for (k=mbs-1; k>=0; k--){
1090:     v  = aa + 4*ai[k];
1091:     vj = aj + ai[k];
1092:     k2 = k*2;
1093:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1094:     nz = ai[k+1] - ai[k];
1095:     while (nz--) {
1096:       /* xk += U(k,:)*x(:) */
1097:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1098:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1099:       vj++; v += 4;
1100:     }
1101:     x[k2]     = x0;
1102:     x[k2+1]   = x1;
1103:   }

1105:   VecRestoreArray(bb,&b);
1106:   VecRestoreArray(xx,&x);
1107:   PetscLogFlops(4*(2*a->nz + mbs)); /* bs2*(2*a->nz + mbs) */
1108:   return(0);
1109: }

1113: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1114: {
1115:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1116:   IS             isrow=a->row;
1118:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*rip;
1119:   MatScalar      *aa=a->a,*v;
1120:   PetscScalar    *x,*b,xk,*t;
1121:   PetscInt       nz,*vj,k;

1124:   if (!mbs) return(0);

1126:   VecGetArray(bb,&b);
1127:   VecGetArray(xx,&x);
1128:   t    = a->solve_work;

1130:   ISGetIndices(isrow,&rip);
1131: 
1132:   /* solve U^T*D*y = perm(b) by forward substitution */
1133:   for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1134:   for (k=0; k<mbs; k++){
1135:     v  = aa + ai[k];
1136:     vj = aj + ai[k];
1137:     xk = t[k];
1138:     nz = ai[k+1] - ai[k];
1139:     while (nz--) t[*vj++] += (*v++) * xk;
1140:     t[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
1141:   }

1143:   /* solve U*x = y by back substitution */
1144:   for (k=mbs-1; k>=0; k--){
1145:     v  = aa + ai[k];
1146:     vj = aj + ai[k];
1147:     xk = t[k];
1148:     nz = ai[k+1] - ai[k];
1149:     while (nz--) xk += (*v++) * t[*vj++];
1150:     t[k]      = xk;
1151:     x[rip[k]] = xk;
1152:   }

1154:   ISRestoreIndices(isrow,&rip);
1155:   VecRestoreArray(bb,&b);
1156:   VecRestoreArray(xx,&x);
1157:   PetscLogFlops(4*a->nz + A->m);
1158:   return(0);
1159: }

1163: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1164: {
1165:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;

1169:   if (A->bs == 1) {
1170:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1171:   } else {
1172:     IS              isrow=a->row;
1173:     PetscInt             mbs=a->mbs,*ai=a->i,*aj=a->j,*rip,i;
1174:     MatScalar       *aa=a->a,*v;
1175:     PetscScalar     *x,*b,*t;
1176:     PetscInt             nz,*vj,k,n;
1177:     if (bb->n > a->solves_work_n) {
1178:       if (a->solves_work) {PetscFree(a->solves_work);}
1179:       PetscMalloc(bb->n*A->m*sizeof(PetscScalar),&a->solves_work);
1180:       a->solves_work_n = bb->n;
1181:     }
1182:     n    = bb->n;
1183:     VecGetArray(bb->v,&b);
1184:     VecGetArray(xx->v,&x);
1185:     t    = a->solves_work;

1187:     ISGetIndices(isrow,&rip);
1188: 
1189:     /* solve U^T*D*y = perm(b) by forward substitution */
1190:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rip[k]+i*mbs];} /* values are stored interlaced in t */
1191:     for (k=0; k<mbs; k++){
1192:       v  = aa + ai[k];
1193:       vj = aj + ai[k];
1194:       nz = ai[k+1] - ai[k];
1195:       while (nz--) {
1196:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1197:         v++;vj++;
1198:       }
1199:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1200:     }
1201: 
1202:     /* solve U*x = y by back substitution */
1203:     for (k=mbs-1; k>=0; k--){
1204:       v  = aa + ai[k];
1205:       vj = aj + ai[k];
1206:       nz = ai[k+1] - ai[k];
1207:       while (nz--) {
1208:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1209:         v++;vj++;
1210:       }
1211:       for (i=0; i<n; i++) x[rip[k]+i*mbs] = t[n*k+i];
1212:     }

1214:     ISRestoreIndices(isrow,&rip);
1215:     VecRestoreArray(bb->v,&b);
1216:     VecRestoreArray(xx->v,&x);
1217:     PetscLogFlops(bb->n*(4*a->nz + A->m));
1218:   }
1219:   return(0);
1220: }

1222: /*
1223:       Special case where the matrix was ILU(0) factored in the natural
1224:    ordering. This eliminates the need for the column and row permutation.
1225: */
1228: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1229: {
1230:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1232:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1233:   MatScalar      *aa=a->a,*v;
1234:   PetscScalar    *x,*b,xk;
1235:   PetscInt       nz,*vj,k;

1238:   VecGetArray(bb,&b);
1239:   VecGetArray(xx,&x);
1240: 
1241:   /* solve U^T*D*y = b by forward substitution */
1242:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1243:   for (k=0; k<mbs; k++){
1244:     v  = aa + ai[k] + 1;
1245:     vj = aj + ai[k] + 1;
1246:     xk = x[k];
1247:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1248:     while (nz--) x[*vj++] += (*v++) * xk;
1249:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
1250:   }

1252:   /* solve U*x = y by back substitution */
1253:   for (k=mbs-2; k>=0; k--){
1254:     v  = aa + ai[k] + 1;
1255:     vj = aj + ai[k] + 1;
1256:     xk = x[k];
1257:     nz = ai[k+1] - ai[k] - 1;
1258:     while (nz--) xk += (*v++) * x[*vj++];
1259:     x[k] = xk;
1260:   }

1262:   VecRestoreArray(bb,&b);
1263:   VecRestoreArray(xx,&x);
1264:   PetscLogFlops(4*a->nz + A->m);
1265:   return(0);
1266: }

1268: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1271: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
1272: {
1273:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
1275:   PetscInt       *rip,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1276:   PetscInt       *jutmp,bs = A->bs,bs2=a->bs2;
1277:   PetscInt       m,reallocs = 0,*levtmp;
1278:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd,*jl;
1279:   PetscInt       incrlev,*lev,shift,prow,nz;
1280:   PetscInt       *il,ili,nextprow;
1281:   PetscReal      f = info->fill,levels = info->levels;
1282:   PetscTruth     perm_identity;

1285:   /* check whether perm is the identity mapping */
1286:   ISIdentity(perm,&perm_identity);

1288:   /* special case that simply copies fill pattern */
1289:   if (!levels && perm_identity && bs==1) {
1290:     MatDuplicate_SeqSBAIJ(A,MAT_DO_NOT_COPY_VALUES,B);
1291:     (*B)->factor    = FACTOR_CHOLESKY;
1292:     b               = (Mat_SeqSBAIJ*)(*B)->data;
1293:     b->row          = perm;
1294:     b->icol         = perm;
1295:     b->factor_damping   = info->damping;
1296:     b->factor_shift     = info->shift;
1297:     b->factor_zeropivot = info->zeropivot;
1298:     PetscObjectReference((PetscObject)perm);
1299:     PetscObjectReference((PetscObject)perm);
1300:     PetscMalloc(((*B)->m+1)*sizeof(PetscScalar),&b->solve_work);
1301:     (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1302:     (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1303:     return(0);
1304:   }

1306:   /* --- inplace icc(levels), levels>0, ie, *B has same data structure as A --- */
1307:   if (levels > 0 && perm_identity && bs==1 ){
1308:     if (!perm_identity) a->permute = PETSC_TRUE;
1309: 
1310:   ISGetIndices(perm,&rip);
1311: 
1312:   if (perm_identity){ /* without permutation */
1313:     ai = a->i; aj = a->j;
1314:   } else {            /* non-trivial permutation */
1315:     MatReorderingSeqSBAIJ(A,perm);
1316:     ai = a->inew; aj = a->jnew;
1317:   }
1318: 
1319:   /* initialization */
1320:   PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
1321:   umax  = (PetscInt)(f*ai[mbs] + 1);
1322:   PetscMalloc(umax*sizeof(PetscInt),&lev);
1323:   PetscMalloc(umax*sizeof(PetscInt),&ju);
1324:   iu[0] = 0;
1325:   juidx = 0; /* index for ju */
1326:   PetscMalloc((4*mbs+1)*sizeof(PetscInt),&jl); /* linked list for getting pivot row */
1327:   q      = jl + mbs;   /* linked list for col index of active row */
1328:   levtmp = q + mbs;
1329:   il     = levtmp + mbs;
1330:   for (i=0; i<mbs; i++){
1331:     jl[i] = mbs;
1332:     q[i]  = 0;
1333:     il[i] = 0;
1334:   }

1336:   /* for each row k */
1337:   for (k=0; k<mbs; k++){
1338:     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
1339:     q[k] = mbs;
1340:     /* initialize nonzero structure of k-th row to row rip[k] of A */
1341:     jmin = ai[rip[k]] +1; /* exclude diag[k] */
1342:     jmax = ai[rip[k]+1];
1343:     for (j=jmin; j<jmax; j++){
1344:       vj = rip[aj[j]]; /* col. value */
1345:       if(vj > k){
1346:         qm = k;
1347:         do {
1348:           m  = qm; qm = q[m];
1349:         } while(qm < vj);
1350:         if (qm == vj) {
1351:           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
1352:         }
1353:         nzk++;
1354:         q[m]  = vj;
1355:         q[vj] = qm;
1356:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1357:       } /* if(vj > k) */
1358:     } /* for (j=jmin; j<jmax; j++) */

1360:     /* modify nonzero structure of k-th row by computing fill-in
1361:        for each row i to be merged in */
1362:     prow = k;
1363:     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
1364: 
1365:     while (prow < k){
1366:       nextprow = jl[prow];
1367:       /* merge row prow into k-th row */
1368:       ili  = il[prow];
1369:       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
1370:       jmax = iu[prow+1];
1371:       qm   = k;
1372:       for (j=jmin; j<jmax; j++){
1373:         vj = ju[j];
1374:         incrlev = lev[j] + 1;
1375:         if (incrlev > levels) continue;
1376:         do {
1377:           m = qm; qm = q[m];
1378:         } while (qm < vj);
1379:         if (qm != vj){  /* a fill */
1380:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1381:           levtmp[vj]  = incrlev;
1382:         } else {
1383:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1384:         }
1385:       }
1386:       if (jmin < jmax){ /* update il and jl */
1387:         il[prow] = jmin;
1388:         j = ju[jmin];
1389:         jl[prow] = jl[j]; jl[j] = prow;
1390:       }
1391:       prow = nextprow;
1392:     }
1393: 
1394:     /* add the first nonzero element in U(k, k+1:mbs-1) to jl */
1395:     if (nzk > 0){
1396:       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1397:       jl[k] = jl[i]; jl[i] = k;
1398:       il[k] = iu[k] + 1;
1399:     }
1400:     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */

1402:     /* allocate more space to ju if needed */
1403:     if (iu[k+1] > umax) {
1404:       /* estimate how much additional space we will need */
1405:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1406:       /* just double the memory each time */
1407:       maxadd = umax;
1408:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1409:       umax += maxadd;

1411:       /* allocate a longer ju */
1412:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1413:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
1414:       PetscFree(ju);
1415:       ju   = jutmp;

1417:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1418:       PetscMemcpy(jutmp,lev,(iu[k])*sizeof(PetscInt));
1419:       PetscFree(lev);
1420:       lev      = jutmp;
1421:       reallocs += 2; /* count how many times we realloc */
1422:     }

1424:     /* save nonzero structure of k-th row in ju */
1425:     ju[juidx]  = k; /* diag[k] */
1426:     lev[juidx] = 0;
1427:     juidx++;
1428:     i = k;
1429:     while (nzk --) {
1430:       i           = q[i];
1431:       ju[juidx] = i;
1432:       lev[juidx] = levtmp[i];
1433:       juidx++;
1434:     }
1435:   } /* end of for (k=0; k<mbs; k++) */

1437:   if (ai[mbs] != 0) {
1438:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1439:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
1440:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
1441:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
1442:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
1443:   } else {
1444:      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1445:   }

1447:   ISRestoreIndices(perm,&rip);
1448:   PetscFree(jl);
1449:   PetscFree(lev);

1451:   /* put together the new matrix */
1452:   MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);
1453:   MatSetType(*B,A->type_name);
1454:   MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);

1456:   /* PetscLogObjectParent(*B,iperm); */
1457:   b = (Mat_SeqSBAIJ*)(*B)->data;
1458:   PetscFree(b->imax);
1459:   b->singlemalloc = PETSC_FALSE;
1460:   /* the next line frees the default space generated by the Create() */
1461:   PetscFree(b->a);
1462:   PetscFree(b->ilen);
1463:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1464:   b->j    = ju;
1465:   b->i    = iu;
1466:   b->diag = 0;
1467:   b->ilen = 0;
1468:   b->imax = 0;
1469:   b->row  = perm;
1470:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1471:   PetscObjectReference((PetscObject)perm);
1472:   b->icol = perm;
1473:   PetscObjectReference((PetscObject)perm);
1474:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1475:   /* In b structure:  Free imax, ilen, old a, old j.  
1476:      Allocate idnew, solve_work, new a, new j */
1477:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
1478:   b->maxnz          = b->nz = iu[mbs];
1479:   b->factor_damping   = info->damping;
1480:   b->factor_shift     = info->shift;
1481:   b->factor_zeropivot = info->zeropivot;

1483:   (*B)->factor                 = FACTOR_CHOLESKY;
1484:   (*B)->info.factor_mallocs    = reallocs;
1485:   (*B)->info.fill_ratio_given  = f;
1486:   if (ai[mbs] != 0) {
1487:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1488:   } else {
1489:     (*B)->info.fill_ratio_needed = 0.0;
1490:   }


1493:   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1494:   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1495:   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
1496: 
1497:   return(0);
1498:   } /* end of if (levels > 0 && perm_identity && bs==1 ) */

1500:   if (!perm_identity) a->permute = PETSC_TRUE;
1501:   if (perm_identity){
1502:     ai = a->i; aj = a->j;
1503:   } else { /*  non-trivial permutation */
1504:     MatReorderingSeqSBAIJ(A, perm);
1505:     ai = a->inew; aj = a->jnew;
1506:   }
1507: 
1508:   /* initialization */
1509:   ISGetIndices(perm,&rip);
1510:   umax  = (PetscInt)(f*ai[mbs] + 1);
1511:   PetscMalloc(umax*sizeof(PetscInt),&lev);
1512:   umax += mbs + 1;
1513:   shift = mbs + 1;
1514:   PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
1515:   PetscMalloc(umax*sizeof(PetscInt),&ju);
1516:   iu[0] = mbs + 1;
1517:   juidx = mbs + 1;
1518:   /* prowl: linked list for pivot row */
1519:   PetscMalloc((3*mbs+1)*sizeof(PetscInt),&prowl);
1520:   /* q: linked list for col index */
1521:   q       = prowl + mbs;
1522:   levtmp  = q     + mbs;
1523: 
1524:   for (i=0; i<mbs; i++){
1525:     prowl[i] = mbs;
1526:     q[i] = 0;
1527:   }

1529:   /* for each row k */
1530:   for (k=0; k<mbs; k++){
1531:     nzk  = 0;
1532:     q[k] = mbs;
1533:     /* copy current row into linked list */
1534:     nz = ai[rip[k]+1] - ai[rip[k]];
1535:     j = ai[rip[k]];
1536:     while (nz--){
1537:       vj = rip[aj[j++]];
1538:       if (vj > k){
1539:         qm = k;
1540:         do {
1541:           m  = qm; qm = q[m];
1542:         } while(qm < vj);
1543:         if (qm == vj) {
1544:           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
1545:         }
1546:         nzk++;
1547:         q[m]       = vj;
1548:         q[vj]      = qm;
1549:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1550:       }
1551:     }

1553:     /* modify nonzero structure of k-th row by computing fill-in
1554:        for each row prow to be merged in */
1555:     prow = k;
1556:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1557: 
1558:     while (prow < k){
1559:       /* merge row prow into k-th row */
1560:       jmin = iu[prow] + 1;
1561:       jmax = iu[prow+1];
1562:       qm = k;
1563:       for (j=jmin; j<jmax; j++){
1564:         incrlev = lev[j-shift] + 1;
1565:         if (incrlev > levels) continue;

1567:         vj      = ju[j];
1568:         do {
1569:           m = qm; qm = q[m];
1570:         } while (qm < vj);
1571:         if (qm != vj){      /* a fill */
1572:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1573:           levtmp[vj] = incrlev;
1574:         } else {
1575:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1576:         }
1577:       }
1578:       prow = prowl[prow]; /* next pivot row */
1579:     }
1580: 
1581:     /* add k to row list for first nonzero element in k-th row */
1582:     if (nzk > 1){
1583:       i = q[k]; /* col value of first nonzero element in k_th row of U */
1584:       prowl[k] = prowl[i]; prowl[i] = k;
1585:     }
1586:     iu[k+1] = iu[k] + nzk;

1588:     /* allocate more space to ju and lev if needed */
1589:     if (iu[k+1] > umax) {
1590:       /* estimate how much additional space we will need */
1591:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1592:       /* just double the memory each time */
1593:       maxadd = umax;
1594:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1595:       umax += maxadd;

1597:       /* allocate a longer ju */
1598:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1599:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
1600:       PetscFree(ju);
1601:       ju       = jutmp;

1603:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1604:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
1605:       PetscFree(lev);
1606:       lev      = jutmp;
1607:       reallocs += 2; /* count how many times we realloc */
1608:     }

1610:     /* save nonzero structure of k-th row in ju */
1611:     i=k;
1612:     while (nzk --) {
1613:       i                = q[i];
1614:       ju[juidx]        = i;
1615:       lev[juidx-shift] = levtmp[i];
1616:       juidx++;
1617:     }
1618:   }
1619: 
1620:   if (ai[mbs] != 0) {
1621:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1622:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocs,f,af);
1623:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Run with -pc_icc_fill %g or use \n",af);
1624:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:PCICCSetFill(pc,%g);\n",af);
1625:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:for best performance.\n");
1626:   } else {
1627:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1628:   }

1630:   ISRestoreIndices(perm,&rip);
1631:   PetscFree(prowl);
1632:   PetscFree(lev);

1634:   /* put together the new matrix */
1635:   MatCreate(A->comm,bs*mbs,bs*mbs,bs*mbs,bs*mbs,B);
1636:   MatSetType(*B,A->type_name);
1637:   MatSeqSBAIJSetPreallocation(*B,bs,0,PETSC_NULL);

1639:   /* PetscLogObjectParent(*B,iperm); */
1640:   b    = (Mat_SeqSBAIJ*)(*B)->data;
1641:   PetscFree(b->imax);
1642:   b->singlemalloc = PETSC_FALSE;
1643:   /* the next line frees the default space generated by the Create() */
1644:   PetscFree(b->a);
1645:   PetscFree(b->ilen);
1646:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1647:   b->j    = ju;
1648:   b->i    = iu;
1649:   b->diag = 0;
1650:   b->ilen = 0;
1651:   b->imax = 0;
1652: 
1653:   if (b->row) {
1654:     ISDestroy(b->row);
1655:   }
1656:   if (b->icol) {
1657:     ISDestroy(b->icol);
1658:   }
1659:   b->row  = perm;
1660:   b->icol = perm;
1661:   PetscObjectReference((PetscObject)perm);
1662:   PetscObjectReference((PetscObject)perm);
1663:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1664:   /* In b structure:  Free imax, ilen, old a, old j.  
1665:      Allocate idnew, solve_work, new a, new j */
1666:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
1667:   b->maxnz = b->nz = iu[mbs];
1668: 
1669:   (*B)->factor                 = FACTOR_CHOLESKY;
1670:   (*B)->info.factor_mallocs    = reallocs;
1671:   (*B)->info.fill_ratio_given  = f;
1672:   if (ai[mbs] != 0) {
1673:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1674:   } else {
1675:     (*B)->info.fill_ratio_needed = 0.0;
1676:   }

1678:   if (perm_identity){
1679:     switch (bs) {
1680:       case 1:
1681:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1682:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1683:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1684:         (*B)->ops->solves                = MatSolves_SeqSBAIJ_1;
1685:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJl:Using special in-place natural ordering factor and solve BS=1\n");
1686:         break;
1687:       case 2:
1688:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1689:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1690:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1691:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
1692:         break;
1693:       case 3:
1694:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1695:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1696:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1697:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
1698:         break;
1699:       case 4:
1700:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1701:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1702:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1703:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
1704:         break;
1705:       case 5:
1706:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1707:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1708:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1709:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
1710:         break;
1711:       case 6:
1712:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1713:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1714:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1715:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
1716:         break;
1717:       case 7:
1718:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1719:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1720:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1721:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
1722:       break;
1723:       default:
1724:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1725:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1726:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1727:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
1728:       break;
1729:     }
1730:   }

1732:   return(0);
1733: }