Actual source code: sbaijfact2.c

  1: /*$Id: sbaijfact2.c,v 1.2.1.41 2001/08/07 03:03:01 balay Exp $*/
  2: /*
  3:     Factorization code for SBAIJ format. 
  4: */

 6:  #include src/mat/impls/sbaij/seq/sbaij.h
 7:  #include src/mat/impls/baij/seq/baij.h
 8:  #include src/vec/vecimpl.h
 9:  #include src/inline/ilu.h
 10:  #include src/inline/dot.h

 14: int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 15: {
 17:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 18: }

 22: int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 23: {
 25:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 26: }

 30: int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
 31: {
 33:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 34: }

 38: int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
 39: {
 41:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 42: }

 46: int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
 47: {
 49:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 50: }

 54: int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
 55: {
 57:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 58: }

 62: int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
 63: {
 65:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 66: }

 70: int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
 71: {
 73:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 74: }

 78: int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
 79: {
 81:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 82: }

 86: int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
 87: {
 89:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 90: }

 94: int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
 95: {
 97:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
 98: }

102: int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
103: {
105:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
106: }

110: int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
111: {
113:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
114: }

118: int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
119: {
121:   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
122: }

126: int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
127: {
128:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
129:   IS              isrow=a->row;
130:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
131:   int             nz,*vj,k,ierr,*r,idx,k1;
132:   int             bs=a->bs,bs2 = a->bs2;
133:   MatScalar       *aa=a->a,*v,*diag;
134:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;

137:   VecGetArray(bb,&b);
138:   VecGetArray(xx,&x);
139:   t  = a->solve_work;
140:   ISGetIndices(isrow,&r);
141:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

143:   /* solve U^T * D * y = b by forward substitution */
144:   xk = t;
145:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
146:     idx   = bs*r[k];
147:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
148:   }
149:   for (k=0; k<mbs; k++){
150:     v  = aa + bs2*ai[k];
151:     xk = t + k*bs;      /* Dk*xk = k-th block of x */
152:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
153:     nz = ai[k+1] - ai[k];
154:     vj = aj + ai[k];
155:     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
156:     while (nz--) {
157:       /* x(:) += U(k,:)^T*(Dk*xk) */
158:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
159:       vj++; xj = t + (*vj)*bs;
160:       v += bs2;
161:     }
162:     /* xk = inv(Dk)*(Dk*xk) */
163:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
164:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
165:   }

167:   /* solve U*x = y by back substitution */
168:   for (k=mbs-1; k>=0; k--){
169:     v  = aa + bs2*ai[k];
170:     xk = t + k*bs;        /* xk */
171:     nz = ai[k+1] - ai[k];
172:     vj = aj + ai[k];
173:     xj = t + (*vj)*bs;
174:     while (nz--) {
175:       /* xk += U(k,:)*x(:) */
176:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
177:       vj++;
178:       v += bs2; xj = t + (*vj)*bs;
179:     }
180:     idx   = bs*r[k];
181:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
182:   }

184:   PetscFree(xk_tmp);
185:   VecRestoreArray(bb,&b);
186:   VecRestoreArray(xx,&x);
187:   PetscLogFlops(bs2*(2*a->s_nz + mbs));
188:   return(0);
189: }

193: int MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
194: {
195:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
196:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
197:   int             nz,*vj,k,ierr;
198:   int             bs=a->bs,bs2 = a->bs2;
199:   MatScalar       *aa=a->a,*v,*diag;
200:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp;

203: 
204:   VecGetArray(bb,&b);
205:   VecGetArray(xx,&x);

207:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

209:   /* solve U^T * D * y = b by forward substitution */
210:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
211:   for (k=0; k<mbs; k++){
212:     v  = aa + bs2*ai[k];
213:     xk = x + k*bs;      /* Dk*xk = k-th block of x */
214:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
215:     nz = ai[k+1] - ai[k];
216:     vj = aj + ai[k];
217:     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
218:     while (nz--) {
219:       /* x(:) += U(k,:)^T*(Dk*xk) */
220:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
221:       vj++; xj = x + (*vj)*bs;
222:       v += bs2;
223:     }
224:     /* xk = inv(Dk)*(Dk*xk) */
225:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
226:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
227:   }

229:   /* solve U*x = y by back substitution */
230:   for (k=mbs-1; k>=0; k--){
231:     v  = aa + bs2*ai[k];
232:     xk = x + k*bs;        /* xk */
233:     nz = ai[k+1] - ai[k];
234:     vj = aj + ai[k];
235:     xj = x + (*vj)*bs;
236:     while (nz--) {
237:       /* xk += U(k,:)*x(:) */
238:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
239:       vj++;
240:       v += bs2; xj = x + (*vj)*bs;
241:     }
242:   }

244:   PetscFree(xk_tmp);
245:   VecRestoreArray(bb,&b);
246:   VecRestoreArray(xx,&x);
247:   PetscLogFlops(bs2*(2*a->s_nz + mbs));
248:   return(0);
249: }

253: int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
254: {
255:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
256:   IS              isrow=a->row;
257:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
258:   int             nz,*vj,k,ierr,*r,idx;
259:   MatScalar       *aa=a->a,*v,*d;
260:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

263:   VecGetArray(bb,&b);
264:   VecGetArray(xx,&x);
265:   t  = a->solve_work;
266:   ISGetIndices(isrow,&r);

268:   /* solve U^T * D * y = b by forward substitution */
269:   tp = t;
270:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
271:     idx   = 7*r[k];
272:     tp[0] = b[idx];
273:     tp[1] = b[idx+1];
274:     tp[2] = b[idx+2];
275:     tp[3] = b[idx+3];
276:     tp[4] = b[idx+4];
277:     tp[5] = b[idx+5];
278:     tp[6] = b[idx+6];
279:     tp += 7;
280:   }
281: 
282:   for (k=0; k<mbs; k++){
283:     v  = aa + 49*ai[k];
284:     vj = aj + ai[k];
285:     tp = t + k*7;
286:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
287:     nz = ai[k+1] - ai[k];
288:     tp = t + (*vj)*7;
289:     while (nz--) {
290:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
291:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
292:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
293:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
294:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
295:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
296:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
297:       vj++; tp = t + (*vj)*7;
298:       v += 49;
299:     }

301:     /* xk = inv(Dk)*(Dk*xk) */
302:     d  = aa+k*49;          /* ptr to inv(Dk) */
303:     tp    = t + k*7;
304:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
305:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
306:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
307:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
308:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
309:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
310:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
311:   }

313:   /* solve U*x = y by back substitution */
314:   for (k=mbs-1; k>=0; k--){
315:     v  = aa + 49*ai[k];
316:     vj = aj + ai[k];
317:     tp    = t + k*7;
318:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
319:     nz = ai[k+1] - ai[k];
320: 
321:     tp = t + (*vj)*7;
322:     while (nz--) {
323:       /* xk += U(k,:)*x(:) */
324:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
325:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
326:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
327:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
328:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
329:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
330:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
331:       vj++; tp = t + (*vj)*7;
332:       v += 49;
333:     }
334:     tp    = t + k*7;
335:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
336:     idx   = 7*r[k];
337:     x[idx]     = x0;
338:     x[idx+1]   = x1;
339:     x[idx+2]   = x2;
340:     x[idx+3]   = x3;
341:     x[idx+4]   = x4;
342:     x[idx+5]   = x5;
343:     x[idx+6]   = x6;
344:   }

346:   VecRestoreArray(bb,&b);
347:   VecRestoreArray(xx,&x);
348:   PetscLogFlops(49*(2*a->s_nz + mbs));
349:   return(0);
350: }

354: int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
355: {
356:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
357:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
358:   MatScalar       *aa=a->a,*v,*d;
359:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
360:   int             nz,*vj,k,ierr;

363: 
364:   VecGetArray(bb,&b);
365:   VecGetArray(xx,&x);
366: 
367:   /* solve U^T * D * y = b by forward substitution */
368:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
369:   for (k=0; k<mbs; k++){
370:     v  = aa + 49*ai[k];
371:     xp = x + k*7;
372:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
373:     nz = ai[k+1] - ai[k];
374:     vj = aj + ai[k];
375:     xp = x + (*vj)*7;
376:     while (nz--) {
377:       /* x(:) += U(k,:)^T*(Dk*xk) */
378:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
379:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
380:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
381:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
382:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
383:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
384:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
385:       vj++; xp = x + (*vj)*7;
386:       v += 49;
387:     }
388:     /* xk = inv(Dk)*(Dk*xk) */
389:     d  = aa+k*49;          /* ptr to inv(Dk) */
390:     xp = x + k*7;
391:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
392:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
393:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
394:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
395:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
396:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
397:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
398:   }

400:   /* solve U*x = y by back substitution */
401:   for (k=mbs-1; k>=0; k--){
402:     v  = aa + 49*ai[k];
403:     xp = x + k*7;
404:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
405:     nz = ai[k+1] - ai[k];
406:     vj = aj + ai[k];
407:     xp = x + (*vj)*7;
408:     while (nz--) {
409:       /* xk += U(k,:)*x(:) */
410:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
411:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
412:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
413:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
414:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
415:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
416:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
417:       vj++;
418:       v += 49; xp = x + (*vj)*7;
419:     }
420:     xp = x + k*7;
421:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
422:   }

424:   VecRestoreArray(bb,&b);
425:   VecRestoreArray(xx,&x);
426:   PetscLogFlops(49*(2*a->s_nz + mbs));
427:   return(0);
428: }

432: int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
433: {
434:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
435:   IS              isrow=a->row;
436:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
437:   int             nz,*vj,k,ierr,*r,idx;
438:   MatScalar       *aa=a->a,*v,*d;
439:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

442:   VecGetArray(bb,&b);
443:   VecGetArray(xx,&x);
444:   t  = a->solve_work;
445:   ISGetIndices(isrow,&r);

447:   /* solve U^T * D * y = b by forward substitution */
448:   tp = t;
449:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
450:     idx   = 6*r[k];
451:     tp[0] = b[idx];
452:     tp[1] = b[idx+1];
453:     tp[2] = b[idx+2];
454:     tp[3] = b[idx+3];
455:     tp[4] = b[idx+4];
456:     tp[5] = b[idx+5];
457:     tp += 6;
458:   }
459: 
460:   for (k=0; k<mbs; k++){
461:     v  = aa + 36*ai[k];
462:     vj = aj + ai[k];
463:     tp = t + k*6;
464:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
465:     nz = ai[k+1] - ai[k];
466:     tp = t + (*vj)*6;
467:     while (nz--) {
468:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
469:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
470:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
471:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
472:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
473:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
474:       vj++; tp = t + (*vj)*6;
475:       v += 36;
476:     }

478:     /* xk = inv(Dk)*(Dk*xk) */
479:     d  = aa+k*36;          /* ptr to inv(Dk) */
480:     tp    = t + k*6;
481:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
482:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
483:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
484:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
485:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
486:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
487:   }

489:   /* solve U*x = y by back substitution */
490:   for (k=mbs-1; k>=0; k--){
491:     v  = aa + 36*ai[k];
492:     vj = aj + ai[k];
493:     tp    = t + k*6;
494:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
495:     nz = ai[k+1] - ai[k];
496: 
497:     tp = t + (*vj)*6;
498:     while (nz--) {
499:       /* xk += U(k,:)*x(:) */
500:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
501:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
502:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
503:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
504:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
505:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
506:       vj++; tp = t + (*vj)*6;
507:       v += 36;
508:     }
509:     tp    = t + k*6;
510:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
511:     idx   = 6*r[k];
512:     x[idx]     = x0;
513:     x[idx+1]   = x1;
514:     x[idx+2]   = x2;
515:     x[idx+3]   = x3;
516:     x[idx+4]   = x4;
517:     x[idx+5]   = x5;
518:   }

520:   VecRestoreArray(bb,&b);
521:   VecRestoreArray(xx,&x);
522:   PetscLogFlops(36*(2*a->s_nz + mbs));
523:   return(0);
524: }

528: int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
529: {
530:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
531:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
532:   MatScalar       *aa=a->a,*v,*d;
533:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4,x5;
534:   int             nz,*vj,k,ierr;

537: 
538:   VecGetArray(bb,&b);
539:   VecGetArray(xx,&x);
540: 
541:   /* solve U^T * D * y = b by forward substitution */
542:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
543:   for (k=0; k<mbs; k++){
544:     v  = aa + 36*ai[k];
545:     xp = x + k*6;
546:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
547:     nz = ai[k+1] - ai[k];
548:     vj = aj + ai[k];
549:     xp = x + (*vj)*6;
550:     while (nz--) {
551:       /* x(:) += U(k,:)^T*(Dk*xk) */
552:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
553:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
554:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
555:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
556:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
557:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
558:       vj++; xp = x + (*vj)*6;
559:       v += 36;
560:     }
561:     /* xk = inv(Dk)*(Dk*xk) */
562:     d  = aa+k*36;          /* ptr to inv(Dk) */
563:     xp = x + k*6;
564:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
565:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
566:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
567:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
568:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
569:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
570:   }

572:   /* solve U*x = y by back substitution */
573:   for (k=mbs-1; k>=0; k--){
574:     v  = aa + 36*ai[k];
575:     xp = x + k*6;
576:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
577:     nz = ai[k+1] - ai[k];
578:     vj = aj + ai[k];
579:     xp = x + (*vj)*6;
580:     while (nz--) {
581:       /* xk += U(k,:)*x(:) */
582:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
583:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
584:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
585:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
586:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
587:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
588:       vj++;
589:       v += 36; xp = x + (*vj)*6;
590:     }
591:     xp = x + k*6;
592:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
593:   }

595:   VecRestoreArray(bb,&b);
596:   VecRestoreArray(xx,&x);
597:   PetscLogFlops(36*(2*a->s_nz + mbs));
598:   return(0);
599: }

603: int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
604: {
605:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
606:   IS              isrow=a->row;
607:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
608:   int             nz,*vj,k,ierr,*r,idx;
609:   MatScalar       *aa=a->a,*v,*diag;
610:   PetscScalar     *x,*b,x0,x1,x2,x3,x4,*t,*tp;

613:   VecGetArray(bb,&b);
614:   VecGetArray(xx,&x);
615:   t  = a->solve_work;
616:   ISGetIndices(isrow,&r);

618:   /* solve U^T * D * y = b by forward substitution */
619:   tp = t;
620:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
621:     idx   = 5*r[k];
622:     tp[0] = b[idx];
623:     tp[1] = b[idx+1];
624:     tp[2] = b[idx+2];
625:     tp[3] = b[idx+3];
626:     tp[4] = b[idx+4];
627:     tp += 5;
628:   }
629: 
630:   for (k=0; k<mbs; k++){
631:     v  = aa + 25*ai[k];
632:     vj = aj + ai[k];
633:     tp = t + k*5;
634:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
635:     nz = ai[k+1] - ai[k];

637:     tp = t + (*vj)*5;
638:     while (nz--) {
639:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
640:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
641:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
642:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
643:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
644:       vj++; tp = t + (*vj)*5;
645:       v += 25;
646:     }

648:     /* xk = inv(Dk)*(Dk*xk) */
649:     diag  = aa+k*25;          /* ptr to inv(Dk) */
650:     tp    = t + k*5;
651:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
652:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
653:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
654:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
655:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
656:   }

658:   /* solve U*x = y by back substitution */
659:   for (k=mbs-1; k>=0; k--){
660:     v  = aa + 25*ai[k];
661:     vj = aj + ai[k];
662:     tp    = t + k*5;
663:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
664:     nz = ai[k+1] - ai[k];
665: 
666:     tp = t + (*vj)*5;
667:     while (nz--) {
668:       /* xk += U(k,:)*x(:) */
669:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
670:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
671:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
672:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
673:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
674:       vj++; tp = t + (*vj)*5;
675:       v += 25;
676:     }
677:     tp    = t + k*5;
678:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
679:     idx   = 5*r[k];
680:     x[idx]     = x0;
681:     x[idx+1]   = x1;
682:     x[idx+2]   = x2;
683:     x[idx+3]   = x3;
684:     x[idx+4]   = x4;
685:   }

687:   VecRestoreArray(bb,&b);
688:   VecRestoreArray(xx,&x);
689:   PetscLogFlops(25*(2*a->s_nz + mbs));
690:   return(0);
691: }

695: int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
696: {
697:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
698:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
699:   MatScalar       *aa=a->a,*v,*diag;
700:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4;
701:   int             nz,*vj,k,ierr;

704: 
705:   VecGetArray(bb,&b);
706:   VecGetArray(xx,&x);

708:   /* solve U^T * D * y = b by forward substitution */
709:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
710:   for (k=0; k<mbs; k++){
711:     v  = aa + 25*ai[k];
712:     xp = x + k*5;
713:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
714:     nz = ai[k+1] - ai[k];
715:     vj = aj + ai[k];
716:     xp = x + (*vj)*5;
717:     while (nz--) {
718:       /* x(:) += U(k,:)^T*(Dk*xk) */
719:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
720:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
721:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
722:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
723:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
724:       vj++; xp = x + (*vj)*5;
725:       v += 25;
726:     }
727:     /* xk = inv(Dk)*(Dk*xk) */
728:     diag = aa+k*25;          /* ptr to inv(Dk) */
729:     xp   = x + k*5;
730:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
731:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
732:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
733:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
734:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
735:   }

737:   /* solve U*x = y by back substitution */
738:   for (k=mbs-1; k>=0; k--){
739:     v  = aa + 25*ai[k];
740:     xp = x + k*5;
741:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
742:     nz = ai[k+1] - ai[k];
743:     vj = aj + ai[k];
744:     xp = x + (*vj)*5;
745:     while (nz--) {
746:       /* xk += U(k,:)*x(:) */
747:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
748:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
749:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
750:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
751:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
752:       vj++;
753:       v += 25; xp = x + (*vj)*5;
754:     }
755:     xp = x + k*5;
756:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
757:   }

759:   VecRestoreArray(bb,&b);
760:   VecRestoreArray(xx,&x);
761:   PetscLogFlops(25*(2*a->s_nz + mbs));
762:   return(0);
763: }

767: int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
768: {
769:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
770:   IS              isrow=a->row;
771:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
772:   int             nz,*vj,k,ierr,*r,idx;
773:   MatScalar       *aa=a->a,*v,*diag;
774:   PetscScalar     *x,*b,x0,x1,x2,x3,*t,*tp;

777:   VecGetArray(bb,&b);
778:   VecGetArray(xx,&x);
779:   t  = a->solve_work;
780:   ISGetIndices(isrow,&r);

782:   /* solve U^T * D * y = b by forward substitution */
783:   tp = t;
784:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
785:     idx   = 4*r[k];
786:     tp[0] = b[idx];
787:     tp[1] = b[idx+1];
788:     tp[2] = b[idx+2];
789:     tp[3] = b[idx+3];
790:     tp += 4;
791:   }
792: 
793:   for (k=0; k<mbs; k++){
794:     v  = aa + 16*ai[k];
795:     vj = aj + ai[k];
796:     tp = t + k*4;
797:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
798:     nz = ai[k+1] - ai[k];

800:     tp = t + (*vj)*4;
801:     while (nz--) {
802:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
803:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
804:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
805:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
806:       vj++; tp = t + (*vj)*4;
807:       v += 16;
808:     }

810:     /* xk = inv(Dk)*(Dk*xk) */
811:     diag  = aa+k*16;          /* ptr to inv(Dk) */
812:     tp    = t + k*4;
813:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
814:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
815:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
816:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
817:   }

819:   /* solve U*x = y by back substitution */
820:   for (k=mbs-1; k>=0; k--){
821:     v  = aa + 16*ai[k];
822:     vj = aj + ai[k];
823:     tp    = t + k*4;
824:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
825:     nz = ai[k+1] - ai[k];
826: 
827:     tp = t + (*vj)*4;
828:     while (nz--) {
829:       /* xk += U(k,:)*x(:) */
830:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
831:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
832:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
833:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
834:       vj++; tp = t + (*vj)*4;
835:       v += 16;
836:     }
837:     tp    = t + k*4;
838:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
839:     idx        = 4*r[k];
840:     x[idx]     = x0;
841:     x[idx+1]   = x1;
842:     x[idx+2]   = x2;
843:     x[idx+3]   = x3;
844:   }

846:   VecRestoreArray(bb,&b);
847:   VecRestoreArray(xx,&x);
848:   PetscLogFlops(16*(2*a->s_nz + mbs));
849:   return(0);
850: }

852: /*
853:    Special case where the matrix was factored in the natural ordering. 
854:    This eliminates the need for the column and row permutation.
855: */
858: int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
859: {
860:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
861:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
862:   MatScalar       *aa=a->a,*v,*diag;
863:   PetscScalar     *x,*xp,*b,x0,x1,x2,x3;
864:   int             nz,*vj,k,ierr;

867: 
868:   VecGetArray(bb,&b);
869:   VecGetArray(xx,&x);

871:   /* solve U^T * D * y = b by forward substitution */
872:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
873:   for (k=0; k<mbs; k++){
874:     v  = aa + 16*ai[k];
875:     xp = x + k*4;
876:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
877:     nz = ai[k+1] - ai[k];
878:     vj = aj + ai[k];
879:     xp = x + (*vj)*4;
880:     while (nz--) {
881:       /* x(:) += U(k,:)^T*(Dk*xk) */
882:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
883:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
884:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
885:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
886:       vj++; xp = x + (*vj)*4;
887:       v += 16;
888:     }
889:     /* xk = inv(Dk)*(Dk*xk) */
890:     diag = aa+k*16;          /* ptr to inv(Dk) */
891:     xp   = x + k*4;
892:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
893:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
894:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
895:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
896:   }

898:   /* solve U*x = y by back substitution */
899:   for (k=mbs-1; k>=0; k--){
900:     v  = aa + 16*ai[k];
901:     xp = x + k*4;
902:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
903:     nz = ai[k+1] - ai[k];
904:     vj = aj + ai[k];
905:     xp = x + (*vj)*4;
906:     while (nz--) {
907:       /* xk += U(k,:)*x(:) */
908:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
909:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
910:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
911:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
912:       vj++;
913:       v += 16; xp = x + (*vj)*4;
914:     }
915:     xp = x + k*4;
916:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
917:   }

919:   VecRestoreArray(bb,&b);
920:   VecRestoreArray(xx,&x);
921:   PetscLogFlops(16*(2*a->s_nz + mbs));
922:   return(0);
923: }

927: int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
928: {
929:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
930:   IS              isrow=a->row;
931:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
932:   int             nz,*vj,k,ierr,*r,idx;
933:   MatScalar       *aa=a->a,*v,*diag;
934:   PetscScalar     *x,*b,x0,x1,x2,*t,*tp;

937:   VecGetArray(bb,&b);
938:   VecGetArray(xx,&x);
939:   t  = a->solve_work;
940:   ISGetIndices(isrow,&r);

942:   /* solve U^T * D * y = b by forward substitution */
943:   tp = t;
944:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
945:     idx   = 3*r[k];
946:     tp[0] = b[idx];
947:     tp[1] = b[idx+1];
948:     tp[2] = b[idx+2];
949:     tp += 3;
950:   }
951: 
952:   for (k=0; k<mbs; k++){
953:     v  = aa + 9*ai[k];
954:     vj = aj + ai[k];
955:     tp = t + k*3;
956:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
957:     nz = ai[k+1] - ai[k];

959:     tp = t + (*vj)*3;
960:     while (nz--) {
961:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
962:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
963:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
964:       vj++; tp = t + (*vj)*3;
965:       v += 9;
966:     }

968:     /* xk = inv(Dk)*(Dk*xk) */
969:     diag  = aa+k*9;          /* ptr to inv(Dk) */
970:     tp    = t + k*3;
971:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
972:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
973:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
974:   }

976:   /* solve U*x = y by back substitution */
977:   for (k=mbs-1; k>=0; k--){
978:     v  = aa + 9*ai[k];
979:     vj = aj + ai[k];
980:     tp    = t + k*3;
981:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
982:     nz = ai[k+1] - ai[k];
983: 
984:     tp = t + (*vj)*3;
985:     while (nz--) {
986:       /* xk += U(k,:)*x(:) */
987:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
988:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
989:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
990:       vj++; tp = t + (*vj)*3;
991:       v += 9;
992:     }
993:     tp    = t + k*3;
994:     tp[0] = x0; tp[1] = x1; tp[2] = x2;
995:     idx      = 3*r[k];
996:     x[idx]   = x0;
997:     x[idx+1] = x1;
998:     x[idx+2] = x2;
999:   }

1001:   VecRestoreArray(bb,&b);
1002:   VecRestoreArray(xx,&x);
1003:   PetscLogFlops(9*(2*a->s_nz + mbs));
1004:   return(0);
1005: }

1007: /*
1008:    Special case where the matrix was factored in the natural ordering. 
1009:    This eliminates the need for the column and row permutation.
1010: */
1013: int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1014: {
1015:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
1016:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1017:   MatScalar       *aa=a->a,*v,*diag;
1018:   PetscScalar     *x,*xp,*b,x0,x1,x2;
1019:   int             nz,*vj,k,ierr;

1022: 
1023:   VecGetArray(bb,&b);
1024:   VecGetArray(xx,&x);

1026:   /* solve U^T * D * y = b by forward substitution */
1027:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1028:   for (k=0; k<mbs; k++){
1029:     v  = aa + 9*ai[k];
1030:     xp = x + k*3;
1031:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1032:     nz = ai[k+1] - ai[k];
1033:     vj = aj + ai[k];
1034:     xp = x + (*vj)*3;
1035:     while (nz--) {
1036:       /* x(:) += U(k,:)^T*(Dk*xk) */
1037:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1038:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1039:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1040:       vj++; xp = x + (*vj)*3;
1041:       v += 9;
1042:     }
1043:     /* xk = inv(Dk)*(Dk*xk) */
1044:     diag = aa+k*9;          /* ptr to inv(Dk) */
1045:     xp   = x + k*3;
1046:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1047:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1048:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1049:   }

1051:   /* solve U*x = y by back substitution */
1052:   for (k=mbs-1; k>=0; k--){
1053:     v  = aa + 9*ai[k];
1054:     xp = x + k*3;
1055:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1056:     nz = ai[k+1] - ai[k];
1057:     vj = aj + ai[k];
1058:     xp = x + (*vj)*3;
1059:     while (nz--) {
1060:       /* xk += U(k,:)*x(:) */
1061:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1062:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1063:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1064:       vj++;
1065:       v += 9; xp = x + (*vj)*3;
1066:     }
1067:     xp = x + k*3;
1068:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1069:   }

1071:   VecRestoreArray(bb,&b);
1072:   VecRestoreArray(xx,&x);
1073:   PetscLogFlops(9*(2*a->s_nz + mbs));
1074:   return(0);
1075: }

1079: int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1080: {
1081:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ *)A->data;
1082:   IS              isrow=a->row;
1083:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1084:   int             nz,*vj,k,k2,ierr,*r,idx;
1085:   MatScalar       *aa=a->a,*v,*diag;
1086:   PetscScalar     *x,*b,x0,x1,*t;

1089:   VecGetArray(bb,&b);
1090:   VecGetArray(xx,&x);
1091:   t  = a->solve_work;
1092:   /* printf("called MatSolve_SeqSBAIJ_2\n"); */
1093:   ISGetIndices(isrow,&r);

1095:   /* solve U^T * D * y = perm(b) by forward substitution */
1096:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1097:     idx = 2*r[k];
1098:     t[k*2]   = b[idx];
1099:     t[k*2+1] = b[idx+1];
1100:   }
1101:   for (k=0; k<mbs; k++){
1102:     v  = aa + 4*ai[k];
1103:     vj = aj + ai[k];
1104:     k2 = k*2;
1105:     x0 = t[k2]; x1 = t[k2+1];
1106:     nz = ai[k+1] - ai[k];
1107:     while (nz--) {
1108:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1109:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1110:       vj++; v += 4;
1111:     }
1112:     diag = aa+k*4;  /* ptr to inv(Dk) */
1113:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1114:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1115:   }

1117:   /* solve U*x = y by back substitution */
1118:   for (k=mbs-1; k>=0; k--){
1119:     v  = aa + 4*ai[k];
1120:     vj = aj + ai[k];
1121:     k2 = k*2;
1122:     x0 = t[k2]; x1 = t[k2+1];
1123:     nz = ai[k+1] - ai[k];
1124:     while (nz--) {
1125:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1126:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1127:       vj++; v += 4;
1128:     }
1129:     t[k2]    = x0;
1130:     t[k2+1]  = x1;
1131:     idx      = 2*r[k];
1132:     x[idx]   = x0;
1133:     x[idx+1] = x1;
1134:   }

1136:   ISRestoreIndices(isrow,&r);
1137:   VecRestoreArray(bb,&b);
1138:   VecRestoreArray(xx,&x);
1139:   PetscLogFlops(4*(2*a->s_nz + mbs));
1140:   return(0);
1141: }

1143: /*
1144:    Special case where the matrix was factored in the natural ordering. 
1145:    This eliminates the need for the column and row permutation.
1146: */
1149: int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1150: {
1151:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
1152:   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1153:   MatScalar       *aa=a->a,*v,*diag;
1154:   PetscScalar     *x,*b,x0,x1;
1155:   int             nz,*vj,k,k2,ierr;

1158: 
1159:   VecGetArray(bb,&b);
1160:   VecGetArray(xx,&x);

1162:   /* solve U^T * D * y = b by forward substitution */
1163:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1164:   for (k=0; k<mbs; k++){
1165:     v  = aa + 4*ai[k];
1166:     vj = aj + ai[k];
1167:     k2 = k*2;
1168:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1169:     nz = ai[k+1] - ai[k];
1170: 
1171:     while (nz--) {
1172:       /* x(:) += U(k,:)^T*(Dk*xk) */
1173:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1174:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1175:       vj++; v += 4;
1176:     }
1177:     /* xk = inv(Dk)*(Dk*xk) */
1178:     diag = aa+k*4;          /* ptr to inv(Dk) */
1179:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1180:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1181:   }

1183:   /* solve U*x = y by back substitution */
1184:   for (k=mbs-1; k>=0; k--){
1185:     v  = aa + 4*ai[k];
1186:     vj = aj + ai[k];
1187:     k2 = k*2;
1188:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1189:     nz = ai[k+1] - ai[k];
1190:     while (nz--) {
1191:       /* xk += U(k,:)*x(:) */
1192:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1193:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1194:       vj++; v += 4;
1195:     }
1196:     x[k2]     = x0;
1197:     x[k2+1]   = x1;
1198:   }

1200:   VecRestoreArray(bb,&b);
1201:   VecRestoreArray(xx,&x);
1202:   PetscLogFlops(4*(2*a->s_nz + mbs)); /* bs2*(2*a->s_nz + mbs) */
1203:   return(0);
1204: }

1208: int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1209: {
1210:   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ *)A->data;
1211:   IS              isrow=a->row;
1212:   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip;
1213:   MatScalar       *aa=a->a,*v;
1214:   PetscScalar     *x,*b,xk,*t;
1215:   int             nz,*vj,k;

1218:   if (!mbs) return(0);

1220:   VecGetArray(bb,&b);
1221:   VecGetArray(xx,&x);
1222:   t    = a->solve_work;

1224:   ISGetIndices(isrow,&rip);
1225: 
1226:   /* solve U^T*D*y = perm(b) by forward substitution */
1227:   for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1228:   for (k=0; k<mbs; k++){
1229:     v  = aa + ai[k];
1230:     vj = aj + ai[k];
1231:     xk = t[k];
1232:     nz = ai[k+1] - ai[k];
1233:     while (nz--) t[*vj++] += (*v++) * xk;
1234:     t[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
1235:   }

1237:   /* solve U*x = y by back substitution */
1238:   for (k=mbs-1; k>=0; k--){
1239:     v  = aa + ai[k];
1240:     vj = aj + ai[k];
1241:     xk = t[k];
1242:     nz = ai[k+1] - ai[k];
1243:     while (nz--) xk += (*v++) * t[*vj++];
1244:     t[k]      = xk;
1245:     x[rip[k]] = xk;
1246:   }

1248:   ISRestoreIndices(isrow,&rip);
1249:   VecRestoreArray(bb,&b);
1250:   VecRestoreArray(xx,&x);
1251:   PetscLogFlops(4*a->s_nz + A->m);
1252:   return(0);
1253: }

1257: int MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1258: {
1259:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1260:   int          ierr;

1263:   if (a->bs == 1) {
1264:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1265:   } else {
1266:     IS              isrow=a->row;
1267:     int             mbs=a->mbs,*ai=a->i,*aj=a->j,*rip,i;
1268:     MatScalar       *aa=a->a,*v;
1269:     PetscScalar     *x,*b,*t;
1270:     int             nz,*vj,k,n;
1271:     if (bb->n > a->solves_work_n) {
1272:       if (a->solves_work) {PetscFree(a->solves_work);}
1273:       PetscMalloc(bb->n*A->m*sizeof(PetscScalar),&a->solves_work);
1274:       a->solves_work_n = bb->n;
1275:     }
1276:     n    = bb->n;
1277:     VecGetArray(bb->v,&b);
1278:     VecGetArray(xx->v,&x);
1279:     t    = a->solves_work;

1281:     ISGetIndices(isrow,&rip);
1282: 
1283:     /* solve U^T*D*y = perm(b) by forward substitution */
1284:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rip[k]+i*mbs];} /* values are stored interlaced in t */
1285:     for (k=0; k<mbs; k++){
1286:       v  = aa + ai[k];
1287:       vj = aj + ai[k];
1288:       nz = ai[k+1] - ai[k];
1289:       while (nz--) {
1290:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1291:         v++;vj++;
1292:       }
1293:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1294:     }
1295: 
1296:     /* solve U*x = y by back substitution */
1297:     for (k=mbs-1; k>=0; k--){
1298:       v  = aa + ai[k];
1299:       vj = aj + ai[k];
1300:       nz = ai[k+1] - ai[k];
1301:       while (nz--) {
1302:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1303:         v++;vj++;
1304:       }
1305:       for (i=0; i<n; i++) x[rip[k]+i*mbs] = t[n*k+i];
1306:     }

1308:     ISRestoreIndices(isrow,&rip);
1309:     VecRestoreArray(bb->v,&b);
1310:     VecRestoreArray(xx->v,&x);
1311:     PetscLogFlops(bb->n*(4*a->s_nz + A->m));
1312:   }
1313:   return(0);
1314: }

1316: /*
1317:       Special case where the matrix was ILU(0) factored in the natural
1318:    ordering. This eliminates the need for the column and row permutation.
1319: */
1322: int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1323: {
1324:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1325:   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr;
1326:   MatScalar       *aa=a->a,*v;
1327:   PetscScalar     *x,*b,xk;
1328:   int             nz,*vj,k;

1331:   VecGetArray(bb,&b);
1332:   VecGetArray(xx,&x);
1333: 
1334:   /* solve U^T*D*y = b by forward substitution */
1335:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1336:   for (k=0; k<mbs; k++){
1337:     v  = aa + ai[k] + 1;
1338:     vj = aj + ai[k] + 1;
1339:     xk = x[k];
1340:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1341:     while (nz--) x[*vj++] += (*v++) * xk;
1342:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
1343:   }

1345:   /* solve U*x = y by back substitution */
1346:   for (k=mbs-2; k>=0; k--){
1347:     v  = aa + ai[k] + 1;
1348:     vj = aj + ai[k] + 1;
1349:     xk = x[k];
1350:     nz = ai[k+1] - ai[k] - 1;
1351:     while (nz--) xk += (*v++) * x[*vj++];
1352:     x[k] = xk;
1353:   }

1355:   VecRestoreArray(bb,&b);
1356:   VecRestoreArray(xx,&x);
1357:   PetscLogFlops(4*a->s_nz + A->m);
1358:   return(0);
1359: }

1361: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1364: int MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *B)
1365: {
1366:   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1367:   int         *rip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1368:   int         *jutmp,bs = a->bs,bs2=a->bs2;
1369:   int         m,realloc = 0,*levtmp;
1370:   int         *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd,*jl;
1371:   int         incrlev,*lev,shift,prow,nz;
1372:   int         *il,ili,nextprow;
1373:   PetscReal   f = info->fill,levels = info->levels;
1374:   PetscTruth  perm_identity;

1377:   /* check whether perm is the identity mapping */
1378:   ISIdentity(perm,&perm_identity);

1380:   /* special case that simply copies fill pattern */
1381:   if (!levels && perm_identity && bs==1) {
1382:     MatDuplicate_SeqSBAIJ(A,MAT_DO_NOT_COPY_VALUES,B);
1383:     (*B)->factor    = FACTOR_CHOLESKY;
1384:     b               = (Mat_SeqSBAIJ*)(*B)->data;
1385:     b->row          = perm;
1386:     b->icol         = perm;
1387:     b->factor_damping   = info->damping;
1388:     b->factor_shift     = info->shift;
1389:     b->factor_zeropivot = info->zeropivot;
1390:     PetscObjectReference((PetscObject)perm);
1391:     PetscObjectReference((PetscObject)perm);
1392:     PetscMalloc(((*B)->m+1)*sizeof(PetscScalar),&b->solve_work);
1393:     (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1394:     (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1395:     return(0);
1396:   }

1398:   /* --- inplace icc(levels), levels>0, ie, *B has same data structure as A --- */
1399:   if (levels > 0 && perm_identity && bs==1 ){
1400:     if (!perm_identity) a->permute = PETSC_TRUE;
1401: 
1402:   ISGetIndices(perm,&rip);
1403: 
1404:   if (perm_identity){ /* without permutation */
1405:     ai = a->i; aj = a->j;
1406:   } else {            /* non-trivial permutation */
1407:     MatReorderingSeqSBAIJ(A,perm);
1408:     ai = a->inew; aj = a->jnew;
1409:   }
1410: 
1411:   /* initialization */
1412:   PetscMalloc((mbs+1)*sizeof(int),&iu);
1413:   umax  = (int)(f*ai[mbs] + 1);
1414:   PetscMalloc(umax*sizeof(int),&lev);
1415:   PetscMalloc(umax*sizeof(int),&ju);
1416:   iu[0] = 0;
1417:   juidx = 0; /* index for ju */
1418:   PetscMalloc((4*mbs+1)*sizeof(int),&jl); /* linked list for getting pivot row */
1419:   q      = jl + mbs;   /* linked list for col index of active row */
1420:   levtmp = q + mbs;
1421:   il     = levtmp + mbs;
1422:   for (i=0; i<mbs; i++){
1423:     jl[i] = mbs;
1424:     q[i]  = 0;
1425:     il[i] = 0;
1426:   }

1428:   /* for each row k */
1429:   for (k=0; k<mbs; k++){
1430:     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
1431:     q[k] = mbs;
1432:     /* initialize nonzero structure of k-th row to row rip[k] of A */
1433:     jmin = ai[rip[k]] +1; /* exclude diag[k] */
1434:     jmax = ai[rip[k]+1];
1435:     for (j=jmin; j<jmax; j++){
1436:       vj = rip[aj[j]]; /* col. value */
1437:       if(vj > k){
1438:         qm = k;
1439:         do {
1440:           m  = qm; qm = q[m];
1441:         } while(qm < vj);
1442:         if (qm == vj) {
1443:           SETERRQ(1," error: duplicate entry in A\n");
1444:         }
1445:         nzk++;
1446:         q[m]  = vj;
1447:         q[vj] = qm;
1448:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1449:       } /* if(vj > k) */
1450:     } /* for (j=jmin; j<jmax; j++) */

1452:     /* modify nonzero structure of k-th row by computing fill-in
1453:        for each row i to be merged in */
1454:     prow = k;
1455:     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
1456: 
1457:     while (prow < k){
1458:       nextprow = jl[prow];
1459:       /* merge row prow into k-th row */
1460:       ili  = il[prow];
1461:       jmin = ili + 1;  /* points to 2nd nzero entry in U(prow,k:mbs-1) */
1462:       jmax = iu[prow+1];
1463:       qm   = k;
1464:       for (j=jmin; j<jmax; j++){
1465:         vj = ju[j];
1466:         incrlev = lev[j] + 1;
1467:         if (incrlev > levels) continue;
1468:         do {
1469:           m = qm; qm = q[m];
1470:         } while (qm < vj);
1471:         if (qm != vj){  /* a fill */
1472:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1473:           levtmp[vj]  = incrlev;
1474:         } else {
1475:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1476:         }
1477:       }
1478:       if (jmin < jmax){ /* update il and jl */
1479:         il[prow] = jmin;
1480:         j = ju[jmin];
1481:         jl[prow] = jl[j]; jl[j] = prow;
1482:       }
1483:       prow = nextprow;
1484:     }
1485: 
1486:     /* add the first nonzero element in U(k, k+1:mbs-1) to jl */
1487:     if (nzk > 0){
1488:       i = q[k]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1489:       jl[k] = jl[i]; jl[i] = k;
1490:       il[k] = iu[k] + 1;
1491:     }
1492:     iu[k+1] = iu[k] + nzk + 1;  /* include diag[k] */

1494:     /* allocate more space to ju if needed */
1495:     if (iu[k+1] > umax) {
1496:       /* estimate how much additional space we will need */
1497:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1498:       /* just double the memory each time */
1499:       maxadd = umax;
1500:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1501:       umax += maxadd;

1503:       /* allocate a longer ju */
1504:       PetscMalloc(umax*sizeof(int),&jutmp);
1505:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1506:       PetscFree(ju);
1507:       ju   = jutmp;

1509:       PetscMalloc(umax*sizeof(int),&jutmp);
1510:       PetscMemcpy(jutmp,lev,(iu[k])*sizeof(int));
1511:       PetscFree(lev);
1512:       lev      = jutmp;
1513:       realloc += 2; /* count how many times we realloc */
1514:     }

1516:     /* save nonzero structure of k-th row in ju */
1517:     ju[juidx]  = k; /* diag[k] */
1518:     lev[juidx] = 0;
1519:     juidx++;
1520:     i = k;
1521:     while (nzk --) {
1522:       i           = q[i];
1523:       ju[juidx] = i;
1524:       lev[juidx] = levtmp[i];
1525:       juidx++;
1526:     }
1527:   } /* end of for (k=0; k<mbs; k++) */

1529:   if (ai[mbs] != 0) {
1530:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1531:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1532:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Run with -pc_cholesky_fill %g or use \n",af);
1533:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:PCCholeskySetFill(pc,%g);\n",af);
1534:     PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:for best performance.\n");
1535:   } else {
1536:      PetscLogInfo(A,"MatCholeskyFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1537:   }

1539:   ISRestoreIndices(perm,&rip);
1540:   PetscFree(jl);
1541:   PetscFree(lev);

1543:   /* put together the new matrix */
1544:   MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);
1545:   /* PetscLogObjectParent(*B,iperm); */
1546:   b = (Mat_SeqSBAIJ*)(*B)->data;
1547:   PetscFree(b->imax);
1548:   b->singlemalloc = PETSC_FALSE;
1549:   /* the next line frees the default space generated by the Create() */
1550:   PetscFree(b->a);
1551:   PetscFree(b->ilen);
1552:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1553:   b->j    = ju;
1554:   b->i    = iu;
1555:   b->diag = 0;
1556:   b->ilen = 0;
1557:   b->imax = 0;
1558:   b->row  = perm;
1559:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1560:   PetscObjectReference((PetscObject)perm);
1561:   b->icol = perm;
1562:   PetscObjectReference((PetscObject)perm);
1563:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1564:   /* In b structure:  Free imax, ilen, old a, old j.  
1565:      Allocate idnew, solve_work, new a, new j */
1566:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1567:   b->s_maxnz          = b->s_nz = iu[mbs];
1568:   b->factor_damping   = info->damping;
1569:   b->factor_shift     = info->shift;
1570:   b->factor_zeropivot = info->zeropivot;

1572:   (*B)->factor                 = FACTOR_CHOLESKY;
1573:   (*B)->info.factor_mallocs    = realloc;
1574:   (*B)->info.fill_ratio_given  = f;
1575:   if (ai[mbs] != 0) {
1576:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1577:   } else {
1578:     (*B)->info.fill_ratio_needed = 0.0;
1579:   }


1582:   (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1583:   (*B)->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1584:   PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
1585: 
1586:   return(0);
1587:   } /* end of if (levels > 0 && perm_identity && bs==1 ) */

1589:   if (!perm_identity) a->permute = PETSC_TRUE;
1590:   if (perm_identity){
1591:     ai = a->i; aj = a->j;
1592:   } else { /*  non-trivial permutation */
1593:     MatReorderingSeqSBAIJ(A, perm);
1594:     ai = a->inew; aj = a->jnew;
1595:   }
1596: 
1597:   /* initialization */
1598:   ISGetIndices(perm,&rip);
1599:   umax  = (int)(f*ai[mbs] + 1);
1600:   PetscMalloc(umax*sizeof(int),&lev);
1601:   umax += mbs + 1;
1602:   shift = mbs + 1;
1603:   PetscMalloc((mbs+1)*sizeof(int),&iu);
1604:   PetscMalloc(umax*sizeof(int),&ju);
1605:   iu[0] = mbs + 1;
1606:   juidx = mbs + 1;
1607:   /* prowl: linked list for pivot row */
1608:   PetscMalloc((3*mbs+1)*sizeof(int),&prowl);
1609:   /* q: linked list for col index */
1610:   q       = prowl + mbs;
1611:   levtmp  = q     + mbs;
1612: 
1613:   for (i=0; i<mbs; i++){
1614:     prowl[i] = mbs;
1615:     q[i] = 0;
1616:   }

1618:   /* for each row k */
1619:   for (k=0; k<mbs; k++){
1620:     nzk  = 0;
1621:     q[k] = mbs;
1622:     /* copy current row into linked list */
1623:     nz = ai[rip[k]+1] - ai[rip[k]];
1624:     j = ai[rip[k]];
1625:     while (nz--){
1626:       vj = rip[aj[j++]];
1627:       if (vj > k){
1628:         qm = k;
1629:         do {
1630:           m  = qm; qm = q[m];
1631:         } while(qm < vj);
1632:         if (qm == vj) {
1633:           SETERRQ(1," error: duplicate entry in A\n");
1634:         }
1635:         nzk++;
1636:         q[m]       = vj;
1637:         q[vj]      = qm;
1638:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1639:       }
1640:     }

1642:     /* modify nonzero structure of k-th row by computing fill-in
1643:        for each row prow to be merged in */
1644:     prow = k;
1645:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1646: 
1647:     while (prow < k){
1648:       /* merge row prow into k-th row */
1649:       jmin = iu[prow] + 1;
1650:       jmax = iu[prow+1];
1651:       qm = k;
1652:       for (j=jmin; j<jmax; j++){
1653:         incrlev = lev[j-shift] + 1;
1654:         if (incrlev > levels) continue;

1656:         vj      = ju[j];
1657:         do {
1658:           m = qm; qm = q[m];
1659:         } while (qm < vj);
1660:         if (qm != vj){      /* a fill */
1661:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1662:           levtmp[vj] = incrlev;
1663:         } else {
1664:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1665:         }
1666:       }
1667:       prow = prowl[prow]; /* next pivot row */
1668:     }
1669: 
1670:     /* add k to row list for first nonzero element in k-th row */
1671:     if (nzk > 1){
1672:       i = q[k]; /* col value of first nonzero element in k_th row of U */
1673:       prowl[k] = prowl[i]; prowl[i] = k;
1674:     }
1675:     iu[k+1] = iu[k] + nzk;

1677:     /* allocate more space to ju and lev if needed */
1678:     if (iu[k+1] > umax) {
1679:       /* estimate how much additional space we will need */
1680:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1681:       /* just double the memory each time */
1682:       maxadd = umax;
1683:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1684:       umax += maxadd;

1686:       /* allocate a longer ju */
1687:       PetscMalloc(umax*sizeof(int),&jutmp);
1688:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));
1689:       PetscFree(ju);
1690:       ju       = jutmp;

1692:       PetscMalloc(umax*sizeof(int),&jutmp);
1693:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));
1694:       PetscFree(lev);
1695:       lev      = jutmp;
1696:       realloc += 2; /* count how many times we realloc */
1697:     }

1699:     /* save nonzero structure of k-th row in ju */
1700:     i=k;
1701:     while (nzk --) {
1702:       i                = q[i];
1703:       ju[juidx]        = i;
1704:       lev[juidx-shift] = levtmp[i];
1705:       juidx++;
1706:     }
1707:   }
1708: 
1709:   if (ai[mbs] != 0) {
1710:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1711:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1712:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Run with -pc_icc_fill %g or use \n",af);
1713:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:PCICCSetFill(pc,%g);\n",af);
1714:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:for best performance.\n");
1715:   } else {
1716:     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1717:   }

1719:   ISRestoreIndices(perm,&rip);
1720:   PetscFree(prowl);
1721:   PetscFree(lev);

1723:   /* put together the new matrix */
1724:   MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);
1725:   /* PetscLogObjectParent(*B,iperm); */
1726:   b    = (Mat_SeqSBAIJ*)(*B)->data;
1727:   PetscFree(b->imax);
1728:   b->singlemalloc = PETSC_FALSE;
1729:   /* the next line frees the default space generated by the Create() */
1730:   PetscFree(b->a);
1731:   PetscFree(b->ilen);
1732:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1733:   b->j    = ju;
1734:   b->i    = iu;
1735:   b->diag = 0;
1736:   b->ilen = 0;
1737:   b->imax = 0;
1738: 
1739:   if (b->row) {
1740:     ISDestroy(b->row);
1741:   }
1742:   if (b->icol) {
1743:     ISDestroy(b->icol);
1744:   }
1745:   b->row  = perm;
1746:   b->icol = perm;
1747:   PetscObjectReference((PetscObject)perm);
1748:   PetscObjectReference((PetscObject)perm);
1749:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1750:   /* In b structure:  Free imax, ilen, old a, old j.  
1751:      Allocate idnew, solve_work, new a, new j */
1752:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1753:   b->s_maxnz = b->s_nz = iu[mbs];
1754: 
1755:   (*B)->factor                 = FACTOR_CHOLESKY;
1756:   (*B)->info.factor_mallocs    = realloc;
1757:   (*B)->info.fill_ratio_given  = f;
1758:   if (ai[mbs] != 0) {
1759:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1760:   } else {
1761:     (*B)->info.fill_ratio_needed = 0.0;
1762:   }

1764:   if (perm_identity){
1765:     switch (bs) {
1766:       case 1:
1767:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1768:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1769:         (*B)->ops->solves                = MatSolves_SeqSBAIJ_1;
1770:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJl:Using special in-place natural ordering factor and solve BS=1\n");
1771:         break;
1772:       case 2:
1773:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1774:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1775:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
1776:         break;
1777:       case 3:
1778:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1779:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1780:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
1781:         break;
1782:       case 4:
1783:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1784:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1785:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
1786:         break;
1787:       case 5:
1788:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1789:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1790:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
1791:         break;
1792:       case 6:
1793:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1794:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1795:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
1796:         break;
1797:       case 7:
1798:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1799:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1800:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
1801:       break;
1802:       default:
1803:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1804:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1805:         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
1806:       break;
1807:     }
1808:   }

1810:   return(0);
1811: }