Actual source code: baijfact2.c

  1: /*$Id: baijfact2.c,v 1.52 2001/04/13 20:48:17 bsmith Exp $*/
  2: /*
  3:     Factorization code for BAIJ format. 
  4: */

 6:  #include src/mat/impls/baij/seq/baij.h
 7:  #include src/vec/vecimpl.h
 8:  #include src/inline/ilu.h
 9:  #include src/inline/dot.h

 11: int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 12: {
 13:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 14:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 15:   int             *diag = a->diag;
 16:   MatScalar       *aa=a->a,*v;
 17:   Scalar          s1,*x,*b;

 20:   VecGetArray(bb,&b);
 21:   VecGetArray(xx,&x);

 23:   /* forward solve the U^T */
 24:   for (i=0; i<n; i++) {

 26:     v     = aa + diag[i];
 27:     /* multiply by the inverse of the block diagonal */
 28:     s1    = (*v++)*b[i];
 29:     vi    = aj + diag[i] + 1;
 30:     nz    = ai[i+1] - diag[i] - 1;
 31:     while (nz--) {
 32:       x[*vi++]  -= (*v++)*s1;
 33:     }
 34:     x[i]   = s1;
 35:   }
 36:   /* backward solve the L^T */
 37:   for (i=n-1; i>=0; i--){
 38:     v    = aa + diag[i] - 1;
 39:     vi   = aj + diag[i] - 1;
 40:     nz   = diag[i] - ai[i];
 41:     s1   = x[i];
 42:     while (nz--) {
 43:       x[*vi--]   -=  (*v--)*s1;
 44:     }
 45:   }
 46:   VecRestoreArray(bb,&b);
 47:   VecRestoreArray(xx,&x);
 48:   PetscLogFlops(2*(a->nz) - A->n);
 49:   return(0);
 50: }

 52: int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 53: {
 54:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 55:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 56:   int             *diag = a->diag,oidx;
 57:   MatScalar       *aa=a->a,*v;
 58:   Scalar          s1,s2,x1,x2;
 59:   Scalar          *x,*b;

 62:   VecGetArray(bb,&b);
 63:   VecGetArray(xx,&x);

 65:   /* forward solve the U^T */
 66:   idx = 0;
 67:   for (i=0; i<n; i++) {

 69:     v     = aa + 4*diag[i];
 70:     /* multiply by the inverse of the block diagonal */
 71:     x1 = b[idx];   x2 = b[1+idx];
 72:     s1 = v[0]*x1  +  v[1]*x2;
 73:     s2 = v[2]*x1  +  v[3]*x2;
 74:     v += 4;

 76:     vi    = aj + diag[i] + 1;
 77:     nz    = ai[i+1] - diag[i] - 1;
 78:     while (nz--) {
 79:       oidx = 2*(*vi++);
 80:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 81:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 82:       v  += 4;
 83:     }
 84:     x[idx]   = s1;x[1+idx] = s2;
 85:     idx += 2;
 86:   }
 87:   /* backward solve the L^T */
 88:   for (i=n-1; i>=0; i--){
 89:     v    = aa + 4*diag[i] - 4;
 90:     vi   = aj + diag[i] - 1;
 91:     nz   = diag[i] - ai[i];
 92:     idt  = 2*i;
 93:     s1   = x[idt];  s2 = x[1+idt];
 94:     while (nz--) {
 95:       idx   = 2*(*vi--);
 96:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
 97:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
 98:       v -= 4;
 99:     }
100:   }
101:   VecRestoreArray(bb,&b);
102:   VecRestoreArray(xx,&x);
103:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
104:   return(0);
105: }

107: int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
108: {
109:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
110:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
111:   int             *diag = a->diag,oidx;
112:   MatScalar       *aa=a->a,*v;
113:   Scalar          s1,s2,s3,x1,x2,x3;
114:   Scalar          *x,*b;

117:   VecGetArray(bb,&b);
118:   VecGetArray(xx,&x);

120:   /* forward solve the U^T */
121:   idx = 0;
122:   for (i=0; i<n; i++) {

124:     v     = aa + 9*diag[i];
125:     /* multiply by the inverse of the block diagonal */
126:     x1 = b[idx];   x2 = b[1+idx]; x3    = b[2+idx];
127:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
128:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
129:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
130:     v += 9;

132:     vi    = aj + diag[i] + 1;
133:     nz    = ai[i+1] - diag[i] - 1;
134:     while (nz--) {
135:       oidx = 3*(*vi++);
136:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
137:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
138:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
139:       v  += 9;
140:     }
141:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
142:     idx += 3;
143:   }
144:   /* backward solve the L^T */
145:   for (i=n-1; i>=0; i--){
146:     v    = aa + 9*diag[i] - 9;
147:     vi   = aj + diag[i] - 1;
148:     nz   = diag[i] - ai[i];
149:     idt  = 3*i;
150:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
151:     while (nz--) {
152:       idx   = 3*(*vi--);
153:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
154:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
155:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
156:       v -= 9;
157:     }
158:   }
159:   VecRestoreArray(bb,&b);
160:   VecRestoreArray(xx,&x);
161:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
162:   return(0);
163: }

165: int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
166: {
167:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
168:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
169:   int             *diag = a->diag,oidx;
170:   MatScalar       *aa=a->a,*v;
171:   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
172:   Scalar          *x,*b;

175:   VecGetArray(bb,&b);
176:   VecGetArray(xx,&x);

178:   /* forward solve the U^T */
179:   idx = 0;
180:   for (i=0; i<n; i++) {

182:     v     = aa + 16*diag[i];
183:     /* multiply by the inverse of the block diagonal */
184:     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx];
185:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
186:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
187:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
188:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
189:     v += 16;

191:     vi    = aj + diag[i] + 1;
192:     nz    = ai[i+1] - diag[i] - 1;
193:     while (nz--) {
194:       oidx = 4*(*vi++);
195:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
196:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
197:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
198:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
199:       v  += 16;
200:     }
201:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
202:     idx += 4;
203:   }
204:   /* backward solve the L^T */
205:   for (i=n-1; i>=0; i--){
206:     v    = aa + 16*diag[i] - 16;
207:     vi   = aj + diag[i] - 1;
208:     nz   = diag[i] - ai[i];
209:     idt  = 4*i;
210:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
211:     while (nz--) {
212:       idx   = 4*(*vi--);
213:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217:       v -= 16;
218:     }
219:   }
220:   VecRestoreArray(bb,&b);
221:   VecRestoreArray(xx,&x);
222:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
223:   return(0);
224: }

226: int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
227: {
228:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
229:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
230:   int             *diag = a->diag,oidx;
231:   MatScalar       *aa=a->a,*v;
232:   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
233:   Scalar          *x,*b;

236:   VecGetArray(bb,&b);
237:   VecGetArray(xx,&x);

239:   /* forward solve the U^T */
240:   idx = 0;
241:   for (i=0; i<n; i++) {

243:     v     = aa + 25*diag[i];
244:     /* multiply by the inverse of the block diagonal */
245:     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
246:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
247:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
248:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
249:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
250:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
251:     v += 25;

253:     vi    = aj + diag[i] + 1;
254:     nz    = ai[i+1] - diag[i] - 1;
255:     while (nz--) {
256:       oidx = 5*(*vi++);
257:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
258:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
259:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
260:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
261:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
262:       v  += 25;
263:     }
264:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
265:     idx += 5;
266:   }
267:   /* backward solve the L^T */
268:   for (i=n-1; i>=0; i--){
269:     v    = aa + 25*diag[i] - 25;
270:     vi   = aj + diag[i] - 1;
271:     nz   = diag[i] - ai[i];
272:     idt  = 5*i;
273:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
274:     while (nz--) {
275:       idx   = 5*(*vi--);
276:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
277:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
278:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
279:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
280:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
281:       v -= 25;
282:     }
283:   }
284:   VecRestoreArray(bb,&b);
285:   VecRestoreArray(xx,&x);
286:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
287:   return(0);
288: }

290: int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
291: {
292:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
293:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
294:   int             *diag = a->diag,oidx;
295:   MatScalar       *aa=a->a,*v;
296:   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
297:   Scalar          *x,*b;

300:   VecGetArray(bb,&b);
301:   VecGetArray(xx,&x);

303:   /* forward solve the U^T */
304:   idx = 0;
305:   for (i=0; i<n; i++) {

307:     v     = aa + 36*diag[i];
308:     /* multiply by the inverse of the block diagonal */
309:     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
310:     x6    = b[5+idx];
311:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
312:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
313:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
314:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
315:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
316:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
317:     v += 36;

319:     vi    = aj + diag[i] + 1;
320:     nz    = ai[i+1] - diag[i] - 1;
321:     while (nz--) {
322:       oidx = 6*(*vi++);
323:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
324:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
325:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
326:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
327:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
328:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
329:       v  += 36;
330:     }
331:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
332:     x[5+idx] = s6;
333:     idx += 6;
334:   }
335:   /* backward solve the L^T */
336:   for (i=n-1; i>=0; i--){
337:     v    = aa + 36*diag[i] - 36;
338:     vi   = aj + diag[i] - 1;
339:     nz   = diag[i] - ai[i];
340:     idt  = 6*i;
341:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
342:     s6 = x[5+idt];
343:     while (nz--) {
344:       idx   = 6*(*vi--);
345:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
346:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
347:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
348:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
349:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
350:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
351:       v -= 36;
352:     }
353:   }
354:   VecRestoreArray(bb,&b);
355:   VecRestoreArray(xx,&x);
356:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
357:   return(0);
358: }

360: int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
361: {
362:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
363:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
364:   int             *diag = a->diag,oidx;
365:   MatScalar       *aa=a->a,*v;
366:   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
367:   Scalar          *x,*b;

370:   VecGetArray(bb,&b);
371:   VecGetArray(xx,&x);

373:   /* forward solve the U^T */
374:   idx = 0;
375:   for (i=0; i<n; i++) {

377:     v     = aa + 49*diag[i];
378:     /* multiply by the inverse of the block diagonal */
379:     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
380:     x6    = b[5+idx]; x7 = b[6+idx];
381:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
382:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
383:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
384:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
385:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
386:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
387:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
388:     v += 49;

390:     vi    = aj + diag[i] + 1;
391:     nz    = ai[i+1] - diag[i] - 1;
392:     while (nz--) {
393:       oidx = 7*(*vi++);
394:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
395:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
396:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
397:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
398:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
399:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
400:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
401:       v  += 49;
402:     }
403:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
404:     x[5+idx] = s6;x[6+idx] = s7;
405:     idx += 7;
406:   }
407:   /* backward solve the L^T */
408:   for (i=n-1; i>=0; i--){
409:     v    = aa + 49*diag[i] - 49;
410:     vi   = aj + diag[i] - 1;
411:     nz   = diag[i] - ai[i];
412:     idt  = 7*i;
413:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
414:     s6 = x[5+idt];s7 = x[6+idt];
415:     while (nz--) {
416:       idx   = 7*(*vi--);
417:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
418:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
419:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
420:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
421:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
422:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
423:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
424:       v -= 49;
425:     }
426:   }
427:   VecRestoreArray(bb,&b);
428:   VecRestoreArray(xx,&x);
429:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
430:   return(0);
431: }

433: /*---------------------------------------------------------------------------------------------*/
434: int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
435: {
436:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
437:   IS              iscol=a->col,isrow=a->row;
438:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
439:   int             *diag = a->diag;
440:   MatScalar       *aa=a->a,*v;
441:   Scalar          s1,*x,*b,*t;

444:   VecGetArray(bb,&b);
445:   VecGetArray(xx,&x);
446:   t  = a->solve_work;

448:   ISGetIndices(isrow,&rout); r = rout;
449:   ISGetIndices(iscol,&cout); c = cout;

451:   /* copy the b into temp work space according to permutation */
452:   for (i=0; i<n; i++) {
453:     t[i] = b[c[i]];
454:   }

456:   /* forward solve the U^T */
457:   for (i=0; i<n; i++) {

459:     v     = aa + diag[i];
460:     /* multiply by the inverse of the block diagonal */
461:     s1    = (*v++)*t[i];
462:     vi    = aj + diag[i] + 1;
463:     nz    = ai[i+1] - diag[i] - 1;
464:     while (nz--) {
465:       t[*vi++]  -= (*v++)*s1;
466:     }
467:     t[i]   = s1;
468:   }
469:   /* backward solve the L^T */
470:   for (i=n-1; i>=0; i--){
471:     v    = aa + diag[i] - 1;
472:     vi   = aj + diag[i] - 1;
473:     nz   = diag[i] - ai[i];
474:     s1   = t[i];
475:     while (nz--) {
476:       t[*vi--]   -=  (*v--)*s1;
477:     }
478:   }

480:   /* copy t into x according to permutation */
481:   for (i=0; i<n; i++) {
482:     x[r[i]]   = t[i];
483:   }

485:   ISRestoreIndices(isrow,&rout);
486:   ISRestoreIndices(iscol,&cout);
487:   VecRestoreArray(bb,&b);
488:   VecRestoreArray(xx,&x);
489:   PetscLogFlops(2*(a->nz) - A->n);
490:   return(0);
491: }

493: int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
494: {
495:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
496:   IS              iscol=a->col,isrow=a->row;
497:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
498:   int             *diag = a->diag,ii,ic,ir,oidx;
499:   MatScalar       *aa=a->a,*v;
500:   Scalar          s1,s2,x1,x2;
501:   Scalar          *x,*b,*t;

504:   VecGetArray(bb,&b);
505:   VecGetArray(xx,&x);
506:   t  = a->solve_work;

508:   ISGetIndices(isrow,&rout); r = rout;
509:   ISGetIndices(iscol,&cout); c = cout;

511:   /* copy the b into temp work space according to permutation */
512:   ii = 0;
513:   for (i=0; i<n; i++) {
514:     ic      = 2*c[i];
515:     t[ii]   = b[ic];
516:     t[ii+1] = b[ic+1];
517:     ii += 2;
518:   }

520:   /* forward solve the U^T */
521:   idx = 0;
522:   for (i=0; i<n; i++) {

524:     v     = aa + 4*diag[i];
525:     /* multiply by the inverse of the block diagonal */
526:     x1    = t[idx];   x2 = t[1+idx];
527:     s1 = v[0]*x1  +  v[1]*x2;
528:     s2 = v[2]*x1  +  v[3]*x2;
529:     v += 4;

531:     vi    = aj + diag[i] + 1;
532:     nz    = ai[i+1] - diag[i] - 1;
533:     while (nz--) {
534:       oidx = 2*(*vi++);
535:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
536:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
537:       v  += 4;
538:     }
539:     t[idx]   = s1;t[1+idx] = s2;
540:     idx += 2;
541:   }
542:   /* backward solve the L^T */
543:   for (i=n-1; i>=0; i--){
544:     v    = aa + 4*diag[i] - 4;
545:     vi   = aj + diag[i] - 1;
546:     nz   = diag[i] - ai[i];
547:     idt  = 2*i;
548:     s1 = t[idt];  s2 = t[1+idt];
549:     while (nz--) {
550:       idx   = 2*(*vi--);
551:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
552:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
553:       v -= 4;
554:     }
555:   }

557:   /* copy t into x according to permutation */
558:   ii = 0;
559:   for (i=0; i<n; i++) {
560:     ir      = 2*r[i];
561:     x[ir]   = t[ii];
562:     x[ir+1] = t[ii+1];
563:     ii += 2;
564:   }

566:   ISRestoreIndices(isrow,&rout);
567:   ISRestoreIndices(iscol,&cout);
568:   VecRestoreArray(bb,&b);
569:   VecRestoreArray(xx,&x);
570:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
571:   return(0);
572: }

574: int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
575: {
576:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
577:   IS              iscol=a->col,isrow=a->row;
578:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
579:   int             *diag = a->diag,ii,ic,ir,oidx;
580:   MatScalar       *aa=a->a,*v;
581:   Scalar          s1,s2,s3,x1,x2,x3;
582:   Scalar          *x,*b,*t;

585:   VecGetArray(bb,&b);
586:   VecGetArray(xx,&x);
587:   t  = a->solve_work;

589:   ISGetIndices(isrow,&rout); r = rout;
590:   ISGetIndices(iscol,&cout); c = cout;

592:   /* copy the b into temp work space according to permutation */
593:   ii = 0;
594:   for (i=0; i<n; i++) {
595:     ic      = 3*c[i];
596:     t[ii]   = b[ic];
597:     t[ii+1] = b[ic+1];
598:     t[ii+2] = b[ic+2];
599:     ii += 3;
600:   }

602:   /* forward solve the U^T */
603:   idx = 0;
604:   for (i=0; i<n; i++) {

606:     v     = aa + 9*diag[i];
607:     /* multiply by the inverse of the block diagonal */
608:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
609:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
610:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
611:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
612:     v += 9;

614:     vi    = aj + diag[i] + 1;
615:     nz    = ai[i+1] - diag[i] - 1;
616:     while (nz--) {
617:       oidx = 3*(*vi++);
618:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
619:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
620:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
621:       v  += 9;
622:     }
623:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
624:     idx += 3;
625:   }
626:   /* backward solve the L^T */
627:   for (i=n-1; i>=0; i--){
628:     v    = aa + 9*diag[i] - 9;
629:     vi   = aj + diag[i] - 1;
630:     nz   = diag[i] - ai[i];
631:     idt  = 3*i;
632:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
633:     while (nz--) {
634:       idx   = 3*(*vi--);
635:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
636:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
637:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
638:       v -= 9;
639:     }
640:   }

642:   /* copy t into x according to permutation */
643:   ii = 0;
644:   for (i=0; i<n; i++) {
645:     ir      = 3*r[i];
646:     x[ir]   = t[ii];
647:     x[ir+1] = t[ii+1];
648:     x[ir+2] = t[ii+2];
649:     ii += 3;
650:   }

652:   ISRestoreIndices(isrow,&rout);
653:   ISRestoreIndices(iscol,&cout);
654:   VecRestoreArray(bb,&b);
655:   VecRestoreArray(xx,&x);
656:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
657:   return(0);
658: }

660: int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
661: {
662:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
663:   IS              iscol=a->col,isrow=a->row;
664:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
665:   int             *diag = a->diag,ii,ic,ir,oidx;
666:   MatScalar       *aa=a->a,*v;
667:   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
668:   Scalar          *x,*b,*t;

671:   VecGetArray(bb,&b);
672:   VecGetArray(xx,&x);
673:   t  = a->solve_work;

675:   ISGetIndices(isrow,&rout); r = rout;
676:   ISGetIndices(iscol,&cout); c = cout;

678:   /* copy the b into temp work space according to permutation */
679:   ii = 0;
680:   for (i=0; i<n; i++) {
681:     ic      = 4*c[i];
682:     t[ii]   = b[ic];
683:     t[ii+1] = b[ic+1];
684:     t[ii+2] = b[ic+2];
685:     t[ii+3] = b[ic+3];
686:     ii += 4;
687:   }

689:   /* forward solve the U^T */
690:   idx = 0;
691:   for (i=0; i<n; i++) {

693:     v     = aa + 16*diag[i];
694:     /* multiply by the inverse of the block diagonal */
695:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
696:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
697:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
698:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
699:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
700:     v += 16;

702:     vi    = aj + diag[i] + 1;
703:     nz    = ai[i+1] - diag[i] - 1;
704:     while (nz--) {
705:       oidx = 4*(*vi++);
706:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
707:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
708:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
709:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
710:       v  += 16;
711:     }
712:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
713:     idx += 4;
714:   }
715:   /* backward solve the L^T */
716:   for (i=n-1; i>=0; i--){
717:     v    = aa + 16*diag[i] - 16;
718:     vi   = aj + diag[i] - 1;
719:     nz   = diag[i] - ai[i];
720:     idt  = 4*i;
721:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
722:     while (nz--) {
723:       idx   = 4*(*vi--);
724:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
725:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
726:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
727:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
728:       v -= 16;
729:     }
730:   }

732:   /* copy t into x according to permutation */
733:   ii = 0;
734:   for (i=0; i<n; i++) {
735:     ir      = 4*r[i];
736:     x[ir]   = t[ii];
737:     x[ir+1] = t[ii+1];
738:     x[ir+2] = t[ii+2];
739:     x[ir+3] = t[ii+3];
740:     ii += 4;
741:   }

743:   ISRestoreIndices(isrow,&rout);
744:   ISRestoreIndices(iscol,&cout);
745:   VecRestoreArray(bb,&b);
746:   VecRestoreArray(xx,&x);
747:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
748:   return(0);
749: }

751: int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
752: {
753:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
754:   IS              iscol=a->col,isrow=a->row;
755:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
756:   int             *diag = a->diag,ii,ic,ir,oidx;
757:   MatScalar       *aa=a->a,*v;
758:   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
759:   Scalar          *x,*b,*t;

762:   VecGetArray(bb,&b);
763:   VecGetArray(xx,&x);
764:   t  = a->solve_work;

766:   ISGetIndices(isrow,&rout); r = rout;
767:   ISGetIndices(iscol,&cout); c = cout;

769:   /* copy the b into temp work space according to permutation */
770:   ii = 0;
771:   for (i=0; i<n; i++) {
772:     ic      = 5*c[i];
773:     t[ii]   = b[ic];
774:     t[ii+1] = b[ic+1];
775:     t[ii+2] = b[ic+2];
776:     t[ii+3] = b[ic+3];
777:     t[ii+4] = b[ic+4];
778:     ii += 5;
779:   }

781:   /* forward solve the U^T */
782:   idx = 0;
783:   for (i=0; i<n; i++) {

785:     v     = aa + 25*diag[i];
786:     /* multiply by the inverse of the block diagonal */
787:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
788:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
789:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
790:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
791:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
792:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
793:     v += 25;

795:     vi    = aj + diag[i] + 1;
796:     nz    = ai[i+1] - diag[i] - 1;
797:     while (nz--) {
798:       oidx = 5*(*vi++);
799:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
800:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
801:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
802:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
803:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
804:       v  += 25;
805:     }
806:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
807:     idx += 5;
808:   }
809:   /* backward solve the L^T */
810:   for (i=n-1; i>=0; i--){
811:     v    = aa + 25*diag[i] - 25;
812:     vi   = aj + diag[i] - 1;
813:     nz   = diag[i] - ai[i];
814:     idt  = 5*i;
815:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
816:     while (nz--) {
817:       idx   = 5*(*vi--);
818:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
819:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
820:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
821:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
822:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
823:       v -= 25;
824:     }
825:   }

827:   /* copy t into x according to permutation */
828:   ii = 0;
829:   for (i=0; i<n; i++) {
830:     ir      = 5*r[i];
831:     x[ir]   = t[ii];
832:     x[ir+1] = t[ii+1];
833:     x[ir+2] = t[ii+2];
834:     x[ir+3] = t[ii+3];
835:     x[ir+4] = t[ii+4];
836:     ii += 5;
837:   }

839:   ISRestoreIndices(isrow,&rout);
840:   ISRestoreIndices(iscol,&cout);
841:   VecRestoreArray(bb,&b);
842:   VecRestoreArray(xx,&x);
843:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
844:   return(0);
845: }

847: int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
848: {
849:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
850:   IS              iscol=a->col,isrow=a->row;
851:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
852:   int             *diag = a->diag,ii,ic,ir,oidx;
853:   MatScalar       *aa=a->a,*v;
854:   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
855:   Scalar          *x,*b,*t;

858:   VecGetArray(bb,&b);
859:   VecGetArray(xx,&x);
860:   t  = a->solve_work;

862:   ISGetIndices(isrow,&rout); r = rout;
863:   ISGetIndices(iscol,&cout); c = cout;

865:   /* copy the b into temp work space according to permutation */
866:   ii = 0;
867:   for (i=0; i<n; i++) {
868:     ic      = 6*c[i];
869:     t[ii]   = b[ic];
870:     t[ii+1] = b[ic+1];
871:     t[ii+2] = b[ic+2];
872:     t[ii+3] = b[ic+3];
873:     t[ii+4] = b[ic+4];
874:     t[ii+5] = b[ic+5];
875:     ii += 6;
876:   }

878:   /* forward solve the U^T */
879:   idx = 0;
880:   for (i=0; i<n; i++) {

882:     v     = aa + 36*diag[i];
883:     /* multiply by the inverse of the block diagonal */
884:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
885:     x6    = t[5+idx];
886:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
887:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
888:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
889:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
890:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
891:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
892:     v += 36;

894:     vi    = aj + diag[i] + 1;
895:     nz    = ai[i+1] - diag[i] - 1;
896:     while (nz--) {
897:       oidx = 6*(*vi++);
898:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
899:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
900:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
901:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
902:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
903:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
904:       v  += 36;
905:     }
906:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
907:     t[5+idx] = s6;
908:     idx += 6;
909:   }
910:   /* backward solve the L^T */
911:   for (i=n-1; i>=0; i--){
912:     v    = aa + 36*diag[i] - 36;
913:     vi   = aj + diag[i] - 1;
914:     nz   = diag[i] - ai[i];
915:     idt  = 6*i;
916:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
917:     s6 = t[5+idt];
918:     while (nz--) {
919:       idx   = 6*(*vi--);
920:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
921:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
922:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
923:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
924:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
925:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
926:       v -= 36;
927:     }
928:   }

930:   /* copy t into x according to permutation */
931:   ii = 0;
932:   for (i=0; i<n; i++) {
933:     ir      = 6*r[i];
934:     x[ir]   = t[ii];
935:     x[ir+1] = t[ii+1];
936:     x[ir+2] = t[ii+2];
937:     x[ir+3] = t[ii+3];
938:     x[ir+4] = t[ii+4];
939:     x[ir+5] = t[ii+5];
940:     ii += 6;
941:   }

943:   ISRestoreIndices(isrow,&rout);
944:   ISRestoreIndices(iscol,&cout);
945:   VecRestoreArray(bb,&b);
946:   VecRestoreArray(xx,&x);
947:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
948:   return(0);
949: }

951: int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
952: {
953:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
954:   IS              iscol=a->col,isrow=a->row;
955:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
956:   int             *diag = a->diag,ii,ic,ir,oidx;
957:   MatScalar       *aa=a->a,*v;
958:   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
959:   Scalar          *x,*b,*t;

962:   VecGetArray(bb,&b);
963:   VecGetArray(xx,&x);
964:   t  = a->solve_work;

966:   ISGetIndices(isrow,&rout); r = rout;
967:   ISGetIndices(iscol,&cout); c = cout;

969:   /* copy the b into temp work space according to permutation */
970:   ii = 0;
971:   for (i=0; i<n; i++) {
972:     ic      = 7*c[i];
973:     t[ii]   = b[ic];
974:     t[ii+1] = b[ic+1];
975:     t[ii+2] = b[ic+2];
976:     t[ii+3] = b[ic+3];
977:     t[ii+4] = b[ic+4];
978:     t[ii+5] = b[ic+5];
979:     t[ii+6] = b[ic+6];
980:     ii += 7;
981:   }

983:   /* forward solve the U^T */
984:   idx = 0;
985:   for (i=0; i<n; i++) {

987:     v     = aa + 49*diag[i];
988:     /* multiply by the inverse of the block diagonal */
989:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
990:     x6    = t[5+idx]; x7 = t[6+idx];
991:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
992:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
993:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
994:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
995:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
996:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
997:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
998:     v += 49;

1000:     vi    = aj + diag[i] + 1;
1001:     nz    = ai[i+1] - diag[i] - 1;
1002:     while (nz--) {
1003:       oidx = 7*(*vi++);
1004:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1005:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1006:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1007:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1008:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1009:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1010:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1011:       v  += 49;
1012:     }
1013:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1014:     t[5+idx] = s6;t[6+idx] = s7;
1015:     idx += 7;
1016:   }
1017:   /* backward solve the L^T */
1018:   for (i=n-1; i>=0; i--){
1019:     v    = aa + 49*diag[i] - 49;
1020:     vi   = aj + diag[i] - 1;
1021:     nz   = diag[i] - ai[i];
1022:     idt  = 7*i;
1023:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1024:     s6 = t[5+idt];s7 = t[6+idt];
1025:     while (nz--) {
1026:       idx   = 7*(*vi--);
1027:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1028:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1029:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1030:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1031:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1032:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1033:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1034:       v -= 49;
1035:     }
1036:   }

1038:   /* copy t into x according to permutation */
1039:   ii = 0;
1040:   for (i=0; i<n; i++) {
1041:     ir      = 7*r[i];
1042:     x[ir]   = t[ii];
1043:     x[ir+1] = t[ii+1];
1044:     x[ir+2] = t[ii+2];
1045:     x[ir+3] = t[ii+3];
1046:     x[ir+4] = t[ii+4];
1047:     x[ir+5] = t[ii+5];
1048:     x[ir+6] = t[ii+6];
1049:     ii += 7;
1050:   }

1052:   ISRestoreIndices(isrow,&rout);
1053:   ISRestoreIndices(iscol,&cout);
1054:   VecRestoreArray(bb,&b);
1055:   VecRestoreArray(xx,&x);
1056:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1057:   return(0);
1058: }

1060: /* ----------------------------------------------------------- */
1061: int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1062: {
1063:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1064:   IS              iscol=a->col,isrow=a->row;
1065:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1066:   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1067:   MatScalar       *aa=a->a,*v;
1068:   Scalar          *x,*b,*s,*t,*ls;

1071:   VecGetArray(bb,&b);
1072:   VecGetArray(xx,&x);
1073:   t  = a->solve_work;

1075:   ISGetIndices(isrow,&rout); r = rout;
1076:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1078:   /* forward solve the lower triangular */
1079:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));
1080:   for (i=1; i<n; i++) {
1081:     v   = aa + bs2*ai[i];
1082:     vi  = aj + ai[i];
1083:     nz  = a->diag[i] - ai[i];
1084:     s = t + bs*i;
1085:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));
1086:     while (nz--) {
1087:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1088:       v += bs2;
1089:     }
1090:   }
1091:   /* backward solve the upper triangular */
1092:   ls = a->solve_work + A->n;
1093:   for (i=n-1; i>=0; i--){
1094:     v   = aa + bs2*(a->diag[i] + 1);
1095:     vi  = aj + a->diag[i] + 1;
1096:     nz  = ai[i+1] - a->diag[i] - 1;
1097:     PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));
1098:     while (nz--) {
1099:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1100:       v += bs2;
1101:     }
1102:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1103:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));
1104:   }

1106:   ISRestoreIndices(isrow,&rout);
1107:   ISRestoreIndices(iscol,&cout);
1108:   VecRestoreArray(bb,&b);
1109:   VecRestoreArray(xx,&x);
1110:   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1111:   return(0);
1112: }

1114: int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1115: {
1116:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1117:   IS              iscol=a->col,isrow=a->row;
1118:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1119:   int             *diag = a->diag;
1120:   MatScalar       *aa=a->a,*v;
1121:   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1122:   Scalar          *x,*b,*t;

1125:   VecGetArray(bb,&b);
1126:   VecGetArray(xx,&x);
1127:   t  = a->solve_work;

1129:   ISGetIndices(isrow,&rout); r = rout;
1130:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1132:   /* forward solve the lower triangular */
1133:   idx    = 7*(*r++);
1134:   t[0] = b[idx];   t[1] = b[1+idx];
1135:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1136:   t[5] = b[5+idx]; t[6] = b[6+idx];

1138:   for (i=1; i<n; i++) {
1139:     v     = aa + 49*ai[i];
1140:     vi    = aj + ai[i];
1141:     nz    = diag[i] - ai[i];
1142:     idx   = 7*(*r++);
1143:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1144:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1145:     while (nz--) {
1146:       idx   = 7*(*vi++);
1147:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1148:       x4    = t[3+idx];x5 = t[4+idx];
1149:       x6    = t[5+idx];x7 = t[6+idx];
1150:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1151:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1152:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1153:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1154:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1155:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1156:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1157:       v += 49;
1158:     }
1159:     idx = 7*i;
1160:     t[idx]   = s1;t[1+idx] = s2;
1161:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1162:     t[5+idx] = s6;t[6+idx] = s7;
1163:   }
1164:   /* backward solve the upper triangular */
1165:   for (i=n-1; i>=0; i--){
1166:     v    = aa + 49*diag[i] + 49;
1167:     vi   = aj + diag[i] + 1;
1168:     nz   = ai[i+1] - diag[i] - 1;
1169:     idt  = 7*i;
1170:     s1 = t[idt];  s2 = t[1+idt];
1171:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1172:     s6 = t[5+idt];s7 = t[6+idt];
1173:     while (nz--) {
1174:       idx   = 7*(*vi++);
1175:       x1    = t[idx];   x2 = t[1+idx];
1176:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1177:       x6    = t[5+idx]; x7 = t[6+idx];
1178:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1179:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1180:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1181:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1182:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1183:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1184:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1185:       v += 49;
1186:     }
1187:     idc = 7*(*c--);
1188:     v   = aa + 49*diag[i];
1189:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1190:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1191:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1192:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1193:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1194:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1195:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1196:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1197:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1198:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1199:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1200:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1201:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1202:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1203:   }

1205:   ISRestoreIndices(isrow,&rout);
1206:   ISRestoreIndices(iscol,&cout);
1207:   VecRestoreArray(bb,&b);
1208:   VecRestoreArray(xx,&x);
1209:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1210:   return(0);
1211: }

1213: int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1214: {
1215:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1216:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1217:   int             ierr,*diag = a->diag,jdx;
1218:   MatScalar       *aa=a->a,*v;
1219:   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

1222:   VecGetArray(bb,&b);
1223:   VecGetArray(xx,&x);
1224:   /* forward solve the lower triangular */
1225:   idx    = 0;
1226:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1227:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1228:   x[6] = b[6+idx];
1229:   for (i=1; i<n; i++) {
1230:     v     =  aa + 49*ai[i];
1231:     vi    =  aj + ai[i];
1232:     nz    =  diag[i] - ai[i];
1233:     idx   =  7*i;
1234:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1235:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1236:     s7  =  b[6+idx];
1237:     while (nz--) {
1238:       jdx   = 7*(*vi++);
1239:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1240:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1241:       x7    = x[6+jdx];
1242:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1243:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1244:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1245:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1246:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1247:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1248:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1249:       v += 49;
1250:      }
1251:     x[idx]   = s1;
1252:     x[1+idx] = s2;
1253:     x[2+idx] = s3;
1254:     x[3+idx] = s4;
1255:     x[4+idx] = s5;
1256:     x[5+idx] = s6;
1257:     x[6+idx] = s7;
1258:   }
1259:   /* backward solve the upper triangular */
1260:   for (i=n-1; i>=0; i--){
1261:     v    = aa + 49*diag[i] + 49;
1262:     vi   = aj + diag[i] + 1;
1263:     nz   = ai[i+1] - diag[i] - 1;
1264:     idt  = 7*i;
1265:     s1 = x[idt];   s2 = x[1+idt];
1266:     s3 = x[2+idt]; s4 = x[3+idt];
1267:     s5 = x[4+idt]; s6 = x[5+idt];
1268:     s7 = x[6+idt];
1269:     while (nz--) {
1270:       idx   = 7*(*vi++);
1271:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1272:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1273:       x7    = x[6+idx];
1274:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1275:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1276:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1277:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1278:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1279:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1280:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1281:       v += 49;
1282:     }
1283:     v        = aa + 49*diag[i];
1284:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1285:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1286:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1287:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1288:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1289:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1290:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1291:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1292:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1293:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1294:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1295:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1296:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1297:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1298:   }

1300:   VecRestoreArray(bb,&b);
1301:   VecRestoreArray(xx,&x);
1302:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1303:   return(0);
1304: }

1306: int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1307: {
1308:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1309:   IS              iscol=a->col,isrow=a->row;
1310:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1311:   int             *diag = a->diag;
1312:   MatScalar       *aa=a->a,*v;
1313:   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;

1316:   VecGetArray(bb,&b);
1317:   VecGetArray(xx,&x);
1318:   t  = a->solve_work;

1320:   ISGetIndices(isrow,&rout); r = rout;
1321:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1323:   /* forward solve the lower triangular */
1324:   idx    = 6*(*r++);
1325:   t[0] = b[idx];   t[1] = b[1+idx];
1326:   t[2] = b[2+idx]; t[3] = b[3+idx];
1327:   t[4] = b[4+idx]; t[5] = b[5+idx];
1328:   for (i=1; i<n; i++) {
1329:     v     = aa + 36*ai[i];
1330:     vi    = aj + ai[i];
1331:     nz    = diag[i] - ai[i];
1332:     idx   = 6*(*r++);
1333:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1334:     s5  = b[4+idx]; s6 = b[5+idx];
1335:     while (nz--) {
1336:       idx   = 6*(*vi++);
1337:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1338:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1339:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1340:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1341:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1342:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1343:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1344:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1345:       v += 36;
1346:     }
1347:     idx = 6*i;
1348:     t[idx]   = s1;t[1+idx] = s2;
1349:     t[2+idx] = s3;t[3+idx] = s4;
1350:     t[4+idx] = s5;t[5+idx] = s6;
1351:   }
1352:   /* backward solve the upper triangular */
1353:   for (i=n-1; i>=0; i--){
1354:     v    = aa + 36*diag[i] + 36;
1355:     vi   = aj + diag[i] + 1;
1356:     nz   = ai[i+1] - diag[i] - 1;
1357:     idt  = 6*i;
1358:     s1 = t[idt];  s2 = t[1+idt];
1359:     s3 = t[2+idt];s4 = t[3+idt];
1360:     s5 = t[4+idt];s6 = t[5+idt];
1361:     while (nz--) {
1362:       idx   = 6*(*vi++);
1363:       x1    = t[idx];   x2 = t[1+idx];
1364:       x3    = t[2+idx]; x4 = t[3+idx];
1365:       x5    = t[4+idx]; x6 = t[5+idx];
1366:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1367:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1368:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1369:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1370:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1371:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1372:       v += 36;
1373:     }
1374:     idc = 6*(*c--);
1375:     v   = aa + 36*diag[i];
1376:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1377:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1378:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1379:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1380:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1381:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1382:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1383:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1384:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1385:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1386:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1387:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1388:   }

1390:   ISRestoreIndices(isrow,&rout);
1391:   ISRestoreIndices(iscol,&cout);
1392:   VecRestoreArray(bb,&b);
1393:   VecRestoreArray(xx,&x);
1394:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1395:   return(0);
1396: }

1398: int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1399: {
1400:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1401:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1402:   int             ierr,*diag = a->diag,jdx;
1403:   MatScalar       *aa=a->a,*v;
1404:   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

1407:   VecGetArray(bb,&b);
1408:   VecGetArray(xx,&x);
1409:   /* forward solve the lower triangular */
1410:   idx    = 0;
1411:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1412:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1413:   for (i=1; i<n; i++) {
1414:     v     =  aa + 36*ai[i];
1415:     vi    =  aj + ai[i];
1416:     nz    =  diag[i] - ai[i];
1417:     idx   =  6*i;
1418:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1419:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1420:     while (nz--) {
1421:       jdx   = 6*(*vi++);
1422:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1423:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1424:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1425:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1426:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1427:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1428:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1429:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1430:       v += 36;
1431:      }
1432:     x[idx]   = s1;
1433:     x[1+idx] = s2;
1434:     x[2+idx] = s3;
1435:     x[3+idx] = s4;
1436:     x[4+idx] = s5;
1437:     x[5+idx] = s6;
1438:   }
1439:   /* backward solve the upper triangular */
1440:   for (i=n-1; i>=0; i--){
1441:     v    = aa + 36*diag[i] + 36;
1442:     vi   = aj + diag[i] + 1;
1443:     nz   = ai[i+1] - diag[i] - 1;
1444:     idt  = 6*i;
1445:     s1 = x[idt];   s2 = x[1+idt];
1446:     s3 = x[2+idt]; s4 = x[3+idt];
1447:     s5 = x[4+idt]; s6 = x[5+idt];
1448:     while (nz--) {
1449:       idx   = 6*(*vi++);
1450:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1451:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1452:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1453:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1454:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1455:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1456:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1457:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1458:       v += 36;
1459:     }
1460:     v        = aa + 36*diag[i];
1461:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1462:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1463:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1464:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1465:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1466:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1467:   }

1469:   VecRestoreArray(bb,&b);
1470:   VecRestoreArray(xx,&x);
1471:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1472:   return(0);
1473: }

1475: int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1476: {
1477:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1478:   IS              iscol=a->col,isrow=a->row;
1479:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1480:   int             *diag = a->diag;
1481:   MatScalar       *aa=a->a,*v;
1482:   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;

1485:   VecGetArray(bb,&b);
1486:   VecGetArray(xx,&x);
1487:   t  = a->solve_work;

1489:   ISGetIndices(isrow,&rout); r = rout;
1490:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1492:   /* forward solve the lower triangular */
1493:   idx    = 5*(*r++);
1494:   t[0] = b[idx];   t[1] = b[1+idx];
1495:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1496:   for (i=1; i<n; i++) {
1497:     v     = aa + 25*ai[i];
1498:     vi    = aj + ai[i];
1499:     nz    = diag[i] - ai[i];
1500:     idx   = 5*(*r++);
1501:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1502:     s5  = b[4+idx];
1503:     while (nz--) {
1504:       idx   = 5*(*vi++);
1505:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1506:       x4    = t[3+idx];x5 = t[4+idx];
1507:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1508:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1509:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1510:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1511:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1512:       v += 25;
1513:     }
1514:     idx = 5*i;
1515:     t[idx]   = s1;t[1+idx] = s2;
1516:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1517:   }
1518:   /* backward solve the upper triangular */
1519:   for (i=n-1; i>=0; i--){
1520:     v    = aa + 25*diag[i] + 25;
1521:     vi   = aj + diag[i] + 1;
1522:     nz   = ai[i+1] - diag[i] - 1;
1523:     idt  = 5*i;
1524:     s1 = t[idt];  s2 = t[1+idt];
1525:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1526:     while (nz--) {
1527:       idx   = 5*(*vi++);
1528:       x1    = t[idx];   x2 = t[1+idx];
1529:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1530:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1531:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1532:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1533:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1534:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1535:       v += 25;
1536:     }
1537:     idc = 5*(*c--);
1538:     v   = aa + 25*diag[i];
1539:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1540:                                  v[15]*s4+v[20]*s5;
1541:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1542:                                  v[16]*s4+v[21]*s5;
1543:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1544:                                  v[17]*s4+v[22]*s5;
1545:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1546:                                  v[18]*s4+v[23]*s5;
1547:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1548:                                  v[19]*s4+v[24]*s5;
1549:   }

1551:   ISRestoreIndices(isrow,&rout);
1552:   ISRestoreIndices(iscol,&cout);
1553:   VecRestoreArray(bb,&b);
1554:   VecRestoreArray(xx,&x);
1555:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1556:   return(0);
1557: }

1559: int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1560: {
1561:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1562:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1563:   int             ierr,*diag = a->diag,jdx;
1564:   MatScalar       *aa=a->a,*v;
1565:   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;

1568:   VecGetArray(bb,&b);
1569:   VecGetArray(xx,&x);
1570:   /* forward solve the lower triangular */
1571:   idx    = 0;
1572:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1573:   for (i=1; i<n; i++) {
1574:     v     =  aa + 25*ai[i];
1575:     vi    =  aj + ai[i];
1576:     nz    =  diag[i] - ai[i];
1577:     idx   =  5*i;
1578:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1579:     while (nz--) {
1580:       jdx   = 5*(*vi++);
1581:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1582:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1583:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1584:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1585:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1586:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1587:       v    += 25;
1588:     }
1589:     x[idx]   = s1;
1590:     x[1+idx] = s2;
1591:     x[2+idx] = s3;
1592:     x[3+idx] = s4;
1593:     x[4+idx] = s5;
1594:   }
1595:   /* backward solve the upper triangular */
1596:   for (i=n-1; i>=0; i--){
1597:     v    = aa + 25*diag[i] + 25;
1598:     vi   = aj + diag[i] + 1;
1599:     nz   = ai[i+1] - diag[i] - 1;
1600:     idt  = 5*i;
1601:     s1 = x[idt];  s2 = x[1+idt];
1602:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1603:     while (nz--) {
1604:       idx   = 5*(*vi++);
1605:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1606:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1607:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1608:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1609:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1610:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1611:       v    += 25;
1612:     }
1613:     v        = aa + 25*diag[i];
1614:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1615:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1616:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1617:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1618:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1619:   }

1621:   VecRestoreArray(bb,&b);
1622:   VecRestoreArray(xx,&x);
1623:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1624:   return(0);
1625: }

1627: int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1628: {
1629:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1630:   IS              iscol=a->col,isrow=a->row;
1631:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1632:   int             *diag = a->diag;
1633:   MatScalar       *aa=a->a,*v;
1634:   Scalar          *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;

1637:   VecGetArray(bb,&b);
1638:   VecGetArray(xx,&x);
1639:   t  = a->solve_work;

1641:   ISGetIndices(isrow,&rout); r = rout;
1642:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1644:   /* forward solve the lower triangular */
1645:   idx    = 4*(*r++);
1646:   t[0] = b[idx];   t[1] = b[1+idx];
1647:   t[2] = b[2+idx]; t[3] = b[3+idx];
1648:   for (i=1; i<n; i++) {
1649:     v     = aa + 16*ai[i];
1650:     vi    = aj + ai[i];
1651:     nz    = diag[i] - ai[i];
1652:     idx   = 4*(*r++);
1653:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1654:     while (nz--) {
1655:       idx   = 4*(*vi++);
1656:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1657:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1658:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1659:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1660:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1661:       v    += 16;
1662:     }
1663:     idx        = 4*i;
1664:     t[idx]   = s1;t[1+idx] = s2;
1665:     t[2+idx] = s3;t[3+idx] = s4;
1666:   }
1667:   /* backward solve the upper triangular */
1668:   for (i=n-1; i>=0; i--){
1669:     v    = aa + 16*diag[i] + 16;
1670:     vi   = aj + diag[i] + 1;
1671:     nz   = ai[i+1] - diag[i] - 1;
1672:     idt  = 4*i;
1673:     s1 = t[idt];  s2 = t[1+idt];
1674:     s3 = t[2+idt];s4 = t[3+idt];
1675:     while (nz--) {
1676:       idx   = 4*(*vi++);
1677:       x1    = t[idx];   x2 = t[1+idx];
1678:       x3    = t[2+idx]; x4 = t[3+idx];
1679:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1680:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1681:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1682:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1683:       v += 16;
1684:     }
1685:     idc      = 4*(*c--);
1686:     v        = aa + 16*diag[i];
1687:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1688:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1689:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1690:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1691:   }

1693:   ISRestoreIndices(isrow,&rout);
1694:   ISRestoreIndices(iscol,&cout);
1695:   VecRestoreArray(bb,&b);
1696:   VecRestoreArray(xx,&x);
1697:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1698:   return(0);
1699: }


1702: /*
1703:       Special case where the matrix was ILU(0) factored in the natural
1704:    ordering. This eliminates the need for the column and row permutation.
1705: */
1706: int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
1707: {
1708:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1709:   int             n=a->mbs,*ai=a->i,*aj=a->j;
1710:   int             ierr,*diag = a->diag;
1711:   MatScalar       *aa=a->a;
1712:   Scalar          *x,*b;

1715:   VecGetArray(bb,&b);
1716:   VecGetArray(xx,&x);

1718: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
1719:   {
1720:     static Scalar w[2000]; /* very BAD need to fix */
1721:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
1722:   }
1723: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
1724:   {
1725:     static Scalar w[2000]; /* very BAD need to fix */
1726:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
1727:   }
1728: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
1729:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1730: #else
1731:   {
1732:     Scalar    s1,s2,s3,s4,x1,x2,x3,x4;
1733:     MatScalar *v;
1734:     int       jdx,idt,idx,nz,*vi,i,ai16;

1736:   /* forward solve the lower triangular */
1737:   idx    = 0;
1738:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
1739:   for (i=1; i<n; i++) {
1740:     v     =  aa      + 16*ai[i];
1741:     vi    =  aj      + ai[i];
1742:     nz    =  diag[i] - ai[i];
1743:     idx   +=  4;
1744:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1745:     while (nz--) {
1746:       jdx   = 4*(*vi++);
1747:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
1748:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1749:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1750:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1751:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1752:       v    += 16;
1753:     }
1754:     x[idx]   = s1;
1755:     x[1+idx] = s2;
1756:     x[2+idx] = s3;
1757:     x[3+idx] = s4;
1758:   }
1759:   /* backward solve the upper triangular */
1760:   idt = 4*(n-1);
1761:   for (i=n-1; i>=0; i--){
1762:     ai16 = 16*diag[i];
1763:     v    = aa + ai16 + 16;
1764:     vi   = aj + diag[i] + 1;
1765:     nz   = ai[i+1] - diag[i] - 1;
1766:     s1 = x[idt];  s2 = x[1+idt];
1767:     s3 = x[2+idt];s4 = x[3+idt];
1768:     while (nz--) {
1769:       idx   = 4*(*vi++);
1770:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
1771:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1772:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1773:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1774:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1775:       v    += 16;
1776:     }
1777:     v        = aa + ai16;
1778:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
1779:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
1780:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
1781:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
1782:     idt -= 4;
1783:   }
1784:   }
1785: #endif

1787:   VecRestoreArray(bb,&b);
1788:   VecRestoreArray(xx,&x);
1789:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1790:   return(0);
1791: }

1793: int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1794: {
1795:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1796:   IS              iscol=a->col,isrow=a->row;
1797:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1798:   int             *diag = a->diag;
1799:   MatScalar       *aa=a->a,*v;
1800:   Scalar          *x,*b,s1,s2,s3,x1,x2,x3,*t;

1803:   VecGetArray(bb,&b);
1804:   VecGetArray(xx,&x);
1805:   t  = a->solve_work;

1807:   ISGetIndices(isrow,&rout); r = rout;
1808:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1810:   /* forward solve the lower triangular */
1811:   idx    = 3*(*r++);
1812:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
1813:   for (i=1; i<n; i++) {
1814:     v     = aa + 9*ai[i];
1815:     vi    = aj + ai[i];
1816:     nz    = diag[i] - ai[i];
1817:     idx   = 3*(*r++);
1818:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1819:     while (nz--) {
1820:       idx   = 3*(*vi++);
1821:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1822:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1823:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1824:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1825:       v += 9;
1826:     }
1827:     idx = 3*i;
1828:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
1829:   }
1830:   /* backward solve the upper triangular */
1831:   for (i=n-1; i>=0; i--){
1832:     v    = aa + 9*diag[i] + 9;
1833:     vi   = aj + diag[i] + 1;
1834:     nz   = ai[i+1] - diag[i] - 1;
1835:     idt  = 3*i;
1836:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1837:     while (nz--) {
1838:       idx   = 3*(*vi++);
1839:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1840:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1841:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1842:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1843:       v += 9;
1844:     }
1845:     idc = 3*(*c--);
1846:     v   = aa + 9*diag[i];
1847:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1848:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1849:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1850:   }
1851:   ISRestoreIndices(isrow,&rout);
1852:   ISRestoreIndices(iscol,&cout);
1853:   VecRestoreArray(bb,&b);
1854:   VecRestoreArray(xx,&x);
1855:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
1856:   return(0);
1857: }

1859: /*
1860:       Special case where the matrix was ILU(0) factored in the natural
1861:    ordering. This eliminates the need for the column and row permutation.
1862: */
1863: int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1864: {
1865:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1866:   int             n=a->mbs,*ai=a->i,*aj=a->j;
1867:   int             ierr,*diag = a->diag;
1868:   MatScalar       *aa=a->a,*v;
1869:   Scalar          *x,*b,s1,s2,s3,x1,x2,x3;
1870:   int             jdx,idt,idx,nz,*vi,i;

1873:   VecGetArray(bb,&b);
1874:   VecGetArray(xx,&x);


1877:   /* forward solve the lower triangular */
1878:   idx    = 0;
1879:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
1880:   for (i=1; i<n; i++) {
1881:     v     =  aa      + 9*ai[i];
1882:     vi    =  aj      + ai[i];
1883:     nz    =  diag[i] - ai[i];
1884:     idx   +=  3;
1885:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
1886:     while (nz--) {
1887:       jdx   = 3*(*vi++);
1888:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
1889:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1890:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1891:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1892:       v    += 9;
1893:     }
1894:     x[idx]   = s1;
1895:     x[1+idx] = s2;
1896:     x[2+idx] = s3;
1897:   }
1898:   /* backward solve the upper triangular */
1899:   for (i=n-1; i>=0; i--){
1900:     v    = aa + 9*diag[i] + 9;
1901:     vi   = aj + diag[i] + 1;
1902:     nz   = ai[i+1] - diag[i] - 1;
1903:     idt  = 3*i;
1904:     s1 = x[idt];  s2 = x[1+idt];
1905:     s3 = x[2+idt];
1906:     while (nz--) {
1907:       idx   = 3*(*vi++);
1908:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
1909:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
1910:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
1911:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
1912:       v    += 9;
1913:     }
1914:     v        = aa +  9*diag[i];
1915:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
1916:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
1917:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
1918:   }

1920:   VecRestoreArray(bb,&b);
1921:   VecRestoreArray(xx,&x);
1922:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
1923:   return(0);
1924: }

1926: int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1927: {
1928:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1929:   IS              iscol=a->col,isrow=a->row;
1930:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1931:   int             *diag = a->diag;
1932:   MatScalar       *aa=a->a,*v;
1933:   Scalar          *x,*b,s1,s2,x1,x2,*t;

1936:   VecGetArray(bb,&b);
1937:   VecGetArray(xx,&x);
1938:   t  = a->solve_work;

1940:   ISGetIndices(isrow,&rout); r = rout;
1941:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1943:   /* forward solve the lower triangular */
1944:   idx    = 2*(*r++);
1945:   t[0] = b[idx]; t[1] = b[1+idx];
1946:   for (i=1; i<n; i++) {
1947:     v     = aa + 4*ai[i];
1948:     vi    = aj + ai[i];
1949:     nz    = diag[i] - ai[i];
1950:     idx   = 2*(*r++);
1951:     s1  = b[idx]; s2 = b[1+idx];
1952:     while (nz--) {
1953:       idx   = 2*(*vi++);
1954:       x1    = t[idx]; x2 = t[1+idx];
1955:       s1 -= v[0]*x1 + v[2]*x2;
1956:       s2 -= v[1]*x1 + v[3]*x2;
1957:       v += 4;
1958:     }
1959:     idx = 2*i;
1960:     t[idx] = s1; t[1+idx] = s2;
1961:   }
1962:   /* backward solve the upper triangular */
1963:   for (i=n-1; i>=0; i--){
1964:     v    = aa + 4*diag[i] + 4;
1965:     vi   = aj + diag[i] + 1;
1966:     nz   = ai[i+1] - diag[i] - 1;
1967:     idt  = 2*i;
1968:     s1 = t[idt]; s2 = t[1+idt];
1969:     while (nz--) {
1970:       idx   = 2*(*vi++);
1971:       x1    = t[idx]; x2 = t[1+idx];
1972:       s1 -= v[0]*x1 + v[2]*x2;
1973:       s2 -= v[1]*x1 + v[3]*x2;
1974:       v += 4;
1975:     }
1976:     idc = 2*(*c--);
1977:     v   = aa + 4*diag[i];
1978:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
1979:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
1980:   }
1981:   ISRestoreIndices(isrow,&rout);
1982:   ISRestoreIndices(iscol,&cout);
1983:   VecRestoreArray(bb,&b);
1984:   VecRestoreArray(xx,&x);
1985:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
1986:   return(0);
1987: }

1989: /*
1990:       Special case where the matrix was ILU(0) factored in the natural
1991:    ordering. This eliminates the need for the column and row permutation.
1992: */
1993: int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1994: {
1995:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1996:   int             n=a->mbs,*ai=a->i,*aj=a->j;
1997:   int             ierr,*diag = a->diag;
1998:   MatScalar       *aa=a->a,*v;
1999:   Scalar          *x,*b,s1,s2,x1,x2;
2000:   int             jdx,idt,idx,nz,*vi,i;

2003:   VecGetArray(bb,&b);
2004:   VecGetArray(xx,&x);

2006:   /* forward solve the lower triangular */
2007:   idx    = 0;
2008:   x[0]   = b[0]; x[1] = b[1];
2009:   for (i=1; i<n; i++) {
2010:     v     =  aa      + 4*ai[i];
2011:     vi    =  aj      + ai[i];
2012:     nz    =  diag[i] - ai[i];
2013:     idx   +=  2;
2014:     s1  =  b[idx];s2 = b[1+idx];
2015:     while (nz--) {
2016:       jdx   = 2*(*vi++);
2017:       x1    = x[jdx];x2 = x[1+jdx];
2018:       s1 -= v[0]*x1 + v[2]*x2;
2019:       s2 -= v[1]*x1 + v[3]*x2;
2020:       v    += 4;
2021:     }
2022:     x[idx]   = s1;
2023:     x[1+idx] = s2;
2024:   }
2025:   /* backward solve the upper triangular */
2026:   for (i=n-1; i>=0; i--){
2027:     v    = aa + 4*diag[i] + 4;
2028:     vi   = aj + diag[i] + 1;
2029:     nz   = ai[i+1] - diag[i] - 1;
2030:     idt  = 2*i;
2031:     s1 = x[idt];  s2 = x[1+idt];
2032:     while (nz--) {
2033:       idx   = 2*(*vi++);
2034:       x1    = x[idx];   x2 = x[1+idx];
2035:       s1 -= v[0]*x1 + v[2]*x2;
2036:       s2 -= v[1]*x1 + v[3]*x2;
2037:       v    += 4;
2038:     }
2039:     v        = aa +  4*diag[i];
2040:     x[idt]   = v[0]*s1 + v[2]*s2;
2041:     x[1+idt] = v[1]*s1 + v[3]*s2;
2042:   }

2044:   VecRestoreArray(bb,&b);
2045:   VecRestoreArray(xx,&x);
2046:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2047:   return(0);
2048: }

2050: int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2051: {
2052:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2053:   IS              iscol=a->col,isrow=a->row;
2054:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2055:   int             *diag = a->diag;
2056:   MatScalar       *aa=a->a,*v;
2057:   Scalar          *x,*b,s1,*t;

2060:   if (!n) return(0);

2062:   VecGetArray(bb,&b);
2063:   VecGetArray(xx,&x);
2064:   t  = a->solve_work;

2066:   ISGetIndices(isrow,&rout); r = rout;
2067:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2069:   /* forward solve the lower triangular */
2070:   t[0] = b[*r++];
2071:   for (i=1; i<n; i++) {
2072:     v     = aa + ai[i];
2073:     vi    = aj + ai[i];
2074:     nz    = diag[i] - ai[i];
2075:     s1  = b[*r++];
2076:     while (nz--) {
2077:       s1 -= (*v++)*t[*vi++];
2078:     }
2079:     t[i] = s1;
2080:   }
2081:   /* backward solve the upper triangular */
2082:   for (i=n-1; i>=0; i--){
2083:     v    = aa + diag[i] + 1;
2084:     vi   = aj + diag[i] + 1;
2085:     nz   = ai[i+1] - diag[i] - 1;
2086:     s1 = t[i];
2087:     while (nz--) {
2088:       s1 -= (*v++)*t[*vi++];
2089:     }
2090:     x[*c--] = t[i] = aa[diag[i]]*s1;
2091:   }

2093:   ISRestoreIndices(isrow,&rout);
2094:   ISRestoreIndices(iscol,&cout);
2095:   VecRestoreArray(bb,&b);
2096:   VecRestoreArray(xx,&x);
2097:   PetscLogFlops(2*1*(a->nz) - A->n);
2098:   return(0);
2099: }
2100: /*
2101:       Special case where the matrix was ILU(0) factored in the natural
2102:    ordering. This eliminates the need for the column and row permutation.
2103: */
2104: int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2105: {
2106:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2107:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2108:   int             ierr,*diag = a->diag;
2109:   MatScalar       *aa=a->a;
2110:   Scalar          *x,*b;
2111:   Scalar          s1,x1;
2112:   MatScalar       *v;
2113:   int             jdx,idt,idx,nz,*vi,i;

2116:   VecGetArray(bb,&b);
2117:   VecGetArray(xx,&x);

2119:   /* forward solve the lower triangular */
2120:   idx    = 0;
2121:   x[0]   = b[0];
2122:   for (i=1; i<n; i++) {
2123:     v     =  aa      + ai[i];
2124:     vi    =  aj      + ai[i];
2125:     nz    =  diag[i] - ai[i];
2126:     idx   +=  1;
2127:     s1  =  b[idx];
2128:     while (nz--) {
2129:       jdx   = *vi++;
2130:       x1    = x[jdx];
2131:       s1 -= v[0]*x1;
2132:       v    += 1;
2133:     }
2134:     x[idx]   = s1;
2135:   }
2136:   /* backward solve the upper triangular */
2137:   for (i=n-1; i>=0; i--){
2138:     v    = aa + diag[i] + 1;
2139:     vi   = aj + diag[i] + 1;
2140:     nz   = ai[i+1] - diag[i] - 1;
2141:     idt  = i;
2142:     s1 = x[idt];
2143:     while (nz--) {
2144:       idx   = *vi++;
2145:       x1    = x[idx];
2146:       s1 -= v[0]*x1;
2147:       v    += 1;
2148:     }
2149:     v        = aa +  diag[i];
2150:     x[idt]   = v[0]*s1;
2151:   }
2152:   VecRestoreArray(bb,&b);
2153:   VecRestoreArray(xx,&x);
2154:   PetscLogFlops(2*(a->nz) - A->n);
2155:   return(0);
2156: }

2158: /* ----------------------------------------------------------------*/
2159: /*
2160:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2161:    except that the data structure of Mat_SeqAIJ is slightly different.
2162:    Not a good example of code reuse.
2163: */
2164: EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);

2166: int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2167: {
2168:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2169:   IS          isicol;
2170:   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2171:   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2172:   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2173:   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2174:   PetscTruth  col_identity,row_identity;
2175:   PetscReal   f;


2181:   if (info) {
2182:     f             = info->fill;
2183:     levels        = (int)info->levels;
2184:     diagonal_fill = (int)info->diagonal_fill;
2185:   } else {
2186:     f             = 1.0;
2187:     levels        = 0;
2188:     diagonal_fill = 0;
2189:   }
2190:   if (!isrow) {
2191:     ISCreateStride(PETSC_COMM_SELF,A->M,0,1,&isrow);
2192:   }
2193:   if (!iscol) {
2194:     ISCreateStride(PETSC_COMM_SELF,A->M,0,1,&iscol);
2195:   }
2196:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
2197:   ISIdentity(isrow,&row_identity);
2198:   ISIdentity(iscol,&col_identity);

2200:   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2201:     MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
2202:     (*fact)->factor = FACTOR_LU;
2203:     b               = (Mat_SeqBAIJ*)(*fact)->data;
2204:     if (!b->diag) {
2205:       MatMarkDiagonal_SeqBAIJ(*fact);
2206:     }
2207:     MatMissingDiagonal_SeqBAIJ(*fact);
2208:     b->row        = isrow;
2209:     b->col        = iscol;
2210:     ierr          = PetscObjectReference((PetscObject)isrow);
2211:     ierr          = PetscObjectReference((PetscObject)iscol);
2212:     b->icol       = isicol;
2213:     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(Scalar),&b->solve_work);
2214:   } else { /* general case perform the symbolic factorization */
2215:     ISGetIndices(isrow,&r);
2216:     ISGetIndices(isicol,&ic);

2218:     /* get new row pointers */
2219:     PetscMalloc((n+1)*sizeof(int),&ainew);
2220:     ainew[0] = 0;
2221:     /* don't know how many column pointers are needed so estimate */
2222:     jmax = (int)(f*ai[n] + 1);
2223:     PetscMalloc((jmax)*sizeof(int),&ajnew);
2224:     /* ajfill is level of fill for each fill entry */
2225:     PetscMalloc((jmax)*sizeof(int),&ajfill);
2226:     /* fill is a linked list of nonzeros in active row */
2227:     PetscMalloc((n+1)*sizeof(int),&fill);
2228:     /* im is level for each filled value */
2229:     PetscMalloc((n+1)*sizeof(int),&im);
2230:     /* dloc is location of diagonal in factor */
2231:     PetscMalloc((n+1)*sizeof(int),&dloc);
2232:     dloc[0]  = 0;
2233:     for (prow=0; prow<n; prow++) {

2235:       /* copy prow into linked list */
2236:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2237:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
2238:       xi         = aj + ai[r[prow]];
2239:       fill[n]    = n;
2240:       fill[prow] = -1; /* marker for diagonal entry */
2241:       while (nz--) {
2242:         fm  = n;
2243:         idx = ic[*xi++];
2244:         do {
2245:           m  = fm;
2246:           fm = fill[m];
2247:         } while (fm < idx);
2248:         fill[m]   = idx;
2249:         fill[idx] = fm;
2250:         im[idx]   = 0;
2251:       }

2253:       /* make sure diagonal entry is included */
2254:       if (diagonal_fill && fill[prow] == -1) {
2255:         fm = n;
2256:         while (fill[fm] < prow) fm = fill[fm];
2257:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
2258:         fill[fm]   = prow;
2259:         im[prow]   = 0;
2260:         nzf++;
2261:         dcount++;
2262:       }

2264:       nzi = 0;
2265:       row = fill[n];
2266:       while (row < prow) {
2267:         incrlev = im[row] + 1;
2268:         nz      = dloc[row];
2269:         xi      = ajnew  + ainew[row] + nz + 1;
2270:         flev    = ajfill + ainew[row] + nz + 1;
2271:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
2272:         fm      = row;
2273:         while (nnz-- > 0) {
2274:           idx = *xi++;
2275:           if (*flev + incrlev > levels) {
2276:             flev++;
2277:             continue;
2278:           }
2279:           do {
2280:             m  = fm;
2281:             fm = fill[m];
2282:           } while (fm < idx);
2283:           if (fm != idx) {
2284:             im[idx]   = *flev + incrlev;
2285:             fill[m]   = idx;
2286:             fill[idx] = fm;
2287:             fm        = idx;
2288:             nzf++;
2289:           } else {
2290:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2291:           }
2292:           flev++;
2293:         }
2294:         row = fill[row];
2295:         nzi++;
2296:       }
2297:       /* copy new filled row into permanent storage */
2298:       ainew[prow+1] = ainew[prow] + nzf;
2299:       if (ainew[prow+1] > jmax) {

2301:         /* estimate how much additional space we will need */
2302:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2303:         /* just double the memory each time */
2304:         int maxadd = jmax;
2305:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2306:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2307:         jmax += maxadd;

2309:         /* allocate a longer ajnew and ajfill */
2310:         PetscMalloc(jmax*sizeof(int),&xi);
2311:         PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
2312:         PetscFree(ajnew);
2313:         ajnew = xi;
2314:         PetscMalloc(jmax*sizeof(int),&xi);
2315:         PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
2316:         PetscFree(ajfill);
2317:         ajfill = xi;
2318:         realloc++; /* count how many reallocations are needed */
2319:       }
2320:       xi          = ajnew + ainew[prow];
2321:       flev        = ajfill + ainew[prow];
2322:       dloc[prow]  = nzi;
2323:       fm          = fill[n];
2324:       while (nzf--) {
2325:         *xi++   = fm;
2326:         *flev++ = im[fm];
2327:         fm      = fill[fm];
2328:       }
2329:       /* make sure row has diagonal entry */
2330:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2331:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrixn
2332:     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2333:       }
2334:     }
2335:     PetscFree(ajfill);
2336:     ISRestoreIndices(isrow,&r);
2337:     ISRestoreIndices(isicol,&ic);
2338:     PetscFree(fill);
2339:     PetscFree(im);

2341:     {
2342:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2343:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %gn",realloc,f,af);
2344:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use n",af);
2345:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);n",af);
2346:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.n");
2347:       if (diagonal_fill) {
2348:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
2349:       }
2350:     }

2352:     /* put together the new matrix */
2353:     MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);
2354:     PetscLogObjectParent(*fact,isicol);
2355:     b = (Mat_SeqBAIJ*)(*fact)->data;
2356:     PetscFree(b->imax);
2357:     b->singlemalloc = PETSC_FALSE;
2358:     len = bs2*ainew[n]*sizeof(MatScalar);
2359:     /* the next line frees the default space generated by the Create() */
2360:     PetscFree(b->a);
2361:     PetscFree(b->ilen);
2362:     PetscMalloc(len,&b->a);
2363:     b->j          = ajnew;
2364:     b->i          = ainew;
2365:     for (i=0; i<n; i++) dloc[i] += ainew[i];
2366:     b->diag       = dloc;
2367:     b->ilen       = 0;
2368:     b->imax       = 0;
2369:     b->row        = isrow;
2370:     b->col        = iscol;
2371:     ierr          = PetscObjectReference((PetscObject)isrow);
2372:     ierr          = PetscObjectReference((PetscObject)iscol);
2373:     b->icol       = isicol;
2374:     PetscMalloc((bs*n+bs)*sizeof(Scalar),&b->solve_work);
2375:     /* In b structure:  Free imax, ilen, old a, old j.  
2376:        Allocate dloc, solve_work, new a, new j */
2377:     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
2378:     b->maxnz          = b->nz = ainew[n];
2379:     (*fact)->factor   = FACTOR_LU;

2381:     (*fact)->info.factor_mallocs    = realloc;
2382:     (*fact)->info.fill_ratio_given  = f;
2383:     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2384:   }

2386:   if (row_identity && col_identity) {
2387:     switch (b->bs) {
2388:       case 2:
2389:         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2390:         (*fact)->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2391:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=2n");
2392:         break;
2393:       case 3:
2394:         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2395:         (*fact)->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2396:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:sing special in-place natural ordering factor and solve BS=3n");
2397:         break;
2398:       case 4:
2399: #if defined(PETSC_HAVE_ICL_SSE)
2400:         PetscTruth sse_enabled;
2401:         PetscSSEIsEnabled(&sse_enabled);
2402:         if (sse_enabled) {
2403:           (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_ICL_SSE;
2404:         } else {
2405:           (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2406:         }
2407: #else
2408:         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2409: #endif
2410:         (*fact)->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
2411:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=4n");
2412:         break;
2413:       case 5:
2414:         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2415:         (*fact)->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
2416:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=5n");
2417:         break;
2418:       case 6:
2419:         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2420:         (*fact)->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
2421:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=6n");
2422:         break;
2423:       case 7:
2424:         (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2425:         (*fact)->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
2426:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Using special in-place natural ordering factor and solve BS=7n");
2427:       break;
2428:     }
2429:   }
2430:   return(0);
2431: }