Actual source code: baijfact2.c

  1: /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
  2: /*
  3:     Factorization code for BAIJ format. 
  4: */

 6:  #include src/mat/impls/baij/seq/baij.h
 7:  #include src/vec/vecimpl.h
 8:  #include src/inline/ilu.h
 9:  #include src/inline/dot.h

 11: #undef __FUNCT__  
 13: int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 14: {
 15:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 16:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 17:   int             *diag = a->diag;
 18:   MatScalar       *aa=a->a,*v;
 19:   PetscScalar     s1,*x,*b;

 22:   VecCopy(bb,xx);
 23:   VecGetArray(bb,&b);
 24:   VecGetArray(xx,&x);
 25: 
 26:   /* forward solve the U^T */
 27:   for (i=0; i<n; i++) {

 29:     v     = aa + diag[i];
 30:     /* multiply by the inverse of the block diagonal */
 31:     s1    = (*v++)*x[i];
 32:     vi    = aj + diag[i] + 1;
 33:     nz    = ai[i+1] - diag[i] - 1;
 34:     while (nz--) {
 35:       x[*vi++]  -= (*v++)*s1;
 36:     }
 37:     x[i]   = s1;
 38:   }
 39:   /* backward solve the L^T */
 40:   for (i=n-1; i>=0; i--){
 41:     v    = aa + diag[i] - 1;
 42:     vi   = aj + diag[i] - 1;
 43:     nz   = diag[i] - ai[i];
 44:     s1   = x[i];
 45:     while (nz--) {
 46:       x[*vi--]   -=  (*v--)*s1;
 47:     }
 48:   }
 49:   VecRestoreArray(bb,&b);
 50:   VecRestoreArray(xx,&x);
 51:   PetscLogFlops(2*(a->nz) - A->n);
 52:   return(0);
 53: }

 55: #undef __FUNCT__  
 57: int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 58: {
 59:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 60:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 61:   int             *diag = a->diag,oidx;
 62:   MatScalar       *aa=a->a,*v;
 63:   PetscScalar     s1,s2,x1,x2;
 64:   PetscScalar     *x,*b;

 67:   VecCopy(bb,xx);
 68:   VecGetArray(bb,&b);
 69:   VecGetArray(xx,&x);

 71:   /* forward solve the U^T */
 72:   idx = 0;
 73:   for (i=0; i<n; i++) {

 75:     v     = aa + 4*diag[i];
 76:     /* multiply by the inverse of the block diagonal */
 77:     x1 = x[idx];   x2 = x[1+idx];
 78:     s1 = v[0]*x1  +  v[1]*x2;
 79:     s2 = v[2]*x1  +  v[3]*x2;
 80:     v += 4;

 82:     vi    = aj + diag[i] + 1;
 83:     nz    = ai[i+1] - diag[i] - 1;
 84:     while (nz--) {
 85:       oidx = 2*(*vi++);
 86:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 87:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 88:       v  += 4;
 89:     }
 90:     x[idx]   = s1;x[1+idx] = s2;
 91:     idx += 2;
 92:   }
 93:   /* backward solve the L^T */
 94:   for (i=n-1; i>=0; i--){
 95:     v    = aa + 4*diag[i] - 4;
 96:     vi   = aj + diag[i] - 1;
 97:     nz   = diag[i] - ai[i];
 98:     idt  = 2*i;
 99:     s1   = x[idt];  s2 = x[1+idt];
100:     while (nz--) {
101:       idx   = 2*(*vi--);
102:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104:       v -= 4;
105:     }
106:   }
107:   VecRestoreArray(bb,&b);
108:   VecRestoreArray(xx,&x);
109:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110:   return(0);
111: }

113: #undef __FUNCT__  
115: int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116: {
117:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119:   int             *diag = a->diag,oidx;
120:   MatScalar       *aa=a->a,*v;
121:   PetscScalar     s1,s2,s3,x1,x2,x3;
122:   PetscScalar     *x,*b;

125:   VecCopy(bb,xx);
126:   VecGetArray(bb,&b);
127:   VecGetArray(xx,&x);

129:   /* forward solve the U^T */
130:   idx = 0;
131:   for (i=0; i<n; i++) {

133:     v     = aa + 9*diag[i];
134:     /* multiply by the inverse of the block diagonal */
135:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
136:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
137:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
138:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
139:     v += 9;

141:     vi    = aj + diag[i] + 1;
142:     nz    = ai[i+1] - diag[i] - 1;
143:     while (nz--) {
144:       oidx = 3*(*vi++);
145:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
146:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
147:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148:       v  += 9;
149:     }
150:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
151:     idx += 3;
152:   }
153:   /* backward solve the L^T */
154:   for (i=n-1; i>=0; i--){
155:     v    = aa + 9*diag[i] - 9;
156:     vi   = aj + diag[i] - 1;
157:     nz   = diag[i] - ai[i];
158:     idt  = 3*i;
159:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
160:     while (nz--) {
161:       idx   = 3*(*vi--);
162:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
163:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
164:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165:       v -= 9;
166:     }
167:   }
168:   VecRestoreArray(bb,&b);
169:   VecRestoreArray(xx,&x);
170:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
171:   return(0);
172: }

174: #undef __FUNCT__  
176: int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177: {
178:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
179:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180:   int             *diag = a->diag,oidx;
181:   MatScalar       *aa=a->a,*v;
182:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
183:   PetscScalar     *x,*b;

186:   VecCopy(bb,xx);
187:   VecGetArray(bb,&b);
188:   VecGetArray(xx,&x);

190:   /* forward solve the U^T */
191:   idx = 0;
192:   for (i=0; i<n; i++) {

194:     v     = aa + 16*diag[i];
195:     /* multiply by the inverse of the block diagonal */
196:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
197:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
198:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
199:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
200:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201:     v += 16;

203:     vi    = aj + diag[i] + 1;
204:     nz    = ai[i+1] - diag[i] - 1;
205:     while (nz--) {
206:       oidx = 4*(*vi++);
207:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
208:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
209:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211:       v  += 16;
212:     }
213:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214:     idx += 4;
215:   }
216:   /* backward solve the L^T */
217:   for (i=n-1; i>=0; i--){
218:     v    = aa + 16*diag[i] - 16;
219:     vi   = aj + diag[i] - 1;
220:     nz   = diag[i] - ai[i];
221:     idt  = 4*i;
222:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223:     while (nz--) {
224:       idx   = 4*(*vi--);
225:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
226:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
227:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229:       v -= 16;
230:     }
231:   }
232:   VecRestoreArray(bb,&b);
233:   VecRestoreArray(xx,&x);
234:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
235:   return(0);
236: }

238: #undef __FUNCT__  
240: int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241: {
242:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
243:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244:   int             *diag = a->diag,oidx;
245:   MatScalar       *aa=a->a,*v;
246:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
247:   PetscScalar     *x,*b;

250:   VecCopy(bb,xx);
251:   VecGetArray(bb,&b);
252:   VecGetArray(xx,&x);

254:   /* forward solve the U^T */
255:   idx = 0;
256:   for (i=0; i<n; i++) {

258:     v     = aa + 25*diag[i];
259:     /* multiply by the inverse of the block diagonal */
260:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
262:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
263:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266:     v += 25;

268:     vi    = aj + diag[i] + 1;
269:     nz    = ai[i+1] - diag[i] - 1;
270:     while (nz--) {
271:       oidx = 5*(*vi++);
272:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
273:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
274:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277:       v  += 25;
278:     }
279:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280:     idx += 5;
281:   }
282:   /* backward solve the L^T */
283:   for (i=n-1; i>=0; i--){
284:     v    = aa + 25*diag[i] - 25;
285:     vi   = aj + diag[i] - 1;
286:     nz   = diag[i] - ai[i];
287:     idt  = 5*i;
288:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289:     while (nz--) {
290:       idx   = 5*(*vi--);
291:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
292:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
293:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296:       v -= 25;
297:     }
298:   }
299:   VecRestoreArray(bb,&b);
300:   VecRestoreArray(xx,&x);
301:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
302:   return(0);
303: }

305: #undef __FUNCT__  
307: int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308: {
309:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
310:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311:   int             *diag = a->diag,oidx;
312:   MatScalar       *aa=a->a,*v;
313:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
314:   PetscScalar     *x,*b;

317:   VecCopy(bb,xx);
318:   VecGetArray(bb,&b);
319:   VecGetArray(xx,&x);

321:   /* forward solve the U^T */
322:   idx = 0;
323:   for (i=0; i<n; i++) {

325:     v     = aa + 36*diag[i];
326:     /* multiply by the inverse of the block diagonal */
327:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328:     x6    = x[5+idx];
329:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
330:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
331:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335:     v += 36;

337:     vi    = aj + diag[i] + 1;
338:     nz    = ai[i+1] - diag[i] - 1;
339:     while (nz--) {
340:       oidx = 6*(*vi++);
341:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
342:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
343:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347:       v  += 36;
348:     }
349:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350:     x[5+idx] = s6;
351:     idx += 6;
352:   }
353:   /* backward solve the L^T */
354:   for (i=n-1; i>=0; i--){
355:     v    = aa + 36*diag[i] - 36;
356:     vi   = aj + diag[i] - 1;
357:     nz   = diag[i] - ai[i];
358:     idt  = 6*i;
359:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360:     s6 = x[5+idt];
361:     while (nz--) {
362:       idx   = 6*(*vi--);
363:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
364:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
365:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369:       v -= 36;
370:     }
371:   }
372:   VecRestoreArray(bb,&b);
373:   VecRestoreArray(xx,&x);
374:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
375:   return(0);
376: }

378: #undef __FUNCT__  
380: int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381: {
382:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
383:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384:   int             *diag = a->diag,oidx;
385:   MatScalar       *aa=a->a,*v;
386:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
387:   PetscScalar     *x,*b;

390:   VecCopy(bb,xx);
391:   VecGetArray(bb,&b);
392:   VecGetArray(xx,&x);

394:   /* forward solve the U^T */
395:   idx = 0;
396:   for (i=0; i<n; i++) {

398:     v     = aa + 49*diag[i];
399:     /* multiply by the inverse of the block diagonal */
400:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401:     x6    = x[5+idx]; x7 = x[6+idx];
402:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
403:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409:     v += 49;

411:     vi    = aj + diag[i] + 1;
412:     nz    = ai[i+1] - diag[i] - 1;
413:     while (nz--) {
414:       oidx = 7*(*vi++);
415:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
416:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422:       v  += 49;
423:     }
424:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425:     x[5+idx] = s6;x[6+idx] = s7;
426:     idx += 7;
427:   }
428:   /* backward solve the L^T */
429:   for (i=n-1; i>=0; i--){
430:     v    = aa + 49*diag[i] - 49;
431:     vi   = aj + diag[i] - 1;
432:     nz   = diag[i] - ai[i];
433:     idt  = 7*i;
434:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435:     s6 = x[5+idt];s7 = x[6+idt];
436:     while (nz--) {
437:       idx   = 7*(*vi--);
438:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
439:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445:       v -= 49;
446:     }
447:   }
448:   VecRestoreArray(bb,&b);
449:   VecRestoreArray(xx,&x);
450:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
451:   return(0);
452: }

454: /*---------------------------------------------------------------------------------------------*/
455: #undef __FUNCT__  
457: int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458: {
459:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
460:   IS              iscol=a->col,isrow=a->row;
461:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462:   int             *diag = a->diag;
463:   MatScalar       *aa=a->a,*v;
464:   PetscScalar     s1,*x,*b,*t;

467:   VecGetArray(bb,&b);
468:   VecGetArray(xx,&x);
469:   t  = a->solve_work;

471:   ISGetIndices(isrow,&rout); r = rout;
472:   ISGetIndices(iscol,&cout); c = cout;

474:   /* copy the b into temp work space according to permutation */
475:   for (i=0; i<n; i++) {
476:     t[i] = b[c[i]];
477:   }

479:   /* forward solve the U^T */
480:   for (i=0; i<n; i++) {

482:     v     = aa + diag[i];
483:     /* multiply by the inverse of the block diagonal */
484:     s1    = (*v++)*t[i];
485:     vi    = aj + diag[i] + 1;
486:     nz    = ai[i+1] - diag[i] - 1;
487:     while (nz--) {
488:       t[*vi++]  -= (*v++)*s1;
489:     }
490:     t[i]   = s1;
491:   }
492:   /* backward solve the L^T */
493:   for (i=n-1; i>=0; i--){
494:     v    = aa + diag[i] - 1;
495:     vi   = aj + diag[i] - 1;
496:     nz   = diag[i] - ai[i];
497:     s1   = t[i];
498:     while (nz--) {
499:       t[*vi--]   -=  (*v--)*s1;
500:     }
501:   }

503:   /* copy t into x according to permutation */
504:   for (i=0; i<n; i++) {
505:     x[r[i]]   = t[i];
506:   }

508:   ISRestoreIndices(isrow,&rout);
509:   ISRestoreIndices(iscol,&cout);
510:   VecRestoreArray(bb,&b);
511:   VecRestoreArray(xx,&x);
512:   PetscLogFlops(2*(a->nz) - A->n);
513:   return(0);
514: }

516: #undef __FUNCT__  
518: int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519: {
520:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
521:   IS              iscol=a->col,isrow=a->row;
522:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523:   int             *diag = a->diag,ii,ic,ir,oidx;
524:   MatScalar       *aa=a->a,*v;
525:   PetscScalar     s1,s2,x1,x2;
526:   PetscScalar     *x,*b,*t;

529:   VecGetArray(bb,&b);
530:   VecGetArray(xx,&x);
531:   t  = a->solve_work;

533:   ISGetIndices(isrow,&rout); r = rout;
534:   ISGetIndices(iscol,&cout); c = cout;

536:   /* copy the b into temp work space according to permutation */
537:   ii = 0;
538:   for (i=0; i<n; i++) {
539:     ic      = 2*c[i];
540:     t[ii]   = b[ic];
541:     t[ii+1] = b[ic+1];
542:     ii += 2;
543:   }

545:   /* forward solve the U^T */
546:   idx = 0;
547:   for (i=0; i<n; i++) {

549:     v     = aa + 4*diag[i];
550:     /* multiply by the inverse of the block diagonal */
551:     x1    = t[idx];   x2 = t[1+idx];
552:     s1 = v[0]*x1  +  v[1]*x2;
553:     s2 = v[2]*x1  +  v[3]*x2;
554:     v += 4;

556:     vi    = aj + diag[i] + 1;
557:     nz    = ai[i+1] - diag[i] - 1;
558:     while (nz--) {
559:       oidx = 2*(*vi++);
560:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
561:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
562:       v  += 4;
563:     }
564:     t[idx]   = s1;t[1+idx] = s2;
565:     idx += 2;
566:   }
567:   /* backward solve the L^T */
568:   for (i=n-1; i>=0; i--){
569:     v    = aa + 4*diag[i] - 4;
570:     vi   = aj + diag[i] - 1;
571:     nz   = diag[i] - ai[i];
572:     idt  = 2*i;
573:     s1 = t[idt];  s2 = t[1+idt];
574:     while (nz--) {
575:       idx   = 2*(*vi--);
576:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
577:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
578:       v -= 4;
579:     }
580:   }

582:   /* copy t into x according to permutation */
583:   ii = 0;
584:   for (i=0; i<n; i++) {
585:     ir      = 2*r[i];
586:     x[ir]   = t[ii];
587:     x[ir+1] = t[ii+1];
588:     ii += 2;
589:   }

591:   ISRestoreIndices(isrow,&rout);
592:   ISRestoreIndices(iscol,&cout);
593:   VecRestoreArray(bb,&b);
594:   VecRestoreArray(xx,&x);
595:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
596:   return(0);
597: }

599: #undef __FUNCT__  
601: int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602: {
603:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
604:   IS              iscol=a->col,isrow=a->row;
605:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606:   int             *diag = a->diag,ii,ic,ir,oidx;
607:   MatScalar       *aa=a->a,*v;
608:   PetscScalar     s1,s2,s3,x1,x2,x3;
609:   PetscScalar     *x,*b,*t;

612:   VecGetArray(bb,&b);
613:   VecGetArray(xx,&x);
614:   t  = a->solve_work;

616:   ISGetIndices(isrow,&rout); r = rout;
617:   ISGetIndices(iscol,&cout); c = cout;

619:   /* copy the b into temp work space according to permutation */
620:   ii = 0;
621:   for (i=0; i<n; i++) {
622:     ic      = 3*c[i];
623:     t[ii]   = b[ic];
624:     t[ii+1] = b[ic+1];
625:     t[ii+2] = b[ic+2];
626:     ii += 3;
627:   }

629:   /* forward solve the U^T */
630:   idx = 0;
631:   for (i=0; i<n; i++) {

633:     v     = aa + 9*diag[i];
634:     /* multiply by the inverse of the block diagonal */
635:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
636:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
637:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
638:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
639:     v += 9;

641:     vi    = aj + diag[i] + 1;
642:     nz    = ai[i+1] - diag[i] - 1;
643:     while (nz--) {
644:       oidx = 3*(*vi++);
645:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
646:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
647:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648:       v  += 9;
649:     }
650:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
651:     idx += 3;
652:   }
653:   /* backward solve the L^T */
654:   for (i=n-1; i>=0; i--){
655:     v    = aa + 9*diag[i] - 9;
656:     vi   = aj + diag[i] - 1;
657:     nz   = diag[i] - ai[i];
658:     idt  = 3*i;
659:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
660:     while (nz--) {
661:       idx   = 3*(*vi--);
662:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
663:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
664:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665:       v -= 9;
666:     }
667:   }

669:   /* copy t into x according to permutation */
670:   ii = 0;
671:   for (i=0; i<n; i++) {
672:     ir      = 3*r[i];
673:     x[ir]   = t[ii];
674:     x[ir+1] = t[ii+1];
675:     x[ir+2] = t[ii+2];
676:     ii += 3;
677:   }

679:   ISRestoreIndices(isrow,&rout);
680:   ISRestoreIndices(iscol,&cout);
681:   VecRestoreArray(bb,&b);
682:   VecRestoreArray(xx,&x);
683:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
684:   return(0);
685: }

687: #undef __FUNCT__  
689: int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690: {
691:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
692:   IS              iscol=a->col,isrow=a->row;
693:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694:   int             *diag = a->diag,ii,ic,ir,oidx;
695:   MatScalar       *aa=a->a,*v;
696:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
697:   PetscScalar     *x,*b,*t;

700:   VecGetArray(bb,&b);
701:   VecGetArray(xx,&x);
702:   t  = a->solve_work;

704:   ISGetIndices(isrow,&rout); r = rout;
705:   ISGetIndices(iscol,&cout); c = cout;

707:   /* copy the b into temp work space according to permutation */
708:   ii = 0;
709:   for (i=0; i<n; i++) {
710:     ic      = 4*c[i];
711:     t[ii]   = b[ic];
712:     t[ii+1] = b[ic+1];
713:     t[ii+2] = b[ic+2];
714:     t[ii+3] = b[ic+3];
715:     ii += 4;
716:   }

718:   /* forward solve the U^T */
719:   idx = 0;
720:   for (i=0; i<n; i++) {

722:     v     = aa + 16*diag[i];
723:     /* multiply by the inverse of the block diagonal */
724:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
725:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
726:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
727:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
728:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729:     v += 16;

731:     vi    = aj + diag[i] + 1;
732:     nz    = ai[i+1] - diag[i] - 1;
733:     while (nz--) {
734:       oidx = 4*(*vi++);
735:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
736:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
737:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739:       v  += 16;
740:     }
741:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742:     idx += 4;
743:   }
744:   /* backward solve the L^T */
745:   for (i=n-1; i>=0; i--){
746:     v    = aa + 16*diag[i] - 16;
747:     vi   = aj + diag[i] - 1;
748:     nz   = diag[i] - ai[i];
749:     idt  = 4*i;
750:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751:     while (nz--) {
752:       idx   = 4*(*vi--);
753:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
754:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
755:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757:       v -= 16;
758:     }
759:   }

761:   /* copy t into x according to permutation */
762:   ii = 0;
763:   for (i=0; i<n; i++) {
764:     ir      = 4*r[i];
765:     x[ir]   = t[ii];
766:     x[ir+1] = t[ii+1];
767:     x[ir+2] = t[ii+2];
768:     x[ir+3] = t[ii+3];
769:     ii += 4;
770:   }

772:   ISRestoreIndices(isrow,&rout);
773:   ISRestoreIndices(iscol,&cout);
774:   VecRestoreArray(bb,&b);
775:   VecRestoreArray(xx,&x);
776:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
777:   return(0);
778: }

780: #undef __FUNCT__  
782: int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783: {
784:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
785:   IS              iscol=a->col,isrow=a->row;
786:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787:   int             *diag = a->diag,ii,ic,ir,oidx;
788:   MatScalar       *aa=a->a,*v;
789:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
790:   PetscScalar     *x,*b,*t;

793:   VecGetArray(bb,&b);
794:   VecGetArray(xx,&x);
795:   t  = a->solve_work;

797:   ISGetIndices(isrow,&rout); r = rout;
798:   ISGetIndices(iscol,&cout); c = cout;

800:   /* copy the b into temp work space according to permutation */
801:   ii = 0;
802:   for (i=0; i<n; i++) {
803:     ic      = 5*c[i];
804:     t[ii]   = b[ic];
805:     t[ii+1] = b[ic+1];
806:     t[ii+2] = b[ic+2];
807:     t[ii+3] = b[ic+3];
808:     t[ii+4] = b[ic+4];
809:     ii += 5;
810:   }

812:   /* forward solve the U^T */
813:   idx = 0;
814:   for (i=0; i<n; i++) {

816:     v     = aa + 25*diag[i];
817:     /* multiply by the inverse of the block diagonal */
818:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
820:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
821:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824:     v += 25;

826:     vi    = aj + diag[i] + 1;
827:     nz    = ai[i+1] - diag[i] - 1;
828:     while (nz--) {
829:       oidx = 5*(*vi++);
830:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
831:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
832:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835:       v  += 25;
836:     }
837:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838:     idx += 5;
839:   }
840:   /* backward solve the L^T */
841:   for (i=n-1; i>=0; i--){
842:     v    = aa + 25*diag[i] - 25;
843:     vi   = aj + diag[i] - 1;
844:     nz   = diag[i] - ai[i];
845:     idt  = 5*i;
846:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847:     while (nz--) {
848:       idx   = 5*(*vi--);
849:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854:       v -= 25;
855:     }
856:   }

858:   /* copy t into x according to permutation */
859:   ii = 0;
860:   for (i=0; i<n; i++) {
861:     ir      = 5*r[i];
862:     x[ir]   = t[ii];
863:     x[ir+1] = t[ii+1];
864:     x[ir+2] = t[ii+2];
865:     x[ir+3] = t[ii+3];
866:     x[ir+4] = t[ii+4];
867:     ii += 5;
868:   }

870:   ISRestoreIndices(isrow,&rout);
871:   ISRestoreIndices(iscol,&cout);
872:   VecRestoreArray(bb,&b);
873:   VecRestoreArray(xx,&x);
874:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
875:   return(0);
876: }

878: #undef __FUNCT__  
880: int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881: {
882:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
883:   IS              iscol=a->col,isrow=a->row;
884:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885:   int             *diag = a->diag,ii,ic,ir,oidx;
886:   MatScalar       *aa=a->a,*v;
887:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
888:   PetscScalar     *x,*b,*t;

891:   VecGetArray(bb,&b);
892:   VecGetArray(xx,&x);
893:   t  = a->solve_work;

895:   ISGetIndices(isrow,&rout); r = rout;
896:   ISGetIndices(iscol,&cout); c = cout;

898:   /* copy the b into temp work space according to permutation */
899:   ii = 0;
900:   for (i=0; i<n; i++) {
901:     ic      = 6*c[i];
902:     t[ii]   = b[ic];
903:     t[ii+1] = b[ic+1];
904:     t[ii+2] = b[ic+2];
905:     t[ii+3] = b[ic+3];
906:     t[ii+4] = b[ic+4];
907:     t[ii+5] = b[ic+5];
908:     ii += 6;
909:   }

911:   /* forward solve the U^T */
912:   idx = 0;
913:   for (i=0; i<n; i++) {

915:     v     = aa + 36*diag[i];
916:     /* multiply by the inverse of the block diagonal */
917:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918:     x6    = t[5+idx];
919:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
920:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
921:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925:     v += 36;

927:     vi    = aj + diag[i] + 1;
928:     nz    = ai[i+1] - diag[i] - 1;
929:     while (nz--) {
930:       oidx = 6*(*vi++);
931:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
932:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
933:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937:       v  += 36;
938:     }
939:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940:     t[5+idx] = s6;
941:     idx += 6;
942:   }
943:   /* backward solve the L^T */
944:   for (i=n-1; i>=0; i--){
945:     v    = aa + 36*diag[i] - 36;
946:     vi   = aj + diag[i] - 1;
947:     nz   = diag[i] - ai[i];
948:     idt  = 6*i;
949:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950:     s6 = t[5+idt];
951:     while (nz--) {
952:       idx   = 6*(*vi--);
953:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
954:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
955:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959:       v -= 36;
960:     }
961:   }

963:   /* copy t into x according to permutation */
964:   ii = 0;
965:   for (i=0; i<n; i++) {
966:     ir      = 6*r[i];
967:     x[ir]   = t[ii];
968:     x[ir+1] = t[ii+1];
969:     x[ir+2] = t[ii+2];
970:     x[ir+3] = t[ii+3];
971:     x[ir+4] = t[ii+4];
972:     x[ir+5] = t[ii+5];
973:     ii += 6;
974:   }

976:   ISRestoreIndices(isrow,&rout);
977:   ISRestoreIndices(iscol,&cout);
978:   VecRestoreArray(bb,&b);
979:   VecRestoreArray(xx,&x);
980:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
981:   return(0);
982: }

984: #undef __FUNCT__  
986: int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987: {
988:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
989:   IS              iscol=a->col,isrow=a->row;
990:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991:   int             *diag = a->diag,ii,ic,ir,oidx;
992:   MatScalar       *aa=a->a,*v;
993:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
994:   PetscScalar     *x,*b,*t;

997:   VecGetArray(bb,&b);
998:   VecGetArray(xx,&x);
999:   t  = a->solve_work;

1001:   ISGetIndices(isrow,&rout); r = rout;
1002:   ISGetIndices(iscol,&cout); c = cout;

1004:   /* copy the b into temp work space according to permutation */
1005:   ii = 0;
1006:   for (i=0; i<n; i++) {
1007:     ic      = 7*c[i];
1008:     t[ii]   = b[ic];
1009:     t[ii+1] = b[ic+1];
1010:     t[ii+2] = b[ic+2];
1011:     t[ii+3] = b[ic+3];
1012:     t[ii+4] = b[ic+4];
1013:     t[ii+5] = b[ic+5];
1014:     t[ii+6] = b[ic+6];
1015:     ii += 7;
1016:   }

1018:   /* forward solve the U^T */
1019:   idx = 0;
1020:   for (i=0; i<n; i++) {

1022:     v     = aa + 49*diag[i];
1023:     /* multiply by the inverse of the block diagonal */
1024:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025:     x6    = t[5+idx]; x7 = t[6+idx];
1026:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1027:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033:     v += 49;

1035:     vi    = aj + diag[i] + 1;
1036:     nz    = ai[i+1] - diag[i] - 1;
1037:     while (nz--) {
1038:       oidx = 7*(*vi++);
1039:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1040:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046:       v  += 49;
1047:     }
1048:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049:     t[5+idx] = s6;t[6+idx] = s7;
1050:     idx += 7;
1051:   }
1052:   /* backward solve the L^T */
1053:   for (i=n-1; i>=0; i--){
1054:     v    = aa + 49*diag[i] - 49;
1055:     vi   = aj + diag[i] - 1;
1056:     nz   = diag[i] - ai[i];
1057:     idt  = 7*i;
1058:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059:     s6 = t[5+idt];s7 = t[6+idt];
1060:     while (nz--) {
1061:       idx   = 7*(*vi--);
1062:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069:       v -= 49;
1070:     }
1071:   }

1073:   /* copy t into x according to permutation */
1074:   ii = 0;
1075:   for (i=0; i<n; i++) {
1076:     ir      = 7*r[i];
1077:     x[ir]   = t[ii];
1078:     x[ir+1] = t[ii+1];
1079:     x[ir+2] = t[ii+2];
1080:     x[ir+3] = t[ii+3];
1081:     x[ir+4] = t[ii+4];
1082:     x[ir+5] = t[ii+5];
1083:     x[ir+6] = t[ii+6];
1084:     ii += 7;
1085:   }

1087:   ISRestoreIndices(isrow,&rout);
1088:   ISRestoreIndices(iscol,&cout);
1089:   VecRestoreArray(bb,&b);
1090:   VecRestoreArray(xx,&x);
1091:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092:   return(0);
1093: }

1095: /* ----------------------------------------------------------- */
1096: #undef __FUNCT__  
1098: int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1099: {
1100:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1101:   IS              iscol=a->col,isrow=a->row;
1102:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1103:   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1104:   MatScalar       *aa=a->a,*v;
1105:   PetscScalar     *x,*b,*s,*t,*ls;

1108:   VecGetArray(bb,&b);
1109:   VecGetArray(xx,&x);
1110:   t  = a->solve_work;

1112:   ISGetIndices(isrow,&rout); r = rout;
1113:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1115:   /* forward solve the lower triangular */
1116:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1117:   for (i=1; i<n; i++) {
1118:     v   = aa + bs2*ai[i];
1119:     vi  = aj + ai[i];
1120:     nz  = a->diag[i] - ai[i];
1121:     s = t + bs*i;
1122:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1123:     while (nz--) {
1124:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1125:       v += bs2;
1126:     }
1127:   }
1128:   /* backward solve the upper triangular */
1129:   ls = a->solve_work + A->n;
1130:   for (i=n-1; i>=0; i--){
1131:     v   = aa + bs2*(a->diag[i] + 1);
1132:     vi  = aj + a->diag[i] + 1;
1133:     nz  = ai[i+1] - a->diag[i] - 1;
1134:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1135:     while (nz--) {
1136:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1137:       v += bs2;
1138:     }
1139:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1140:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1141:   }

1143:   ISRestoreIndices(isrow,&rout);
1144:   ISRestoreIndices(iscol,&cout);
1145:   VecRestoreArray(bb,&b);
1146:   VecRestoreArray(xx,&x);
1147:   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1148:   return(0);
1149: }

1151: #undef __FUNCT__  
1153: int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1154: {
1155:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1156:   IS              iscol=a->col,isrow=a->row;
1157:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1158:   int             *diag = a->diag;
1159:   MatScalar       *aa=a->a,*v;
1160:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1161:   PetscScalar     *x,*b,*t;

1164:   VecGetArray(bb,&b);
1165:   VecGetArray(xx,&x);
1166:   t  = a->solve_work;

1168:   ISGetIndices(isrow,&rout); r = rout;
1169:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1171:   /* forward solve the lower triangular */
1172:   idx    = 7*(*r++);
1173:   t[0] = b[idx];   t[1] = b[1+idx];
1174:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175:   t[5] = b[5+idx]; t[6] = b[6+idx];

1177:   for (i=1; i<n; i++) {
1178:     v     = aa + 49*ai[i];
1179:     vi    = aj + ai[i];
1180:     nz    = diag[i] - ai[i];
1181:     idx   = 7*(*r++);
1182:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1184:     while (nz--) {
1185:       idx   = 7*(*vi++);
1186:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1187:       x4    = t[3+idx];x5 = t[4+idx];
1188:       x6    = t[5+idx];x7 = t[6+idx];
1189:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1196:       v += 49;
1197:     }
1198:     idx = 7*i;
1199:     t[idx]   = s1;t[1+idx] = s2;
1200:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201:     t[5+idx] = s6;t[6+idx] = s7;
1202:   }
1203:   /* backward solve the upper triangular */
1204:   for (i=n-1; i>=0; i--){
1205:     v    = aa + 49*diag[i] + 49;
1206:     vi   = aj + diag[i] + 1;
1207:     nz   = ai[i+1] - diag[i] - 1;
1208:     idt  = 7*i;
1209:     s1 = t[idt];  s2 = t[1+idt];
1210:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211:     s6 = t[5+idt];s7 = t[6+idt];
1212:     while (nz--) {
1213:       idx   = 7*(*vi++);
1214:       x1    = t[idx];   x2 = t[1+idx];
1215:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216:       x6    = t[5+idx]; x7 = t[6+idx];
1217:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1224:       v += 49;
1225:     }
1226:     idc = 7*(*c--);
1227:     v   = aa + 49*diag[i];
1228:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1229:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1242:   }

1244:   ISRestoreIndices(isrow,&rout);
1245:   ISRestoreIndices(iscol,&cout);
1246:   VecRestoreArray(bb,&b);
1247:   VecRestoreArray(xx,&x);
1248:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1249:   return(0);
1250: }

1252: #undef __FUNCT__  
1254: int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1255: {
1256:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1257:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1258:   int             ierr,*diag = a->diag,jdx;
1259:   MatScalar       *aa=a->a,*v;
1260:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

1263:   VecGetArray(bb,&b);
1264:   VecGetArray(xx,&x);
1265:   /* forward solve the lower triangular */
1266:   idx    = 0;
1267:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1268:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1269:   x[6] = b[6+idx];
1270:   for (i=1; i<n; i++) {
1271:     v     =  aa + 49*ai[i];
1272:     vi    =  aj + ai[i];
1273:     nz    =  diag[i] - ai[i];
1274:     idx   =  7*i;
1275:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1276:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277:     s7  =  b[6+idx];
1278:     while (nz--) {
1279:       jdx   = 7*(*vi++);
1280:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1281:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1282:       x7    = x[6+jdx];
1283:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1290:       v += 49;
1291:      }
1292:     x[idx]   = s1;
1293:     x[1+idx] = s2;
1294:     x[2+idx] = s3;
1295:     x[3+idx] = s4;
1296:     x[4+idx] = s5;
1297:     x[5+idx] = s6;
1298:     x[6+idx] = s7;
1299:   }
1300:   /* backward solve the upper triangular */
1301:   for (i=n-1; i>=0; i--){
1302:     v    = aa + 49*diag[i] + 49;
1303:     vi   = aj + diag[i] + 1;
1304:     nz   = ai[i+1] - diag[i] - 1;
1305:     idt  = 7*i;
1306:     s1 = x[idt];   s2 = x[1+idt];
1307:     s3 = x[2+idt]; s4 = x[3+idt];
1308:     s5 = x[4+idt]; s6 = x[5+idt];
1309:     s7 = x[6+idt];
1310:     while (nz--) {
1311:       idx   = 7*(*vi++);
1312:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1313:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1314:       x7    = x[6+idx];
1315:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1322:       v += 49;
1323:     }
1324:     v        = aa + 49*diag[i];
1325:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1326:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1328:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1330:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1332:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1334:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1336:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1338:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1339:   }

1341:   VecRestoreArray(bb,&b);
1342:   VecRestoreArray(xx,&x);
1343:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1344:   return(0);
1345: }

1347: #undef __FUNCT__  
1349: int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1350: {
1351:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1352:   IS              iscol=a->col,isrow=a->row;
1353:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1354:   int             *diag = a->diag;
1355:   MatScalar       *aa=a->a,*v;
1356:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;

1359:   VecGetArray(bb,&b);
1360:   VecGetArray(xx,&x);
1361:   t  = a->solve_work;

1363:   ISGetIndices(isrow,&rout); r = rout;
1364:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1366:   /* forward solve the lower triangular */
1367:   idx    = 6*(*r++);
1368:   t[0] = b[idx];   t[1] = b[1+idx];
1369:   t[2] = b[2+idx]; t[3] = b[3+idx];
1370:   t[4] = b[4+idx]; t[5] = b[5+idx];
1371:   for (i=1; i<n; i++) {
1372:     v     = aa + 36*ai[i];
1373:     vi    = aj + ai[i];
1374:     nz    = diag[i] - ai[i];
1375:     idx   = 6*(*r++);
1376:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377:     s5  = b[4+idx]; s6 = b[5+idx];
1378:     while (nz--) {
1379:       idx   = 6*(*vi++);
1380:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1381:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1388:       v += 36;
1389:     }
1390:     idx = 6*i;
1391:     t[idx]   = s1;t[1+idx] = s2;
1392:     t[2+idx] = s3;t[3+idx] = s4;
1393:     t[4+idx] = s5;t[5+idx] = s6;
1394:   }
1395:   /* backward solve the upper triangular */
1396:   for (i=n-1; i>=0; i--){
1397:     v    = aa + 36*diag[i] + 36;
1398:     vi   = aj + diag[i] + 1;
1399:     nz   = ai[i+1] - diag[i] - 1;
1400:     idt  = 6*i;
1401:     s1 = t[idt];  s2 = t[1+idt];
1402:     s3 = t[2+idt];s4 = t[3+idt];
1403:     s5 = t[4+idt];s6 = t[5+idt];
1404:     while (nz--) {
1405:       idx   = 6*(*vi++);
1406:       x1    = t[idx];   x2 = t[1+idx];
1407:       x3    = t[2+idx]; x4 = t[3+idx];
1408:       x5    = t[4+idx]; x6 = t[5+idx];
1409:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415:       v += 36;
1416:     }
1417:     idc = 6*(*c--);
1418:     v   = aa + 36*diag[i];
1419:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1420:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1421:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1423:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1425:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1427:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1429:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1431:   }

1433:   ISRestoreIndices(isrow,&rout);
1434:   ISRestoreIndices(iscol,&cout);
1435:   VecRestoreArray(bb,&b);
1436:   VecRestoreArray(xx,&x);
1437:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1438:   return(0);
1439: }

1441: #undef __FUNCT__  
1443: int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1444: {
1445:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1446:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1447:   int             ierr,*diag = a->diag,jdx;
1448:   MatScalar       *aa=a->a,*v;
1449:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

1452:   VecGetArray(bb,&b);
1453:   VecGetArray(xx,&x);
1454:   /* forward solve the lower triangular */
1455:   idx    = 0;
1456:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1457:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1458:   for (i=1; i<n; i++) {
1459:     v     =  aa + 36*ai[i];
1460:     vi    =  aj + ai[i];
1461:     nz    =  diag[i] - ai[i];
1462:     idx   =  6*i;
1463:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1464:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1465:     while (nz--) {
1466:       jdx   = 6*(*vi++);
1467:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1468:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1475:       v += 36;
1476:      }
1477:     x[idx]   = s1;
1478:     x[1+idx] = s2;
1479:     x[2+idx] = s3;
1480:     x[3+idx] = s4;
1481:     x[4+idx] = s5;
1482:     x[5+idx] = s6;
1483:   }
1484:   /* backward solve the upper triangular */
1485:   for (i=n-1; i>=0; i--){
1486:     v    = aa + 36*diag[i] + 36;
1487:     vi   = aj + diag[i] + 1;
1488:     nz   = ai[i+1] - diag[i] - 1;
1489:     idt  = 6*i;
1490:     s1 = x[idt];   s2 = x[1+idt];
1491:     s3 = x[2+idt]; s4 = x[3+idt];
1492:     s5 = x[4+idt]; s6 = x[5+idt];
1493:     while (nz--) {
1494:       idx   = 6*(*vi++);
1495:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1496:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1503:       v += 36;
1504:     }
1505:     v        = aa + 36*diag[i];
1506:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1512:   }

1514:   VecRestoreArray(bb,&b);
1515:   VecRestoreArray(xx,&x);
1516:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1517:   return(0);
1518: }

1520: #undef __FUNCT__  
1522: int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1523: {
1524:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1525:   IS              iscol=a->col,isrow=a->row;
1526:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1527:   int             *diag = a->diag;
1528:   MatScalar       *aa=a->a,*v;
1529:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;

1532:   VecGetArray(bb,&b);
1533:   VecGetArray(xx,&x);
1534:   t  = a->solve_work;

1536:   ISGetIndices(isrow,&rout); r = rout;
1537:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1539:   /* forward solve the lower triangular */
1540:   idx    = 5*(*r++);
1541:   t[0] = b[idx];   t[1] = b[1+idx];
1542:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1543:   for (i=1; i<n; i++) {
1544:     v     = aa + 25*ai[i];
1545:     vi    = aj + ai[i];
1546:     nz    = diag[i] - ai[i];
1547:     idx   = 5*(*r++);
1548:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549:     s5  = b[4+idx];
1550:     while (nz--) {
1551:       idx   = 5*(*vi++);
1552:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1553:       x4    = t[3+idx];x5 = t[4+idx];
1554:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1559:       v += 25;
1560:     }
1561:     idx = 5*i;
1562:     t[idx]   = s1;t[1+idx] = s2;
1563:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1564:   }
1565:   /* backward solve the upper triangular */
1566:   for (i=n-1; i>=0; i--){
1567:     v    = aa + 25*diag[i] + 25;
1568:     vi   = aj + diag[i] + 1;
1569:     nz   = ai[i+1] - diag[i] - 1;
1570:     idt  = 5*i;
1571:     s1 = t[idt];  s2 = t[1+idt];
1572:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1573:     while (nz--) {
1574:       idx   = 5*(*vi++);
1575:       x1    = t[idx];   x2 = t[1+idx];
1576:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582:       v += 25;
1583:     }
1584:     idc = 5*(*c--);
1585:     v   = aa + 25*diag[i];
1586:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1587:                                  v[15]*s4+v[20]*s5;
1588:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589:                                  v[16]*s4+v[21]*s5;
1590:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591:                                  v[17]*s4+v[22]*s5;
1592:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593:                                  v[18]*s4+v[23]*s5;
1594:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595:                                  v[19]*s4+v[24]*s5;
1596:   }

1598:   ISRestoreIndices(isrow,&rout);
1599:   ISRestoreIndices(iscol,&cout);
1600:   VecRestoreArray(bb,&b);
1601:   VecRestoreArray(xx,&x);
1602:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1603:   return(0);
1604: }

1606: #undef __FUNCT__  
1608: int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1609: {
1610:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1611:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1612:   int             ierr,*diag = a->diag,jdx;
1613:   MatScalar       *aa=a->a,*v;
1614:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;

1617:   VecGetArray(bb,&b);
1618:   VecGetArray(xx,&x);
1619:   /* forward solve the lower triangular */
1620:   idx    = 0;
1621:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1622:   for (i=1; i<n; i++) {
1623:     v     =  aa + 25*ai[i];
1624:     vi    =  aj + ai[i];
1625:     nz    =  diag[i] - ai[i];
1626:     idx   =  5*i;
1627:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1628:     while (nz--) {
1629:       jdx   = 5*(*vi++);
1630:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1632:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1633:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1634:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1635:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1636:       v    += 25;
1637:     }
1638:     x[idx]   = s1;
1639:     x[1+idx] = s2;
1640:     x[2+idx] = s3;
1641:     x[3+idx] = s4;
1642:     x[4+idx] = s5;
1643:   }
1644:   /* backward solve the upper triangular */
1645:   for (i=n-1; i>=0; i--){
1646:     v    = aa + 25*diag[i] + 25;
1647:     vi   = aj + diag[i] + 1;
1648:     nz   = ai[i+1] - diag[i] - 1;
1649:     idt  = 5*i;
1650:     s1 = x[idt];  s2 = x[1+idt];
1651:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1652:     while (nz--) {
1653:       idx   = 5*(*vi++);
1654:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1656:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1657:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1658:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1659:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1660:       v    += 25;
1661:     }
1662:     v        = aa + 25*diag[i];
1663:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1664:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1665:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1666:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1667:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1668:   }

1670:   VecRestoreArray(bb,&b);
1671:   VecRestoreArray(xx,&x);
1672:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1673:   return(0);
1674: }

1676: #undef __FUNCT__  
1678: int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1679: {
1680:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1681:   IS              iscol=a->col,isrow=a->row;
1682:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1683:   int             *diag = a->diag;
1684:   MatScalar       *aa=a->a,*v;
1685:   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;

1688:   VecGetArray(bb,&b);
1689:   VecGetArray(xx,&x);
1690:   t  = a->solve_work;

1692:   ISGetIndices(isrow,&rout); r = rout;
1693:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1695:   /* forward solve the lower triangular */
1696:   idx    = 4*(*r++);
1697:   t[0] = b[idx];   t[1] = b[1+idx];
1698:   t[2] = b[2+idx]; t[3] = b[3+idx];
1699:   for (i=1; i<n; i++) {
1700:     v     = aa + 16*ai[i];
1701:     vi    = aj + ai[i];
1702:     nz    = diag[i] - ai[i];
1703:     idx   = 4*(*r++);
1704:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1705:     while (nz--) {
1706:       idx   = 4*(*vi++);
1707:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1709:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1710:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1712:       v    += 16;
1713:     }
1714:     idx        = 4*i;
1715:     t[idx]   = s1;t[1+idx] = s2;
1716:     t[2+idx] = s3;t[3+idx] = s4;
1717:   }
1718:   /* backward solve the upper triangular */
1719:   for (i=n-1; i>=0; i--){
1720:     v    = aa + 16*diag[i] + 16;
1721:     vi   = aj + diag[i] + 1;
1722:     nz   = ai[i+1] - diag[i] - 1;
1723:     idt  = 4*i;
1724:     s1 = t[idt];  s2 = t[1+idt];
1725:     s3 = t[2+idt];s4 = t[3+idt];
1726:     while (nz--) {
1727:       idx   = 4*(*vi++);
1728:       x1    = t[idx];   x2 = t[1+idx];
1729:       x3    = t[2+idx]; x4 = t[3+idx];
1730:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1731:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1732:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1733:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1734:       v += 16;
1735:     }
1736:     idc      = 4*(*c--);
1737:     v        = aa + 16*diag[i];
1738:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1742:   }

1744:   ISRestoreIndices(isrow,&rout);
1745:   ISRestoreIndices(iscol,&cout);
1746:   VecRestoreArray(bb,&b);
1747:   VecRestoreArray(xx,&x);
1748:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1749:   return(0);
1750: }

1752: #undef __FUNCT__  
1754: int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1755: {
1756:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1757:   IS              iscol=a->col,isrow=a->row;
1758:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1759:   int             *diag = a->diag;
1760:   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1761:   PetscScalar     *x,*b;

1764:   VecGetArray(bb,&b);
1765:   VecGetArray(xx,&x);
1766:   t  = (MatScalar *)a->solve_work;

1768:   ISGetIndices(isrow,&rout); r = rout;
1769:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1771:   /* forward solve the lower triangular */
1772:   idx    = 4*(*r++);
1773:   t[0] = (MatScalar)b[idx];
1774:   t[1] = (MatScalar)b[1+idx];
1775:   t[2] = (MatScalar)b[2+idx];
1776:   t[3] = (MatScalar)b[3+idx];
1777:   for (i=1; i<n; i++) {
1778:     v     = aa + 16*ai[i];
1779:     vi    = aj + ai[i];
1780:     nz    = diag[i] - ai[i];
1781:     idx   = 4*(*r++);
1782:     s1 = (MatScalar)b[idx];
1783:     s2 = (MatScalar)b[1+idx];
1784:     s3 = (MatScalar)b[2+idx];
1785:     s4 = (MatScalar)b[3+idx];
1786:     while (nz--) {
1787:       idx   = 4*(*vi++);
1788:       x1  = t[idx];
1789:       x2  = t[1+idx];
1790:       x3  = t[2+idx];
1791:       x4  = t[3+idx];
1792:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1793:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1794:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1795:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1796:       v    += 16;
1797:     }
1798:     idx        = 4*i;
1799:     t[idx]   = s1;
1800:     t[1+idx] = s2;
1801:     t[2+idx] = s3;
1802:     t[3+idx] = s4;
1803:   }
1804:   /* backward solve the upper triangular */
1805:   for (i=n-1; i>=0; i--){
1806:     v    = aa + 16*diag[i] + 16;
1807:     vi   = aj + diag[i] + 1;
1808:     nz   = ai[i+1] - diag[i] - 1;
1809:     idt  = 4*i;
1810:     s1 = t[idt];
1811:     s2 = t[1+idt];
1812:     s3 = t[2+idt];
1813:     s4 = t[3+idt];
1814:     while (nz--) {
1815:       idx   = 4*(*vi++);
1816:       x1  = t[idx];
1817:       x2  = t[1+idx];
1818:       x3  = t[2+idx];
1819:       x4  = t[3+idx];
1820:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1821:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1822:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1823:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1824:       v += 16;
1825:     }
1826:     idc      = 4*(*c--);
1827:     v        = aa + 16*diag[i];
1828:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1829:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1830:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1831:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1832:     x[idc]   = (PetscScalar)t[idt];
1833:     x[1+idc] = (PetscScalar)t[1+idt];
1834:     x[2+idc] = (PetscScalar)t[2+idt];
1835:     x[3+idc] = (PetscScalar)t[3+idt];
1836:  }

1838:   ISRestoreIndices(isrow,&rout);
1839:   ISRestoreIndices(iscol,&cout);
1840:   VecRestoreArray(bb,&b);
1841:   VecRestoreArray(xx,&x);
1842:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1843:   return(0);
1844: }

1846: #if defined (PETSC_HAVE_SSE)

1848: #include PETSC_HAVE_SSE

1850: #undef __FUNCT__
1852: int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1853: {
1854:   /* 
1855:      Note: This code uses demotion of double
1856:      to float when performing the mixed-mode computation.
1857:      This may not be numerically reasonable for all applications.
1858:   */
1859:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1860:   IS              iscol=a->col,isrow=a->row;
1861:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1862:   int             *diag = a->diag,ai16;
1863:   MatScalar       *aa=a->a,*v;
1864:   PetscScalar     *x,*b,*t;

1866:   /* Make space in temp stack for 16 Byte Aligned arrays */
1867:   float           ssealignedspace[11],*tmps,*tmpx;
1868:   unsigned long   offset;
1869: 
1871:   SSE_SCOPE_BEGIN;

1873:     offset = (unsigned long)ssealignedspace % 16;
1874:     if (offset) offset = (16 - offset)/4;
1875:     tmps = &ssealignedspace[offset];
1876:     tmpx = &ssealignedspace[offset+4];
1877:     PREFETCH_NTA(aa+16*ai[1]);

1879:     VecGetArray(bb,&b);
1880:     VecGetArray(xx,&x);
1881:     t  = a->solve_work;

1883:     ISGetIndices(isrow,&rout); r = rout;
1884:     ISGetIndices(iscol,&cout); c = cout + (n-1);

1886:     /* forward solve the lower triangular */
1887:     idx  = 4*(*r++);
1888:     t[0] = b[idx];   t[1] = b[1+idx];
1889:     t[2] = b[2+idx]; t[3] = b[3+idx];
1890:     v    =  aa + 16*ai[1];

1892:     for (i=1; i<n;) {
1893:       PREFETCH_NTA(&v[8]);
1894:       vi   =  aj      + ai[i];
1895:       nz   =  diag[i] - ai[i];
1896:       idx  =  4*(*r++);

1898:       /* Demote sum from double to float */
1899:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1900:       LOAD_PS(tmps,XMM7);

1902:       while (nz--) {
1903:         PREFETCH_NTA(&v[16]);
1904:         idx = 4*(*vi++);
1905: 
1906:         /* Demote solution (so far) from double to float */
1907:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

1909:         /* 4x4 Matrix-Vector product with negative accumulation: */
1910:         SSE_INLINE_BEGIN_2(tmpx,v)
1911:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1913:           /* First Column */
1914:           SSE_COPY_PS(XMM0,XMM6)
1915:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1916:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1917:           SSE_SUB_PS(XMM7,XMM0)
1918: 
1919:           /* Second Column */
1920:           SSE_COPY_PS(XMM1,XMM6)
1921:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1922:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1923:           SSE_SUB_PS(XMM7,XMM1)
1924: 
1925:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1926: 
1927:           /* Third Column */
1928:           SSE_COPY_PS(XMM2,XMM6)
1929:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1930:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1931:           SSE_SUB_PS(XMM7,XMM2)

1933:           /* Fourth Column */
1934:           SSE_COPY_PS(XMM3,XMM6)
1935:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1936:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1937:           SSE_SUB_PS(XMM7,XMM3)
1938:         SSE_INLINE_END_2
1939: 
1940:         v  += 16;
1941:       }
1942:       idx = 4*i;
1943:       v   = aa + 16*ai[++i];
1944:       PREFETCH_NTA(v);
1945:       STORE_PS(tmps,XMM7);

1947:       /* Promote result from float to double */
1948:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1949:     }
1950:     /* backward solve the upper triangular */
1951:     idt  = 4*(n-1);
1952:     ai16 = 16*diag[n-1];
1953:     v    = aa + ai16 + 16;
1954:     for (i=n-1; i>=0;){
1955:       PREFETCH_NTA(&v[8]);
1956:       vi = aj + diag[i] + 1;
1957:       nz = ai[i+1] - diag[i] - 1;
1958: 
1959:       /* Demote accumulator from double to float */
1960:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1961:       LOAD_PS(tmps,XMM7);

1963:       while (nz--) {
1964:         PREFETCH_NTA(&v[16]);
1965:         idx = 4*(*vi++);

1967:         /* Demote solution (so far) from double to float */
1968:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

1970:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1971:         SSE_INLINE_BEGIN_2(tmpx,v)
1972:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1974:           /* First Column */
1975:           SSE_COPY_PS(XMM0,XMM6)
1976:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1977:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1978:           SSE_SUB_PS(XMM7,XMM0)

1980:           /* Second Column */
1981:           SSE_COPY_PS(XMM1,XMM6)
1982:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1983:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1984:           SSE_SUB_PS(XMM7,XMM1)

1986:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1987: 
1988:           /* Third Column */
1989:           SSE_COPY_PS(XMM2,XMM6)
1990:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1991:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1992:           SSE_SUB_PS(XMM7,XMM2)

1994:           /* Fourth Column */
1995:           SSE_COPY_PS(XMM3,XMM6)
1996:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1997:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1998:           SSE_SUB_PS(XMM7,XMM3)
1999:         SSE_INLINE_END_2
2000:         v  += 16;
2001:       }
2002:       v    = aa + ai16;
2003:       ai16 = 16*diag[--i];
2004:       PREFETCH_NTA(aa+ai16+16);
2005:       /* 
2006:          Scale the result by the diagonal 4x4 block, 
2007:          which was inverted as part of the factorization
2008:       */
2009:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2010:         /* First Column */
2011:         SSE_COPY_PS(XMM0,XMM7)
2012:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2013:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2015:         /* Second Column */
2016:         SSE_COPY_PS(XMM1,XMM7)
2017:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2018:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2019:         SSE_ADD_PS(XMM0,XMM1)

2021:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2022: 
2023:         /* Third Column */
2024:         SSE_COPY_PS(XMM2,XMM7)
2025:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2026:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2027:         SSE_ADD_PS(XMM0,XMM2)

2029:         /* Fourth Column */
2030:         SSE_COPY_PS(XMM3,XMM7)
2031:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2032:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2033:         SSE_ADD_PS(XMM0,XMM3)
2034: 
2035:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2036:       SSE_INLINE_END_3

2038:       /* Promote solution from float to double */
2039:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

2041:       /* Apply reordering to t and stream into x.    */
2042:       /* This way, x doesn't pollute the cache.      */
2043:       /* Be careful with size: 2 doubles = 4 floats! */
2044:       idc  = 4*(*c--);
2045:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2046:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2047:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2048:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2049:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2050:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2051:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2052:       SSE_INLINE_END_2
2053:       v    = aa + ai16 + 16;
2054:       idt -= 4;
2055:     }

2057:     ISRestoreIndices(isrow,&rout);
2058:     ISRestoreIndices(iscol,&cout);
2059:     VecRestoreArray(bb,&b);
2060:     VecRestoreArray(xx,&x);
2061:     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2062:   SSE_SCOPE_END;
2063:   return(0);
2064: }

2066: #endif


2069: /*
2070:       Special case where the matrix was ILU(0) factored in the natural
2071:    ordering. This eliminates the need for the column and row permutation.
2072: */
2073: #undef __FUNCT__  
2075: int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2076: {
2077:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2078:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2079:   int             ierr,*diag = a->diag;
2080:   MatScalar       *aa=a->a;
2081:   PetscScalar     *x,*b;

2084:   VecGetArray(bb,&b);
2085:   VecGetArray(xx,&x);

2087: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2088:   {
2089:     static PetscScalar w[2000]; /* very BAD need to fix */
2090:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2091:   }
2092: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2093:   {
2094:     static PetscScalar w[2000]; /* very BAD need to fix */
2095:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2096:   }
2097: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2098:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2099: #else
2100:   {
2101:     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2102:     MatScalar    *v;
2103:     int          jdx,idt,idx,nz,*vi,i,ai16;

2105:   /* forward solve the lower triangular */
2106:   idx    = 0;
2107:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2108:   for (i=1; i<n; i++) {
2109:     v     =  aa      + 16*ai[i];
2110:     vi    =  aj      + ai[i];
2111:     nz    =  diag[i] - ai[i];
2112:     idx   +=  4;
2113:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2114:     while (nz--) {
2115:       jdx   = 4*(*vi++);
2116:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2117:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2118:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2119:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2120:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2121:       v    += 16;
2122:     }
2123:     x[idx]   = s1;
2124:     x[1+idx] = s2;
2125:     x[2+idx] = s3;
2126:     x[3+idx] = s4;
2127:   }
2128:   /* backward solve the upper triangular */
2129:   idt = 4*(n-1);
2130:   for (i=n-1; i>=0; i--){
2131:     ai16 = 16*diag[i];
2132:     v    = aa + ai16 + 16;
2133:     vi   = aj + diag[i] + 1;
2134:     nz   = ai[i+1] - diag[i] - 1;
2135:     s1 = x[idt];  s2 = x[1+idt];
2136:     s3 = x[2+idt];s4 = x[3+idt];
2137:     while (nz--) {
2138:       idx   = 4*(*vi++);
2139:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2140:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2141:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2142:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2143:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2144:       v    += 16;
2145:     }
2146:     v        = aa + ai16;
2147:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2148:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2149:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2150:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2151:     idt -= 4;
2152:   }
2153:   }
2154: #endif

2156:   VecRestoreArray(bb,&b);
2157:   VecRestoreArray(xx,&x);
2158:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2159:   return(0);
2160: }

2162: #undef __FUNCT__  
2164: int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2165: {
2166:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2167:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2168:   int             ierr,*diag = a->diag;
2169:   MatScalar       *aa=a->a;
2170:   PetscScalar     *x,*b;

2173:   VecGetArray(bb,&b);
2174:   VecGetArray(xx,&x);

2176:   {
2177:     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2178:     MatScalar  *v,*t=(MatScalar *)x;
2179:     int        jdx,idt,idx,nz,*vi,i,ai16;

2181:     /* forward solve the lower triangular */
2182:     idx  = 0;
2183:     t[0] = (MatScalar)b[0];
2184:     t[1] = (MatScalar)b[1];
2185:     t[2] = (MatScalar)b[2];
2186:     t[3] = (MatScalar)b[3];
2187:     for (i=1; i<n; i++) {
2188:       v     =  aa      + 16*ai[i];
2189:       vi    =  aj      + ai[i];
2190:       nz    =  diag[i] - ai[i];
2191:       idx   +=  4;
2192:       s1 = (MatScalar)b[idx];
2193:       s2 = (MatScalar)b[1+idx];
2194:       s3 = (MatScalar)b[2+idx];
2195:       s4 = (MatScalar)b[3+idx];
2196:       while (nz--) {
2197:         jdx = 4*(*vi++);
2198:         x1  = t[jdx];
2199:         x2  = t[1+jdx];
2200:         x3  = t[2+jdx];
2201:         x4  = t[3+jdx];
2202:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2203:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2204:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2205:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2206:         v    += 16;
2207:       }
2208:       t[idx]   = s1;
2209:       t[1+idx] = s2;
2210:       t[2+idx] = s3;
2211:       t[3+idx] = s4;
2212:     }
2213:     /* backward solve the upper triangular */
2214:     idt = 4*(n-1);
2215:     for (i=n-1; i>=0; i--){
2216:       ai16 = 16*diag[i];
2217:       v    = aa + ai16 + 16;
2218:       vi   = aj + diag[i] + 1;
2219:       nz   = ai[i+1] - diag[i] - 1;
2220:       s1   = t[idt];
2221:       s2   = t[1+idt];
2222:       s3   = t[2+idt];
2223:       s4   = t[3+idt];
2224:       while (nz--) {
2225:         idx = 4*(*vi++);
2226:         x1  = (MatScalar)x[idx];
2227:         x2  = (MatScalar)x[1+idx];
2228:         x3  = (MatScalar)x[2+idx];
2229:         x4  = (MatScalar)x[3+idx];
2230:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2231:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2232:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2234:         v    += 16;
2235:       }
2236:       v        = aa + ai16;
2237:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2238:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2239:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2240:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2241:       idt -= 4;
2242:     }
2243:   }

2245:   VecRestoreArray(bb,&b);
2246:   VecRestoreArray(xx,&x);
2247:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2248:   return(0);
2249: }

2251: #if defined (PETSC_HAVE_SSE)

2253: #include PETSC_HAVE_SSE
2254: #include "src/vec/vecimpl.h" /* to allow VecGetArrayFast() */
2255: #undef __FUNCT__
2257: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2258: {
2259:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2260:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2261:   int             ierr,*diag = a->diag;
2262:   MatScalar       *aa=a->a;
2263:   PetscScalar     *x,*b;

2266:   SSE_SCOPE_BEGIN;
2267:   /* 
2268:      Note: This code currently uses demotion of double
2269:      to float when performing the mixed-mode computation.
2270:      This may not be numerically reasonable for all applications.
2271:   */
2272:   PREFETCH_NTA(aa+16*ai[1]);

2274:   VecGetArrayFast(bb,&b);
2275:   VecGetArrayFast(xx,&x);
2276:   {
2277:     /* x will first be computed in single precision then promoted inplace to double */
2278:     MatScalar     *v,*t=(MatScalar *)x;
2279:     int           jdx,idt,idx,nz,*vi,i,ai16;

2281:     /* Forward solve the lower triangular factor. */

2283:     /* First block is the identity. */
2284:     idx  = 0;
2285:     CONVERT_DOUBLE4_FLOAT4(t,b);
2286:     v    =  aa + 16*ai[1];

2288:     for (i=1; i<n;) {
2289:       PREFETCH_NTA(&v[8]);
2290:       vi   =  aj      + ai[i];
2291:       nz   =  diag[i] - ai[i];
2292:       idx +=  4;

2294:       /* Demote RHS from double to float. */
2295:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2296:       LOAD_PS(&t[idx],XMM7);

2298:       while (nz--) {
2299:         PREFETCH_NTA(&v[16]);
2300:         jdx = *vi++;
2301: 
2302:         /* 4x4 Matrix-Vector product with negative accumulation: */
2303:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2304:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2306:           /* First Column */
2307:           SSE_COPY_PS(XMM0,XMM6)
2308:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2309:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2310:           SSE_SUB_PS(XMM7,XMM0)

2312:           /* Second Column */
2313:           SSE_COPY_PS(XMM1,XMM6)
2314:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2315:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2316:           SSE_SUB_PS(XMM7,XMM1)

2318:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2319: 
2320:           /* Third Column */
2321:           SSE_COPY_PS(XMM2,XMM6)
2322:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2323:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2324:           SSE_SUB_PS(XMM7,XMM2)

2326:           /* Fourth Column */
2327:           SSE_COPY_PS(XMM3,XMM6)
2328:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2329:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2330:           SSE_SUB_PS(XMM7,XMM3)
2331:         SSE_INLINE_END_2
2332: 
2333:         v  += 16;
2334:       }
2335:       v    =  aa + 16*ai[++i];
2336:       PREFETCH_NTA(v);
2337:       STORE_PS(&t[idx],XMM7);
2338:     }

2340:     /* Backward solve the upper triangular factor.*/

2342:     idt  = 4*(n-1);
2343:     ai16 = 16*diag[n-1];
2344:     v    = aa + ai16 + 16;
2345:     for (i=n-1; i>=0;){
2346:       PREFETCH_NTA(&v[8]);
2347:       vi = aj + diag[i] + 1;
2348:       nz = ai[i+1] - diag[i] - 1;
2349: 
2350:       LOAD_PS(&t[idt],XMM7);

2352:       while (nz--) {
2353:         PREFETCH_NTA(&v[16]);
2354:         idx = *vi++;

2356:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2357:         SSE_INLINE_BEGIN_2(&t[idx],v)
2358:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2360:           /* First Column */
2361:           SSE_COPY_PS(XMM0,XMM6)
2362:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2363:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2364:           SSE_SUB_PS(XMM7,XMM0)

2366:           /* Second Column */
2367:           SSE_COPY_PS(XMM1,XMM6)
2368:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2369:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2370:           SSE_SUB_PS(XMM7,XMM1)

2372:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2373: 
2374:           /* Third Column */
2375:           SSE_COPY_PS(XMM2,XMM6)
2376:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2377:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2378:           SSE_SUB_PS(XMM7,XMM2)

2380:           /* Fourth Column */
2381:           SSE_COPY_PS(XMM3,XMM6)
2382:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2383:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2384:           SSE_SUB_PS(XMM7,XMM3)
2385:         SSE_INLINE_END_2
2386:         v  += 16;
2387:       }
2388:       v    = aa + ai16;
2389:       ai16 = 16*diag[--i];
2390:       PREFETCH_NTA(aa+ai16+16);
2391:       /* 
2392:          Scale the result by the diagonal 4x4 block, 
2393:          which was inverted as part of the factorization
2394:       */
2395:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2396:         /* First Column */
2397:         SSE_COPY_PS(XMM0,XMM7)
2398:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2399:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2401:         /* Second Column */
2402:         SSE_COPY_PS(XMM1,XMM7)
2403:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2404:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2405:         SSE_ADD_PS(XMM0,XMM1)

2407:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2408: 
2409:         /* Third Column */
2410:         SSE_COPY_PS(XMM2,XMM7)
2411:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2412:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2413:         SSE_ADD_PS(XMM0,XMM2)

2415:         /* Fourth Column */
2416:         SSE_COPY_PS(XMM3,XMM7)
2417:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2418:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2419:         SSE_ADD_PS(XMM0,XMM3)

2421:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2422:       SSE_INLINE_END_3

2424:       v    = aa + ai16 + 16;
2425:       idt -= 4;
2426:     }

2428:     /* Convert t from single precision back to double precision (inplace)*/
2429:     idt = 4*(n-1);
2430:     for (i=n-1;i>=0;i--) {
2431:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2432:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2433:       PetscScalar *xtemp=&x[idt];
2434:       MatScalar   *ttemp=&t[idt];
2435:       xtemp[3] = (PetscScalar)ttemp[3];
2436:       xtemp[2] = (PetscScalar)ttemp[2];
2437:       xtemp[1] = (PetscScalar)ttemp[1];
2438:       xtemp[0] = (PetscScalar)ttemp[0];
2439:       idt -= 4;
2440:     }

2442:   } /* End of artificial scope. */
2443:   VecRestoreArrayFast(bb,&b);
2444:   VecRestoreArrayFast(xx,&x);
2445:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2446:   SSE_SCOPE_END;
2447:   return(0);
2448: }

2450: #endif
2451: #undef __FUNCT__  
2453: int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2454: {
2455:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2456:   IS              iscol=a->col,isrow=a->row;
2457:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2458:   int             *diag = a->diag;
2459:   MatScalar       *aa=a->a,*v;
2460:   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;

2463:   VecGetArray(bb,&b);
2464:   VecGetArray(xx,&x);
2465:   t  = a->solve_work;

2467:   ISGetIndices(isrow,&rout); r = rout;
2468:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2470:   /* forward solve the lower triangular */
2471:   idx    = 3*(*r++);
2472:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2473:   for (i=1; i<n; i++) {
2474:     v     = aa + 9*ai[i];
2475:     vi    = aj + ai[i];
2476:     nz    = diag[i] - ai[i];
2477:     idx   = 3*(*r++);
2478:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2479:     while (nz--) {
2480:       idx   = 3*(*vi++);
2481:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2482:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2483:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2484:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2485:       v += 9;
2486:     }
2487:     idx = 3*i;
2488:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2489:   }
2490:   /* backward solve the upper triangular */
2491:   for (i=n-1; i>=0; i--){
2492:     v    = aa + 9*diag[i] + 9;
2493:     vi   = aj + diag[i] + 1;
2494:     nz   = ai[i+1] - diag[i] - 1;
2495:     idt  = 3*i;
2496:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2497:     while (nz--) {
2498:       idx   = 3*(*vi++);
2499:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2500:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2501:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2502:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2503:       v += 9;
2504:     }
2505:     idc = 3*(*c--);
2506:     v   = aa + 9*diag[i];
2507:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2508:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2509:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2510:   }
2511:   ISRestoreIndices(isrow,&rout);
2512:   ISRestoreIndices(iscol,&cout);
2513:   VecRestoreArray(bb,&b);
2514:   VecRestoreArray(xx,&x);
2515:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2516:   return(0);
2517: }

2519: /*
2520:       Special case where the matrix was ILU(0) factored in the natural
2521:    ordering. This eliminates the need for the column and row permutation.
2522: */
2523: #undef __FUNCT__  
2525: int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2526: {
2527:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2528:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2529:   int             ierr,*diag = a->diag;
2530:   MatScalar       *aa=a->a,*v;
2531:   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2532:   int             jdx,idt,idx,nz,*vi,i;

2535:   VecGetArray(bb,&b);
2536:   VecGetArray(xx,&x);


2539:   /* forward solve the lower triangular */
2540:   idx    = 0;
2541:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2542:   for (i=1; i<n; i++) {
2543:     v     =  aa      + 9*ai[i];
2544:     vi    =  aj      + ai[i];
2545:     nz    =  diag[i] - ai[i];
2546:     idx   +=  3;
2547:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2548:     while (nz--) {
2549:       jdx   = 3*(*vi++);
2550:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2551:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2552:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2553:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2554:       v    += 9;
2555:     }
2556:     x[idx]   = s1;
2557:     x[1+idx] = s2;
2558:     x[2+idx] = s3;
2559:   }
2560:   /* backward solve the upper triangular */
2561:   for (i=n-1; i>=0; i--){
2562:     v    = aa + 9*diag[i] + 9;
2563:     vi   = aj + diag[i] + 1;
2564:     nz   = ai[i+1] - diag[i] - 1;
2565:     idt  = 3*i;
2566:     s1 = x[idt];  s2 = x[1+idt];
2567:     s3 = x[2+idt];
2568:     while (nz--) {
2569:       idx   = 3*(*vi++);
2570:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2571:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2572:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2573:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2574:       v    += 9;
2575:     }
2576:     v        = aa +  9*diag[i];
2577:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2578:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2579:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2580:   }

2582:   VecRestoreArray(bb,&b);
2583:   VecRestoreArray(xx,&x);
2584:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2585:   return(0);
2586: }

2588: #undef __FUNCT__  
2590: int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2591: {
2592:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2593:   IS              iscol=a->col,isrow=a->row;
2594:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2595:   int             *diag = a->diag;
2596:   MatScalar       *aa=a->a,*v;
2597:   PetscScalar     *x,*b,s1,s2,x1,x2,*t;

2600:   VecGetArray(bb,&b);
2601:   VecGetArray(xx,&x);
2602:   t  = a->solve_work;

2604:   ISGetIndices(isrow,&rout); r = rout;
2605:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2607:   /* forward solve the lower triangular */
2608:   idx    = 2*(*r++);
2609:   t[0] = b[idx]; t[1] = b[1+idx];
2610:   for (i=1; i<n; i++) {
2611:     v     = aa + 4*ai[i];
2612:     vi    = aj + ai[i];
2613:     nz    = diag[i] - ai[i];
2614:     idx   = 2*(*r++);
2615:     s1  = b[idx]; s2 = b[1+idx];
2616:     while (nz--) {
2617:       idx   = 2*(*vi++);
2618:       x1    = t[idx]; x2 = t[1+idx];
2619:       s1 -= v[0]*x1 + v[2]*x2;
2620:       s2 -= v[1]*x1 + v[3]*x2;
2621:       v += 4;
2622:     }
2623:     idx = 2*i;
2624:     t[idx] = s1; t[1+idx] = s2;
2625:   }
2626:   /* backward solve the upper triangular */
2627:   for (i=n-1; i>=0; i--){
2628:     v    = aa + 4*diag[i] + 4;
2629:     vi   = aj + diag[i] + 1;
2630:     nz   = ai[i+1] - diag[i] - 1;
2631:     idt  = 2*i;
2632:     s1 = t[idt]; s2 = t[1+idt];
2633:     while (nz--) {
2634:       idx   = 2*(*vi++);
2635:       x1    = t[idx]; x2 = t[1+idx];
2636:       s1 -= v[0]*x1 + v[2]*x2;
2637:       s2 -= v[1]*x1 + v[3]*x2;
2638:       v += 4;
2639:     }
2640:     idc = 2*(*c--);
2641:     v   = aa + 4*diag[i];
2642:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2643:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2644:   }
2645:   ISRestoreIndices(isrow,&rout);
2646:   ISRestoreIndices(iscol,&cout);
2647:   VecRestoreArray(bb,&b);
2648:   VecRestoreArray(xx,&x);
2649:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2650:   return(0);
2651: }

2653: /*
2654:       Special case where the matrix was ILU(0) factored in the natural
2655:    ordering. This eliminates the need for the column and row permutation.
2656: */
2657: #undef __FUNCT__  
2659: int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2660: {
2661:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2662:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2663:   int             ierr,*diag = a->diag;
2664:   MatScalar       *aa=a->a,*v;
2665:   PetscScalar     *x,*b,s1,s2,x1,x2;
2666:   int             jdx,idt,idx,nz,*vi,i;

2669:   VecGetArray(bb,&b);
2670:   VecGetArray(xx,&x);

2672:   /* forward solve the lower triangular */
2673:   idx    = 0;
2674:   x[0]   = b[0]; x[1] = b[1];
2675:   for (i=1; i<n; i++) {
2676:     v     =  aa      + 4*ai[i];
2677:     vi    =  aj      + ai[i];
2678:     nz    =  diag[i] - ai[i];
2679:     idx   +=  2;
2680:     s1  =  b[idx];s2 = b[1+idx];
2681:     while (nz--) {
2682:       jdx   = 2*(*vi++);
2683:       x1    = x[jdx];x2 = x[1+jdx];
2684:       s1 -= v[0]*x1 + v[2]*x2;
2685:       s2 -= v[1]*x1 + v[3]*x2;
2686:       v    += 4;
2687:     }
2688:     x[idx]   = s1;
2689:     x[1+idx] = s2;
2690:   }
2691:   /* backward solve the upper triangular */
2692:   for (i=n-1; i>=0; i--){
2693:     v    = aa + 4*diag[i] + 4;
2694:     vi   = aj + diag[i] + 1;
2695:     nz   = ai[i+1] - diag[i] - 1;
2696:     idt  = 2*i;
2697:     s1 = x[idt];  s2 = x[1+idt];
2698:     while (nz--) {
2699:       idx   = 2*(*vi++);
2700:       x1    = x[idx];   x2 = x[1+idx];
2701:       s1 -= v[0]*x1 + v[2]*x2;
2702:       s2 -= v[1]*x1 + v[3]*x2;
2703:       v    += 4;
2704:     }
2705:     v        = aa +  4*diag[i];
2706:     x[idt]   = v[0]*s1 + v[2]*s2;
2707:     x[1+idt] = v[1]*s1 + v[3]*s2;
2708:   }

2710:   VecRestoreArray(bb,&b);
2711:   VecRestoreArray(xx,&x);
2712:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2713:   return(0);
2714: }

2716: #undef __FUNCT__  
2718: int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2719: {
2720:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2721:   IS              iscol=a->col,isrow=a->row;
2722:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2723:   int             *diag = a->diag;
2724:   MatScalar       *aa=a->a,*v;
2725:   PetscScalar     *x,*b,s1,*t;

2728:   if (!n) return(0);

2730:   VecGetArray(bb,&b);
2731:   VecGetArray(xx,&x);
2732:   t  = a->solve_work;

2734:   ISGetIndices(isrow,&rout); r = rout;
2735:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2737:   /* forward solve the lower triangular */
2738:   t[0] = b[*r++];
2739:   for (i=1; i<n; i++) {
2740:     v     = aa + ai[i];
2741:     vi    = aj + ai[i];
2742:     nz    = diag[i] - ai[i];
2743:     s1  = b[*r++];
2744:     while (nz--) {
2745:       s1 -= (*v++)*t[*vi++];
2746:     }
2747:     t[i] = s1;
2748:   }
2749:   /* backward solve the upper triangular */
2750:   for (i=n-1; i>=0; i--){
2751:     v    = aa + diag[i] + 1;
2752:     vi   = aj + diag[i] + 1;
2753:     nz   = ai[i+1] - diag[i] - 1;
2754:     s1 = t[i];
2755:     while (nz--) {
2756:       s1 -= (*v++)*t[*vi++];
2757:     }
2758:     x[*c--] = t[i] = aa[diag[i]]*s1;
2759:   }

2761:   ISRestoreIndices(isrow,&rout);
2762:   ISRestoreIndices(iscol,&cout);
2763:   VecRestoreArray(bb,&b);
2764:   VecRestoreArray(xx,&x);
2765:   PetscLogFlops(2*1*(a->nz) - A->n);
2766:   return(0);
2767: }
2768: /*
2769:       Special case where the matrix was ILU(0) factored in the natural
2770:    ordering. This eliminates the need for the column and row permutation.
2771: */
2772: #undef __FUNCT__  
2774: int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2775: {
2776:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2777:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2778:   int             ierr,*diag = a->diag;
2779:   MatScalar       *aa=a->a;
2780:   PetscScalar     *x,*b;
2781:   PetscScalar     s1,x1;
2782:   MatScalar       *v;
2783:   int             jdx,idt,idx,nz,*vi,i;

2786:   VecGetArray(bb,&b);
2787:   VecGetArray(xx,&x);

2789:   /* forward solve the lower triangular */
2790:   idx    = 0;
2791:   x[0]   = b[0];
2792:   for (i=1; i<n; i++) {
2793:     v     =  aa      + ai[i];
2794:     vi    =  aj      + ai[i];
2795:     nz    =  diag[i] - ai[i];
2796:     idx   +=  1;
2797:     s1  =  b[idx];
2798:     while (nz--) {
2799:       jdx   = *vi++;
2800:       x1    = x[jdx];
2801:       s1 -= v[0]*x1;
2802:       v    += 1;
2803:     }
2804:     x[idx]   = s1;
2805:   }
2806:   /* backward solve the upper triangular */
2807:   for (i=n-1; i>=0; i--){
2808:     v    = aa + diag[i] + 1;
2809:     vi   = aj + diag[i] + 1;
2810:     nz   = ai[i+1] - diag[i] - 1;
2811:     idt  = i;
2812:     s1 = x[idt];
2813:     while (nz--) {
2814:       idx   = *vi++;
2815:       x1    = x[idx];
2816:       s1 -= v[0]*x1;
2817:       v    += 1;
2818:     }
2819:     v        = aa +  diag[i];
2820:     x[idt]   = v[0]*s1;
2821:   }
2822:   VecRestoreArray(bb,&b);
2823:   VecRestoreArray(xx,&x);
2824:   PetscLogFlops(2*(a->nz) - A->n);
2825:   return(0);
2826: }

2828: /* ----------------------------------------------------------------*/
2829: /*
2830:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
2831:    except that the data structure of Mat_SeqAIJ is slightly different.
2832:    Not a good example of code reuse.
2833: */
2834: EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);

2836: #undef __FUNCT__  
2838: int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
2839: {
2840:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
2841:   IS          isicol;
2842:   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
2843:   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2844:   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
2845:   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
2846:   PetscTruth  col_identity,row_identity;
2847:   PetscReal   f;

2850:   f             = info->fill;
2851:   levels        = (int)info->levels;
2852:   diagonal_fill = (int)info->diagonal_fill;
2853:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
2854:   ISIdentity(isrow,&row_identity);
2855:   ISIdentity(iscol,&col_identity);

2857:   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2858:     MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
2859:     (*fact)->factor = FACTOR_LU;
2860:     b               = (Mat_SeqBAIJ*)(*fact)->data;
2861:     if (!b->diag) {
2862:       MatMarkDiagonal_SeqBAIJ(*fact);
2863:     }
2864:     MatMissingDiagonal_SeqBAIJ(*fact);
2865:     b->row        = isrow;
2866:     b->col        = iscol;
2867:     ierr          = PetscObjectReference((PetscObject)isrow);
2868:     ierr          = PetscObjectReference((PetscObject)iscol);
2869:     b->icol       = isicol;
2870:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
2871:     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);
2872:   } else { /* general case perform the symbolic factorization */
2873:     ISGetIndices(isrow,&r);
2874:     ISGetIndices(isicol,&ic);

2876:     /* get new row pointers */
2877:     PetscMalloc((n+1)*sizeof(int),&ainew);
2878:     ainew[0] = 0;
2879:     /* don't know how many column pointers are needed so estimate */
2880:     jmax = (int)(f*ai[n] + 1);
2881:     PetscMalloc((jmax)*sizeof(int),&ajnew);
2882:     /* ajfill is level of fill for each fill entry */
2883:     PetscMalloc((jmax)*sizeof(int),&ajfill);
2884:     /* fill is a linked list of nonzeros in active row */
2885:     PetscMalloc((n+1)*sizeof(int),&fill);
2886:     /* im is level for each filled value */
2887:     PetscMalloc((n+1)*sizeof(int),&im);
2888:     /* dloc is location of diagonal in factor */
2889:     PetscMalloc((n+1)*sizeof(int),&dloc);
2890:     dloc[0]  = 0;
2891:     for (prow=0; prow<n; prow++) {

2893:       /* copy prow into linked list */
2894:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
2895:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
2896:       xi         = aj + ai[r[prow]];
2897:       fill[n]    = n;
2898:       fill[prow] = -1; /* marker for diagonal entry */
2899:       while (nz--) {
2900:         fm  = n;
2901:         idx = ic[*xi++];
2902:         do {
2903:           m  = fm;
2904:           fm = fill[m];
2905:         } while (fm < idx);
2906:         fill[m]   = idx;
2907:         fill[idx] = fm;
2908:         im[idx]   = 0;
2909:       }

2911:       /* make sure diagonal entry is included */
2912:       if (diagonal_fill && fill[prow] == -1) {
2913:         fm = n;
2914:         while (fill[fm] < prow) fm = fill[fm];
2915:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
2916:         fill[fm]   = prow;
2917:         im[prow]   = 0;
2918:         nzf++;
2919:         dcount++;
2920:       }

2922:       nzi = 0;
2923:       row = fill[n];
2924:       while (row < prow) {
2925:         incrlev = im[row] + 1;
2926:         nz      = dloc[row];
2927:         xi      = ajnew  + ainew[row] + nz + 1;
2928:         flev    = ajfill + ainew[row] + nz + 1;
2929:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
2930:         fm      = row;
2931:         while (nnz-- > 0) {
2932:           idx = *xi++;
2933:           if (*flev + incrlev > levels) {
2934:             flev++;
2935:             continue;
2936:           }
2937:           do {
2938:             m  = fm;
2939:             fm = fill[m];
2940:           } while (fm < idx);
2941:           if (fm != idx) {
2942:             im[idx]   = *flev + incrlev;
2943:             fill[m]   = idx;
2944:             fill[idx] = fm;
2945:             fm        = idx;
2946:             nzf++;
2947:           } else {
2948:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
2949:           }
2950:           flev++;
2951:         }
2952:         row = fill[row];
2953:         nzi++;
2954:       }
2955:       /* copy new filled row into permanent storage */
2956:       ainew[prow+1] = ainew[prow] + nzf;
2957:       if (ainew[prow+1] > jmax) {

2959:         /* estimate how much additional space we will need */
2960:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2961:         /* just double the memory each time */
2962:         int maxadd = jmax;
2963:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
2964:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
2965:         jmax += maxadd;

2967:         /* allocate a longer ajnew and ajfill */
2968:         PetscMalloc(jmax*sizeof(int),&xi);
2969:         PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
2970:         PetscFree(ajnew);
2971:         ajnew = xi;
2972:         PetscMalloc(jmax*sizeof(int),&xi);
2973:         PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
2974:         PetscFree(ajfill);
2975:         ajfill = xi;
2976:         reallocate++; /* count how many reallocations are needed */
2977:       }
2978:       xi          = ajnew + ainew[prow];
2979:       flev        = ajfill + ainew[prow];
2980:       dloc[prow]  = nzi;
2981:       fm          = fill[n];
2982:       while (nzf--) {
2983:         *xi++   = fm;
2984:         *flev++ = im[fm];
2985:         fm      = fill[fm];
2986:       }
2987:       /* make sure row has diagonal entry */
2988:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
2989:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrixn
2990:     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2991:       }
2992:     }
2993:     PetscFree(ajfill);
2994:     ISRestoreIndices(isrow,&r);
2995:     ISRestoreIndices(isicol,&ic);
2996:     PetscFree(fill);
2997:     PetscFree(im);

2999:     {
3000:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3001:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %gn",reallocate,f,af);
3002:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use n",af);
3003:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);n",af);
3004:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.n");
3005:       if (diagonal_fill) {
3006:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3007:       }
3008:     }

3010:     /* put together the new matrix */
3011:     MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);
3012:     PetscLogObjectParent(*fact,isicol);
3013:     b = (Mat_SeqBAIJ*)(*fact)->data;
3014:     PetscFree(b->imax);
3015:     b->singlemalloc = PETSC_FALSE;
3016:     len = bs2*ainew[n]*sizeof(MatScalar);
3017:     /* the next line frees the default space generated by the Create() */
3018:     PetscFree(b->a);
3019:     PetscFree(b->ilen);
3020:     PetscMalloc(len,&b->a);
3021:     b->j          = ajnew;
3022:     b->i          = ainew;
3023:     for (i=0; i<n; i++) dloc[i] += ainew[i];
3024:     b->diag       = dloc;
3025:     b->ilen       = 0;
3026:     b->imax       = 0;
3027:     b->row        = isrow;
3028:     b->col        = iscol;
3029:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3030:     ierr          = PetscObjectReference((PetscObject)isrow);
3031:     ierr          = PetscObjectReference((PetscObject)iscol);
3032:     b->icol       = isicol;
3033:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3034:     /* In b structure:  Free imax, ilen, old a, old j.  
3035:        Allocate dloc, solve_work, new a, new j */
3036:     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3037:     b->maxnz          = b->nz = ainew[n];
3038:     (*fact)->factor   = FACTOR_LU;

3040:     (*fact)->info.factor_mallocs    = reallocate;
3041:     (*fact)->info.fill_ratio_given  = f;
3042:     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3043:   }

3045:   if (row_identity && col_identity) {
3046:     MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3047:   }
3048:   return(0);
3049: }

3051: #undef __FUNCT__
3053: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3054: {
3055:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3056:   int i,*aj=a->j,nz=a->nz;
3057:   for (i=0;i<nz;i++) {
3058:     aj[i]=  aj[i]/4;
3059:   }
3060:   return(0);
3061: }

3063: #undef __FUNCT__
3065: int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3066: {
3067:   /*
3068:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3069:       with natural ordering
3070:   */
3071:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;

3074:   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3075:   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3076:   switch (a->bs) {
3077:   case 1:
3078:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3079:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1n");
3080:     break;
3081:   case 2:
3082:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3083:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2n");
3084:     break;
3085:   case 3:
3086:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3087:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3n");
3088:     break;
3089:   case 4:
3090: #if defined(PETSC_USE_MAT_SINGLE)
3091:     {
3092:       PetscTruth  sse_enabled_local;
3093:       int         ierr;
3094:       PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3095:       if (sse_enabled_local) {
3096: #  if defined(PETSC_HAVE_SSE)
3097:         /* Scale the column indices for easier indexing in MatSolve. */
3098:         int i,*aj=a->j,nz=a->nz;
3099:         for (i=0;i<nz;i++) {
3100:           aj[i] *= 4;
3101:         }
3102:         inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3103:         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3104:         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4n");
3105: #  else
3106:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3107:         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3108: #  endif
3109:       } else {
3110:         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3111:         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4n");
3112:       }
3113:     }
3114: #else
3115:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3116:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4n");
3117: #endif
3118:     break;
3119:   case 5:
3120:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3121:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5n");
3122:     break;
3123:   case 6:
3124:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3125:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6n");
3126:     break;
3127:   case 7:
3128:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3129:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7n");
3130:     break;
3131:   }
3132:   return(0);
3133: }

3135: #undef __FUNCT__
3137: int MatSeqBAIJ_UpdateSolvers(Mat A)
3138: {
3139:   /*
3140:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3141:       with natural ordering
3142:   */
3143:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3144:   IS          row = a->row, col = a->col;
3145:   PetscTruth  row_identity, col_identity;
3146:   PetscTruth  use_natural;
3147:   int         ierr;


3151:   use_natural = PETSC_FALSE;

3153:   ISIdentity(row,&row_identity);
3154:   ISIdentity(col,&col_identity);

3156:   if (row_identity && col_identity) {
3157:     use_natural = PETSC_TRUE;
3158:   } else {
3159:     use_natural = PETSC_FALSE;
3160:   }
3161:   switch (a->bs) {
3162:   case 1:
3163:     if (use_natural) {
3164:       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3165:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3166:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1n");
3167:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
3168:     } else {
3169:       A->ops->solve           = MatSolve_SeqBAIJ_1;
3170:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3171:     }
3172:     break;
3173:   case 2:
3174:     if (use_natural) {
3175:       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3176:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3177:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2n");
3178:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
3179:     } else {
3180:       A->ops->solve           = MatSolve_SeqBAIJ_2;
3181:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3182:     }
3183:     break;
3184:   case 3:
3185:     if (use_natural) {
3186:       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3187:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3188:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3n");
3189:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4n");
3190:     } else {
3191:       A->ops->solve           = MatSolve_SeqBAIJ_3;
3192:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3193:     }
3194:     break;
3195:   case 4:
3196:     {
3197:       PetscTruth sse_enabled_local;
3198:       PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3199:       if (use_natural) {
3200: #if defined(PETSC_USE_MAT_SINGLE)
3201:         if (sse_enabled_local) { /* Natural + Single + SSE */
3202: #  if defined(PETSC_HAVE_SSE)
3203:           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3204:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, natural ordering solve BS=4n");
3205: #  else
3206:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3207:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3208: #  endif
3209:         } else { /* Natural + Single */
3210:           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3211:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4n");
3212:         }
3213: #else
3214:         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3215:         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4n");
3216: #endif
3217:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3218:         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4n");
3219:       } else { /* Arbitrary ordering */
3220: #if defined(PETSC_USE_MAT_SINGLE)
3221:         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3222: #  if defined(PETSC_HAVE_SSE)
3223:           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3224:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4n");
3225: #  else
3226:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3227:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3228: #  endif
3229:         } else { /* Arbitrary + Single */
3230:           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3231:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4n");
3232:         }
3233: #else
3234:         A->ops->solve           = MatSolve_SeqBAIJ_4;
3235: #endif
3236:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3237:       }
3238:     }
3239:     break;
3240:   case 5:
3241:     if (use_natural) {
3242:       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3243:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3244:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5n");
3245:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5n");
3246:     } else {
3247:       A->ops->solve           = MatSolve_SeqBAIJ_5;
3248:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3249:     }
3250:     break;
3251:   case 6:
3252:     if (use_natural) {
3253:       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3254:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3255:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6n");
3256:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6n");
3257:     } else {
3258:       A->ops->solve           = MatSolve_SeqBAIJ_6;
3259:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3260:     }
3261:     break;
3262:   case 7:
3263:     if (use_natural) {
3264:       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3265:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3266:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7n");
3267:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7n");
3268:     } else {
3269:       A->ops->solve           = MatSolve_SeqBAIJ_7;
3270:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3271:     }
3272:     break;
3273:   default:
3274:     A->ops->solve             = MatSolve_SeqBAIJ_N;
3275:     break;
3276:   }
3277:   return(0);
3278: }

3280: #undef __FUNCT__
3282: int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {

3286:   MatSeqBAIJ_UpdateSolvers(A);
3287:   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3288:     (*A->ops->solve)(A,x,y);
3289:   } else {
3290:     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3291:   }
3292:   return(0);
3293: }

3295: #undef __FUNCT__
3297: int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {

3301:   MatSeqBAIJ_UpdateSolvers(A);
3302:   (*A->ops->solvetranspose)(A,x,y);
3303:   return(0);
3304: }