Actual source code: baijfact2.c

  1: /*
  2:     Factorization code for BAIJ format. 
  3: */

 5:  #include src/mat/impls/baij/seq/baij.h
 6:  #include src/inline/ilu.h
 7:  #include src/inline/dot.h

 11: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 12: {
 13:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
 15:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 16:   PetscInt       *diag = a->diag;
 17:   MatScalar      *aa=a->a,*v;
 18:   PetscScalar    s1,*x,*b;

 21:   VecCopy(bb,xx);
 22:   VecGetArray(bb,&b);
 23:   VecGetArray(xx,&x);
 24: 
 25:   /* forward solve the U^T */
 26:   for (i=0; i<n; i++) {

 28:     v     = aa + diag[i];
 29:     /* multiply by the inverse of the block diagonal */
 30:     s1    = (*v++)*x[i];
 31:     vi    = aj + diag[i] + 1;
 32:     nz    = ai[i+1] - diag[i] - 1;
 33:     while (nz--) {
 34:       x[*vi++]  -= (*v++)*s1;
 35:     }
 36:     x[i]   = s1;
 37:   }
 38:   /* backward solve the L^T */
 39:   for (i=n-1; i>=0; i--){
 40:     v    = aa + diag[i] - 1;
 41:     vi   = aj + diag[i] - 1;
 42:     nz   = diag[i] - ai[i];
 43:     s1   = x[i];
 44:     while (nz--) {
 45:       x[*vi--]   -=  (*v--)*s1;
 46:     }
 47:   }
 48:   VecRestoreArray(bb,&b);
 49:   VecRestoreArray(xx,&x);
 50:   PetscLogFlops(2*(a->nz) - A->n);
 51:   return(0);
 52: }

 56: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 57: {
 58:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
 60:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 61:   PetscInt       *diag = a->diag,oidx;
 62:   MatScalar      *aa=a->a,*v;
 63:   PetscScalar    s1,s2,x1,x2;
 64:   PetscScalar    *x,*b;

 67:   VecCopy(bb,xx);
 68:   VecGetArray(bb,&b);
 69:   VecGetArray(xx,&x);

 71:   /* forward solve the U^T */
 72:   idx = 0;
 73:   for (i=0; i<n; i++) {

 75:     v     = aa + 4*diag[i];
 76:     /* multiply by the inverse of the block diagonal */
 77:     x1 = x[idx];   x2 = x[1+idx];
 78:     s1 = v[0]*x1  +  v[1]*x2;
 79:     s2 = v[2]*x1  +  v[3]*x2;
 80:     v += 4;

 82:     vi    = aj + diag[i] + 1;
 83:     nz    = ai[i+1] - diag[i] - 1;
 84:     while (nz--) {
 85:       oidx = 2*(*vi++);
 86:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 87:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 88:       v  += 4;
 89:     }
 90:     x[idx]   = s1;x[1+idx] = s2;
 91:     idx += 2;
 92:   }
 93:   /* backward solve the L^T */
 94:   for (i=n-1; i>=0; i--){
 95:     v    = aa + 4*diag[i] - 4;
 96:     vi   = aj + diag[i] - 1;
 97:     nz   = diag[i] - ai[i];
 98:     idt  = 2*i;
 99:     s1   = x[idt];  s2 = x[1+idt];
100:     while (nz--) {
101:       idx   = 2*(*vi--);
102:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104:       v -= 4;
105:     }
106:   }
107:   VecRestoreArray(bb,&b);
108:   VecRestoreArray(xx,&x);
109:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110:   return(0);
111: }

115: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116: {
117:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
119:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
120:   PetscInt       *diag = a->diag,oidx;
121:   MatScalar      *aa=a->a,*v;
122:   PetscScalar    s1,s2,s3,x1,x2,x3;
123:   PetscScalar    *x,*b;

126:   VecCopy(bb,xx);
127:   VecGetArray(bb,&b);
128:   VecGetArray(xx,&x);

130:   /* forward solve the U^T */
131:   idx = 0;
132:   for (i=0; i<n; i++) {

134:     v     = aa + 9*diag[i];
135:     /* multiply by the inverse of the block diagonal */
136:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
137:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
138:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
139:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
140:     v += 9;

142:     vi    = aj + diag[i] + 1;
143:     nz    = ai[i+1] - diag[i] - 1;
144:     while (nz--) {
145:       oidx = 3*(*vi++);
146:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
147:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
148:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
149:       v  += 9;
150:     }
151:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
152:     idx += 3;
153:   }
154:   /* backward solve the L^T */
155:   for (i=n-1; i>=0; i--){
156:     v    = aa + 9*diag[i] - 9;
157:     vi   = aj + diag[i] - 1;
158:     nz   = diag[i] - ai[i];
159:     idt  = 3*i;
160:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
161:     while (nz--) {
162:       idx   = 3*(*vi--);
163:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
164:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
165:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
166:       v -= 9;
167:     }
168:   }
169:   VecRestoreArray(bb,&b);
170:   VecRestoreArray(xx,&x);
171:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
172:   return(0);
173: }

177: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
178: {
179:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
182:   PetscInt       *diag = a->diag,oidx;
183:   MatScalar      *aa=a->a,*v;
184:   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
185:   PetscScalar    *x,*b;

188:   VecCopy(bb,xx);
189:   VecGetArray(bb,&b);
190:   VecGetArray(xx,&x);

192:   /* forward solve the U^T */
193:   idx = 0;
194:   for (i=0; i<n; i++) {

196:     v     = aa + 16*diag[i];
197:     /* multiply by the inverse of the block diagonal */
198:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
199:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
200:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
201:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
202:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
203:     v += 16;

205:     vi    = aj + diag[i] + 1;
206:     nz    = ai[i+1] - diag[i] - 1;
207:     while (nz--) {
208:       oidx = 4*(*vi++);
209:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
210:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
211:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
212:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
213:       v  += 16;
214:     }
215:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
216:     idx += 4;
217:   }
218:   /* backward solve the L^T */
219:   for (i=n-1; i>=0; i--){
220:     v    = aa + 16*diag[i] - 16;
221:     vi   = aj + diag[i] - 1;
222:     nz   = diag[i] - ai[i];
223:     idt  = 4*i;
224:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
225:     while (nz--) {
226:       idx   = 4*(*vi--);
227:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
228:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
229:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
230:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
231:       v -= 16;
232:     }
233:   }
234:   VecRestoreArray(bb,&b);
235:   VecRestoreArray(xx,&x);
236:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
237:   return(0);
238: }

242: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
243: {
244:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
246:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
247:   PetscInt       *diag = a->diag,oidx;
248:   MatScalar      *aa=a->a,*v;
249:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
250:   PetscScalar    *x,*b;

253:   VecCopy(bb,xx);
254:   VecGetArray(bb,&b);
255:   VecGetArray(xx,&x);

257:   /* forward solve the U^T */
258:   idx = 0;
259:   for (i=0; i<n; i++) {

261:     v     = aa + 25*diag[i];
262:     /* multiply by the inverse of the block diagonal */
263:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
264:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
265:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
266:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
267:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
268:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
269:     v += 25;

271:     vi    = aj + diag[i] + 1;
272:     nz    = ai[i+1] - diag[i] - 1;
273:     while (nz--) {
274:       oidx = 5*(*vi++);
275:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
276:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
277:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
278:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
279:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
280:       v  += 25;
281:     }
282:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
283:     idx += 5;
284:   }
285:   /* backward solve the L^T */
286:   for (i=n-1; i>=0; i--){
287:     v    = aa + 25*diag[i] - 25;
288:     vi   = aj + diag[i] - 1;
289:     nz   = diag[i] - ai[i];
290:     idt  = 5*i;
291:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
292:     while (nz--) {
293:       idx   = 5*(*vi--);
294:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
295:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
296:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
297:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
298:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
299:       v -= 25;
300:     }
301:   }
302:   VecRestoreArray(bb,&b);
303:   VecRestoreArray(xx,&x);
304:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
305:   return(0);
306: }

310: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
311: {
312:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
314:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
315:   PetscInt       *diag = a->diag,oidx;
316:   MatScalar      *aa=a->a,*v;
317:   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
318:   PetscScalar    *x,*b;

321:   VecCopy(bb,xx);
322:   VecGetArray(bb,&b);
323:   VecGetArray(xx,&x);

325:   /* forward solve the U^T */
326:   idx = 0;
327:   for (i=0; i<n; i++) {

329:     v     = aa + 36*diag[i];
330:     /* multiply by the inverse of the block diagonal */
331:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
332:     x6    = x[5+idx];
333:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
334:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
335:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
336:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
337:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
338:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
339:     v += 36;

341:     vi    = aj + diag[i] + 1;
342:     nz    = ai[i+1] - diag[i] - 1;
343:     while (nz--) {
344:       oidx = 6*(*vi++);
345:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
346:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
347:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
348:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
349:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
350:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
351:       v  += 36;
352:     }
353:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
354:     x[5+idx] = s6;
355:     idx += 6;
356:   }
357:   /* backward solve the L^T */
358:   for (i=n-1; i>=0; i--){
359:     v    = aa + 36*diag[i] - 36;
360:     vi   = aj + diag[i] - 1;
361:     nz   = diag[i] - ai[i];
362:     idt  = 6*i;
363:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
364:     s6 = x[5+idt];
365:     while (nz--) {
366:       idx   = 6*(*vi--);
367:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
368:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
369:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
370:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
371:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
372:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
373:       v -= 36;
374:     }
375:   }
376:   VecRestoreArray(bb,&b);
377:   VecRestoreArray(xx,&x);
378:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
379:   return(0);
380: }

384: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
385: {
386:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
388:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
389:   PetscInt       *diag = a->diag,oidx;
390:   MatScalar      *aa=a->a,*v;
391:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
392:   PetscScalar    *x,*b;

395:   VecCopy(bb,xx);
396:   VecGetArray(bb,&b);
397:   VecGetArray(xx,&x);

399:   /* forward solve the U^T */
400:   idx = 0;
401:   for (i=0; i<n; i++) {

403:     v     = aa + 49*diag[i];
404:     /* multiply by the inverse of the block diagonal */
405:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
406:     x6    = x[5+idx]; x7 = x[6+idx];
407:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
408:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
409:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
410:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
411:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
412:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
413:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
414:     v += 49;

416:     vi    = aj + diag[i] + 1;
417:     nz    = ai[i+1] - diag[i] - 1;
418:     while (nz--) {
419:       oidx = 7*(*vi++);
420:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
421:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
422:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
423:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
424:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
425:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
426:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
427:       v  += 49;
428:     }
429:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
430:     x[5+idx] = s6;x[6+idx] = s7;
431:     idx += 7;
432:   }
433:   /* backward solve the L^T */
434:   for (i=n-1; i>=0; i--){
435:     v    = aa + 49*diag[i] - 49;
436:     vi   = aj + diag[i] - 1;
437:     nz   = diag[i] - ai[i];
438:     idt  = 7*i;
439:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
440:     s6 = x[5+idt];s7 = x[6+idt];
441:     while (nz--) {
442:       idx   = 7*(*vi--);
443:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
444:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
445:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
446:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
447:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
448:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
449:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
450:       v -= 49;
451:     }
452:   }
453:   VecRestoreArray(bb,&b);
454:   VecRestoreArray(xx,&x);
455:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
456:   return(0);
457: }

459: /*---------------------------------------------------------------------------------------------*/
462: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
463: {
464:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
465:   IS             iscol=a->col,isrow=a->row;
467:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
468:   PetscInt       *diag = a->diag;
469:   MatScalar      *aa=a->a,*v;
470:   PetscScalar    s1,*x,*b,*t;

473:   VecGetArray(bb,&b);
474:   VecGetArray(xx,&x);
475:   t  = a->solve_work;

477:   ISGetIndices(isrow,&rout); r = rout;
478:   ISGetIndices(iscol,&cout); c = cout;

480:   /* copy the b into temp work space according to permutation */
481:   for (i=0; i<n; i++) {
482:     t[i] = b[c[i]];
483:   }

485:   /* forward solve the U^T */
486:   for (i=0; i<n; i++) {

488:     v     = aa + diag[i];
489:     /* multiply by the inverse of the block diagonal */
490:     s1    = (*v++)*t[i];
491:     vi    = aj + diag[i] + 1;
492:     nz    = ai[i+1] - diag[i] - 1;
493:     while (nz--) {
494:       t[*vi++]  -= (*v++)*s1;
495:     }
496:     t[i]   = s1;
497:   }
498:   /* backward solve the L^T */
499:   for (i=n-1; i>=0; i--){
500:     v    = aa + diag[i] - 1;
501:     vi   = aj + diag[i] - 1;
502:     nz   = diag[i] - ai[i];
503:     s1   = t[i];
504:     while (nz--) {
505:       t[*vi--]   -=  (*v--)*s1;
506:     }
507:   }

509:   /* copy t into x according to permutation */
510:   for (i=0; i<n; i++) {
511:     x[r[i]]   = t[i];
512:   }

514:   ISRestoreIndices(isrow,&rout);
515:   ISRestoreIndices(iscol,&cout);
516:   VecRestoreArray(bb,&b);
517:   VecRestoreArray(xx,&x);
518:   PetscLogFlops(2*(a->nz) - A->n);
519:   return(0);
520: }

524: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
525: {
526:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
527:   IS             iscol=a->col,isrow=a->row;
529:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
530:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
531:   MatScalar      *aa=a->a,*v;
532:   PetscScalar    s1,s2,x1,x2;
533:   PetscScalar    *x,*b,*t;

536:   VecGetArray(bb,&b);
537:   VecGetArray(xx,&x);
538:   t  = a->solve_work;

540:   ISGetIndices(isrow,&rout); r = rout;
541:   ISGetIndices(iscol,&cout); c = cout;

543:   /* copy the b into temp work space according to permutation */
544:   ii = 0;
545:   for (i=0; i<n; i++) {
546:     ic      = 2*c[i];
547:     t[ii]   = b[ic];
548:     t[ii+1] = b[ic+1];
549:     ii += 2;
550:   }

552:   /* forward solve the U^T */
553:   idx = 0;
554:   for (i=0; i<n; i++) {

556:     v     = aa + 4*diag[i];
557:     /* multiply by the inverse of the block diagonal */
558:     x1    = t[idx];   x2 = t[1+idx];
559:     s1 = v[0]*x1  +  v[1]*x2;
560:     s2 = v[2]*x1  +  v[3]*x2;
561:     v += 4;

563:     vi    = aj + diag[i] + 1;
564:     nz    = ai[i+1] - diag[i] - 1;
565:     while (nz--) {
566:       oidx = 2*(*vi++);
567:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
568:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
569:       v  += 4;
570:     }
571:     t[idx]   = s1;t[1+idx] = s2;
572:     idx += 2;
573:   }
574:   /* backward solve the L^T */
575:   for (i=n-1; i>=0; i--){
576:     v    = aa + 4*diag[i] - 4;
577:     vi   = aj + diag[i] - 1;
578:     nz   = diag[i] - ai[i];
579:     idt  = 2*i;
580:     s1 = t[idt];  s2 = t[1+idt];
581:     while (nz--) {
582:       idx   = 2*(*vi--);
583:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
584:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
585:       v -= 4;
586:     }
587:   }

589:   /* copy t into x according to permutation */
590:   ii = 0;
591:   for (i=0; i<n; i++) {
592:     ir      = 2*r[i];
593:     x[ir]   = t[ii];
594:     x[ir+1] = t[ii+1];
595:     ii += 2;
596:   }

598:   ISRestoreIndices(isrow,&rout);
599:   ISRestoreIndices(iscol,&cout);
600:   VecRestoreArray(bb,&b);
601:   VecRestoreArray(xx,&x);
602:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
603:   return(0);
604: }

608: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
609: {
610:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
611:   IS             iscol=a->col,isrow=a->row;
613:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
614:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
615:   MatScalar      *aa=a->a,*v;
616:   PetscScalar    s1,s2,s3,x1,x2,x3;
617:   PetscScalar    *x,*b,*t;

620:   VecGetArray(bb,&b);
621:   VecGetArray(xx,&x);
622:   t  = a->solve_work;

624:   ISGetIndices(isrow,&rout); r = rout;
625:   ISGetIndices(iscol,&cout); c = cout;

627:   /* copy the b into temp work space according to permutation */
628:   ii = 0;
629:   for (i=0; i<n; i++) {
630:     ic      = 3*c[i];
631:     t[ii]   = b[ic];
632:     t[ii+1] = b[ic+1];
633:     t[ii+2] = b[ic+2];
634:     ii += 3;
635:   }

637:   /* forward solve the U^T */
638:   idx = 0;
639:   for (i=0; i<n; i++) {

641:     v     = aa + 9*diag[i];
642:     /* multiply by the inverse of the block diagonal */
643:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
644:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
645:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
646:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
647:     v += 9;

649:     vi    = aj + diag[i] + 1;
650:     nz    = ai[i+1] - diag[i] - 1;
651:     while (nz--) {
652:       oidx = 3*(*vi++);
653:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
654:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
655:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
656:       v  += 9;
657:     }
658:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
659:     idx += 3;
660:   }
661:   /* backward solve the L^T */
662:   for (i=n-1; i>=0; i--){
663:     v    = aa + 9*diag[i] - 9;
664:     vi   = aj + diag[i] - 1;
665:     nz   = diag[i] - ai[i];
666:     idt  = 3*i;
667:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
668:     while (nz--) {
669:       idx   = 3*(*vi--);
670:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
671:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
672:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
673:       v -= 9;
674:     }
675:   }

677:   /* copy t into x according to permutation */
678:   ii = 0;
679:   for (i=0; i<n; i++) {
680:     ir      = 3*r[i];
681:     x[ir]   = t[ii];
682:     x[ir+1] = t[ii+1];
683:     x[ir+2] = t[ii+2];
684:     ii += 3;
685:   }

687:   ISRestoreIndices(isrow,&rout);
688:   ISRestoreIndices(iscol,&cout);
689:   VecRestoreArray(bb,&b);
690:   VecRestoreArray(xx,&x);
691:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
692:   return(0);
693: }

697: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
698: {
699:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
700:   IS             iscol=a->col,isrow=a->row;
702:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
703:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
704:   MatScalar      *aa=a->a,*v;
705:   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
706:   PetscScalar    *x,*b,*t;

709:   VecGetArray(bb,&b);
710:   VecGetArray(xx,&x);
711:   t  = a->solve_work;

713:   ISGetIndices(isrow,&rout); r = rout;
714:   ISGetIndices(iscol,&cout); c = cout;

716:   /* copy the b into temp work space according to permutation */
717:   ii = 0;
718:   for (i=0; i<n; i++) {
719:     ic      = 4*c[i];
720:     t[ii]   = b[ic];
721:     t[ii+1] = b[ic+1];
722:     t[ii+2] = b[ic+2];
723:     t[ii+3] = b[ic+3];
724:     ii += 4;
725:   }

727:   /* forward solve the U^T */
728:   idx = 0;
729:   for (i=0; i<n; i++) {

731:     v     = aa + 16*diag[i];
732:     /* multiply by the inverse of the block diagonal */
733:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
734:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
735:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
736:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
737:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
738:     v += 16;

740:     vi    = aj + diag[i] + 1;
741:     nz    = ai[i+1] - diag[i] - 1;
742:     while (nz--) {
743:       oidx = 4*(*vi++);
744:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
745:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
746:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
747:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
748:       v  += 16;
749:     }
750:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
751:     idx += 4;
752:   }
753:   /* backward solve the L^T */
754:   for (i=n-1; i>=0; i--){
755:     v    = aa + 16*diag[i] - 16;
756:     vi   = aj + diag[i] - 1;
757:     nz   = diag[i] - ai[i];
758:     idt  = 4*i;
759:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
760:     while (nz--) {
761:       idx   = 4*(*vi--);
762:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
763:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
764:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
765:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
766:       v -= 16;
767:     }
768:   }

770:   /* copy t into x according to permutation */
771:   ii = 0;
772:   for (i=0; i<n; i++) {
773:     ir      = 4*r[i];
774:     x[ir]   = t[ii];
775:     x[ir+1] = t[ii+1];
776:     x[ir+2] = t[ii+2];
777:     x[ir+3] = t[ii+3];
778:     ii += 4;
779:   }

781:   ISRestoreIndices(isrow,&rout);
782:   ISRestoreIndices(iscol,&cout);
783:   VecRestoreArray(bb,&b);
784:   VecRestoreArray(xx,&x);
785:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
786:   return(0);
787: }

791: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
792: {
793:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
794:   IS             iscol=a->col,isrow=a->row;
796:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
797:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
798:   MatScalar      *aa=a->a,*v;
799:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
800:   PetscScalar    *x,*b,*t;

803:   VecGetArray(bb,&b);
804:   VecGetArray(xx,&x);
805:   t  = a->solve_work;

807:   ISGetIndices(isrow,&rout); r = rout;
808:   ISGetIndices(iscol,&cout); c = cout;

810:   /* copy the b into temp work space according to permutation */
811:   ii = 0;
812:   for (i=0; i<n; i++) {
813:     ic      = 5*c[i];
814:     t[ii]   = b[ic];
815:     t[ii+1] = b[ic+1];
816:     t[ii+2] = b[ic+2];
817:     t[ii+3] = b[ic+3];
818:     t[ii+4] = b[ic+4];
819:     ii += 5;
820:   }

822:   /* forward solve the U^T */
823:   idx = 0;
824:   for (i=0; i<n; i++) {

826:     v     = aa + 25*diag[i];
827:     /* multiply by the inverse of the block diagonal */
828:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
829:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
830:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
831:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
832:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
833:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
834:     v += 25;

836:     vi    = aj + diag[i] + 1;
837:     nz    = ai[i+1] - diag[i] - 1;
838:     while (nz--) {
839:       oidx = 5*(*vi++);
840:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
841:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
842:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
843:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
844:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
845:       v  += 25;
846:     }
847:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
848:     idx += 5;
849:   }
850:   /* backward solve the L^T */
851:   for (i=n-1; i>=0; i--){
852:     v    = aa + 25*diag[i] - 25;
853:     vi   = aj + diag[i] - 1;
854:     nz   = diag[i] - ai[i];
855:     idt  = 5*i;
856:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
857:     while (nz--) {
858:       idx   = 5*(*vi--);
859:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
860:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
861:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
862:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
863:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
864:       v -= 25;
865:     }
866:   }

868:   /* copy t into x according to permutation */
869:   ii = 0;
870:   for (i=0; i<n; i++) {
871:     ir      = 5*r[i];
872:     x[ir]   = t[ii];
873:     x[ir+1] = t[ii+1];
874:     x[ir+2] = t[ii+2];
875:     x[ir+3] = t[ii+3];
876:     x[ir+4] = t[ii+4];
877:     ii += 5;
878:   }

880:   ISRestoreIndices(isrow,&rout);
881:   ISRestoreIndices(iscol,&cout);
882:   VecRestoreArray(bb,&b);
883:   VecRestoreArray(xx,&x);
884:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
885:   return(0);
886: }

890: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
891: {
892:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
893:   IS             iscol=a->col,isrow=a->row;
895:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
896:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
897:   MatScalar      *aa=a->a,*v;
898:   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
899:   PetscScalar    *x,*b,*t;

902:   VecGetArray(bb,&b);
903:   VecGetArray(xx,&x);
904:   t  = a->solve_work;

906:   ISGetIndices(isrow,&rout); r = rout;
907:   ISGetIndices(iscol,&cout); c = cout;

909:   /* copy the b into temp work space according to permutation */
910:   ii = 0;
911:   for (i=0; i<n; i++) {
912:     ic      = 6*c[i];
913:     t[ii]   = b[ic];
914:     t[ii+1] = b[ic+1];
915:     t[ii+2] = b[ic+2];
916:     t[ii+3] = b[ic+3];
917:     t[ii+4] = b[ic+4];
918:     t[ii+5] = b[ic+5];
919:     ii += 6;
920:   }

922:   /* forward solve the U^T */
923:   idx = 0;
924:   for (i=0; i<n; i++) {

926:     v     = aa + 36*diag[i];
927:     /* multiply by the inverse of the block diagonal */
928:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
929:     x6    = t[5+idx];
930:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
931:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
932:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
933:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
934:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
935:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
936:     v += 36;

938:     vi    = aj + diag[i] + 1;
939:     nz    = ai[i+1] - diag[i] - 1;
940:     while (nz--) {
941:       oidx = 6*(*vi++);
942:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
943:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
944:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
945:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
946:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
947:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
948:       v  += 36;
949:     }
950:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
951:     t[5+idx] = s6;
952:     idx += 6;
953:   }
954:   /* backward solve the L^T */
955:   for (i=n-1; i>=0; i--){
956:     v    = aa + 36*diag[i] - 36;
957:     vi   = aj + diag[i] - 1;
958:     nz   = diag[i] - ai[i];
959:     idt  = 6*i;
960:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
961:     s6 = t[5+idt];
962:     while (nz--) {
963:       idx   = 6*(*vi--);
964:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
965:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
966:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
967:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
968:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
969:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
970:       v -= 36;
971:     }
972:   }

974:   /* copy t into x according to permutation */
975:   ii = 0;
976:   for (i=0; i<n; i++) {
977:     ir      = 6*r[i];
978:     x[ir]   = t[ii];
979:     x[ir+1] = t[ii+1];
980:     x[ir+2] = t[ii+2];
981:     x[ir+3] = t[ii+3];
982:     x[ir+4] = t[ii+4];
983:     x[ir+5] = t[ii+5];
984:     ii += 6;
985:   }

987:   ISRestoreIndices(isrow,&rout);
988:   ISRestoreIndices(iscol,&cout);
989:   VecRestoreArray(bb,&b);
990:   VecRestoreArray(xx,&x);
991:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
992:   return(0);
993: }

997: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
998: {
999:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1000:   IS             iscol=a->col,isrow=a->row;
1002:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1003:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1004:   MatScalar      *aa=a->a,*v;
1005:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1006:   PetscScalar    *x,*b,*t;

1009:   VecGetArray(bb,&b);
1010:   VecGetArray(xx,&x);
1011:   t  = a->solve_work;

1013:   ISGetIndices(isrow,&rout); r = rout;
1014:   ISGetIndices(iscol,&cout); c = cout;

1016:   /* copy the b into temp work space according to permutation */
1017:   ii = 0;
1018:   for (i=0; i<n; i++) {
1019:     ic      = 7*c[i];
1020:     t[ii]   = b[ic];
1021:     t[ii+1] = b[ic+1];
1022:     t[ii+2] = b[ic+2];
1023:     t[ii+3] = b[ic+3];
1024:     t[ii+4] = b[ic+4];
1025:     t[ii+5] = b[ic+5];
1026:     t[ii+6] = b[ic+6];
1027:     ii += 7;
1028:   }

1030:   /* forward solve the U^T */
1031:   idx = 0;
1032:   for (i=0; i<n; i++) {

1034:     v     = aa + 49*diag[i];
1035:     /* multiply by the inverse of the block diagonal */
1036:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1037:     x6    = t[5+idx]; x7 = t[6+idx];
1038:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1039:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1040:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1041:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1042:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1043:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1044:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1045:     v += 49;

1047:     vi    = aj + diag[i] + 1;
1048:     nz    = ai[i+1] - diag[i] - 1;
1049:     while (nz--) {
1050:       oidx = 7*(*vi++);
1051:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1052:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1053:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1054:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1055:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1056:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1057:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1058:       v  += 49;
1059:     }
1060:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1061:     t[5+idx] = s6;t[6+idx] = s7;
1062:     idx += 7;
1063:   }
1064:   /* backward solve the L^T */
1065:   for (i=n-1; i>=0; i--){
1066:     v    = aa + 49*diag[i] - 49;
1067:     vi   = aj + diag[i] - 1;
1068:     nz   = diag[i] - ai[i];
1069:     idt  = 7*i;
1070:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1071:     s6 = t[5+idt];s7 = t[6+idt];
1072:     while (nz--) {
1073:       idx   = 7*(*vi--);
1074:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1075:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1076:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1077:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1078:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1079:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1080:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1081:       v -= 49;
1082:     }
1083:   }

1085:   /* copy t into x according to permutation */
1086:   ii = 0;
1087:   for (i=0; i<n; i++) {
1088:     ir      = 7*r[i];
1089:     x[ir]   = t[ii];
1090:     x[ir+1] = t[ii+1];
1091:     x[ir+2] = t[ii+2];
1092:     x[ir+3] = t[ii+3];
1093:     x[ir+4] = t[ii+4];
1094:     x[ir+5] = t[ii+5];
1095:     x[ir+6] = t[ii+6];
1096:     ii += 7;
1097:   }

1099:   ISRestoreIndices(isrow,&rout);
1100:   ISRestoreIndices(iscol,&cout);
1101:   VecRestoreArray(bb,&b);
1102:   VecRestoreArray(xx,&x);
1103:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1104:   return(0);
1105: }

1107: /* ----------------------------------------------------------- */
1110: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1111: {
1112:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1113:   IS             iscol=a->col,isrow=a->row;
1115:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1116:   PetscInt       nz,bs=A->bs,bs2=a->bs2,*rout,*cout;
1117:   MatScalar      *aa=a->a,*v;
1118:   PetscScalar    *x,*b,*s,*t,*ls;

1121:   VecGetArray(bb,&b);
1122:   VecGetArray(xx,&x);
1123:   t  = a->solve_work;

1125:   ISGetIndices(isrow,&rout); r = rout;
1126:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1128:   /* forward solve the lower triangular */
1129:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1130:   for (i=1; i<n; i++) {
1131:     v   = aa + bs2*ai[i];
1132:     vi  = aj + ai[i];
1133:     nz  = a->diag[i] - ai[i];
1134:     s = t + bs*i;
1135:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1136:     while (nz--) {
1137:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1138:       v += bs2;
1139:     }
1140:   }
1141:   /* backward solve the upper triangular */
1142:   ls = a->solve_work + A->n;
1143:   for (i=n-1; i>=0; i--){
1144:     v   = aa + bs2*(a->diag[i] + 1);
1145:     vi  = aj + a->diag[i] + 1;
1146:     nz  = ai[i+1] - a->diag[i] - 1;
1147:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1148:     while (nz--) {
1149:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1150:       v += bs2;
1151:     }
1152:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1153:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1154:   }

1156:   ISRestoreIndices(isrow,&rout);
1157:   ISRestoreIndices(iscol,&cout);
1158:   VecRestoreArray(bb,&b);
1159:   VecRestoreArray(xx,&x);
1160:   PetscLogFlops(2*(a->bs2)*(a->nz) - A->bs*A->n);
1161:   return(0);
1162: }

1166: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1167: {
1168:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1169:   IS             iscol=a->col,isrow=a->row;
1171:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1172:   PetscInt       *diag = a->diag;
1173:   MatScalar      *aa=a->a,*v;
1174:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1175:   PetscScalar    *x,*b,*t;

1178:   VecGetArray(bb,&b);
1179:   VecGetArray(xx,&x);
1180:   t  = a->solve_work;

1182:   ISGetIndices(isrow,&rout); r = rout;
1183:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1185:   /* forward solve the lower triangular */
1186:   idx    = 7*(*r++);
1187:   t[0] = b[idx];   t[1] = b[1+idx];
1188:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1189:   t[5] = b[5+idx]; t[6] = b[6+idx];

1191:   for (i=1; i<n; i++) {
1192:     v     = aa + 49*ai[i];
1193:     vi    = aj + ai[i];
1194:     nz    = diag[i] - ai[i];
1195:     idx   = 7*(*r++);
1196:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1197:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1198:     while (nz--) {
1199:       idx   = 7*(*vi++);
1200:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1201:       x4    = t[3+idx];x5 = t[4+idx];
1202:       x6    = t[5+idx];x7 = t[6+idx];
1203:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1204:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1205:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1206:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1207:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1208:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1209:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1210:       v += 49;
1211:     }
1212:     idx = 7*i;
1213:     t[idx]   = s1;t[1+idx] = s2;
1214:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1215:     t[5+idx] = s6;t[6+idx] = s7;
1216:   }
1217:   /* backward solve the upper triangular */
1218:   for (i=n-1; i>=0; i--){
1219:     v    = aa + 49*diag[i] + 49;
1220:     vi   = aj + diag[i] + 1;
1221:     nz   = ai[i+1] - diag[i] - 1;
1222:     idt  = 7*i;
1223:     s1 = t[idt];  s2 = t[1+idt];
1224:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1225:     s6 = t[5+idt];s7 = t[6+idt];
1226:     while (nz--) {
1227:       idx   = 7*(*vi++);
1228:       x1    = t[idx];   x2 = t[1+idx];
1229:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1230:       x6    = t[5+idx]; x7 = t[6+idx];
1231:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1232:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1233:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1234:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1235:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1236:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1237:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1238:       v += 49;
1239:     }
1240:     idc = 7*(*c--);
1241:     v   = aa + 49*diag[i];
1242:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1243:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1244:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1245:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1246:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1247:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1248:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1249:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1250:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1251:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1252:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1253:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1254:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1255:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1256:   }

1258:   ISRestoreIndices(isrow,&rout);
1259:   ISRestoreIndices(iscol,&cout);
1260:   VecRestoreArray(bb,&b);
1261:   VecRestoreArray(xx,&x);
1262:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1263:   return(0);
1264: }

1268: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1269: {
1270:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1271:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1273:   PetscInt       *diag = a->diag,jdx;
1274:   MatScalar      *aa=a->a,*v;
1275:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

1278:   VecGetArray(bb,&b);
1279:   VecGetArray(xx,&x);
1280:   /* forward solve the lower triangular */
1281:   idx    = 0;
1282:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1283:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1284:   x[6] = b[6+idx];
1285:   for (i=1; i<n; i++) {
1286:     v     =  aa + 49*ai[i];
1287:     vi    =  aj + ai[i];
1288:     nz    =  diag[i] - ai[i];
1289:     idx   =  7*i;
1290:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1291:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1292:     s7  =  b[6+idx];
1293:     while (nz--) {
1294:       jdx   = 7*(*vi++);
1295:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1296:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1297:       x7    = x[6+jdx];
1298:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1299:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1300:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1301:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1302:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1303:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1304:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1305:       v += 49;
1306:      }
1307:     x[idx]   = s1;
1308:     x[1+idx] = s2;
1309:     x[2+idx] = s3;
1310:     x[3+idx] = s4;
1311:     x[4+idx] = s5;
1312:     x[5+idx] = s6;
1313:     x[6+idx] = s7;
1314:   }
1315:   /* backward solve the upper triangular */
1316:   for (i=n-1; i>=0; i--){
1317:     v    = aa + 49*diag[i] + 49;
1318:     vi   = aj + diag[i] + 1;
1319:     nz   = ai[i+1] - diag[i] - 1;
1320:     idt  = 7*i;
1321:     s1 = x[idt];   s2 = x[1+idt];
1322:     s3 = x[2+idt]; s4 = x[3+idt];
1323:     s5 = x[4+idt]; s6 = x[5+idt];
1324:     s7 = x[6+idt];
1325:     while (nz--) {
1326:       idx   = 7*(*vi++);
1327:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1328:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1329:       x7    = x[6+idx];
1330:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1331:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1332:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1333:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1334:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1335:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1336:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1337:       v += 49;
1338:     }
1339:     v        = aa + 49*diag[i];
1340:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1341:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1342:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1343:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1344:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1345:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1346:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1347:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1348:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1349:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1350:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1351:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1352:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1353:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1354:   }

1356:   VecRestoreArray(bb,&b);
1357:   VecRestoreArray(xx,&x);
1358:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1359:   return(0);
1360: }

1364: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1365: {
1366:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1367:   IS             iscol=a->col,isrow=a->row;
1369:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1370:   PetscInt       *diag = a->diag;
1371:   MatScalar      *aa=a->a,*v;
1372:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;

1375:   VecGetArray(bb,&b);
1376:   VecGetArray(xx,&x);
1377:   t  = a->solve_work;

1379:   ISGetIndices(isrow,&rout); r = rout;
1380:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1382:   /* forward solve the lower triangular */
1383:   idx    = 6*(*r++);
1384:   t[0] = b[idx];   t[1] = b[1+idx];
1385:   t[2] = b[2+idx]; t[3] = b[3+idx];
1386:   t[4] = b[4+idx]; t[5] = b[5+idx];
1387:   for (i=1; i<n; i++) {
1388:     v     = aa + 36*ai[i];
1389:     vi    = aj + ai[i];
1390:     nz    = diag[i] - ai[i];
1391:     idx   = 6*(*r++);
1392:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1393:     s5  = b[4+idx]; s6 = b[5+idx];
1394:     while (nz--) {
1395:       idx   = 6*(*vi++);
1396:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1397:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1398:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1399:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1400:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1401:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1402:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1403:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1404:       v += 36;
1405:     }
1406:     idx = 6*i;
1407:     t[idx]   = s1;t[1+idx] = s2;
1408:     t[2+idx] = s3;t[3+idx] = s4;
1409:     t[4+idx] = s5;t[5+idx] = s6;
1410:   }
1411:   /* backward solve the upper triangular */
1412:   for (i=n-1; i>=0; i--){
1413:     v    = aa + 36*diag[i] + 36;
1414:     vi   = aj + diag[i] + 1;
1415:     nz   = ai[i+1] - diag[i] - 1;
1416:     idt  = 6*i;
1417:     s1 = t[idt];  s2 = t[1+idt];
1418:     s3 = t[2+idt];s4 = t[3+idt];
1419:     s5 = t[4+idt];s6 = t[5+idt];
1420:     while (nz--) {
1421:       idx   = 6*(*vi++);
1422:       x1    = t[idx];   x2 = t[1+idx];
1423:       x3    = t[2+idx]; x4 = t[3+idx];
1424:       x5    = t[4+idx]; x6 = t[5+idx];
1425:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1426:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1427:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1428:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1429:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1430:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1431:       v += 36;
1432:     }
1433:     idc = 6*(*c--);
1434:     v   = aa + 36*diag[i];
1435:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1436:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1437:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1438:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1439:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1440:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1441:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1442:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1443:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1444:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1445:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1446:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1447:   }

1449:   ISRestoreIndices(isrow,&rout);
1450:   ISRestoreIndices(iscol,&cout);
1451:   VecRestoreArray(bb,&b);
1452:   VecRestoreArray(xx,&x);
1453:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1454:   return(0);
1455: }

1459: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1460: {
1461:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1462:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1464:   PetscInt       *diag = a->diag,jdx;
1465:   MatScalar      *aa=a->a,*v;
1466:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

1469:   VecGetArray(bb,&b);
1470:   VecGetArray(xx,&x);
1471:   /* forward solve the lower triangular */
1472:   idx    = 0;
1473:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1474:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1475:   for (i=1; i<n; i++) {
1476:     v     =  aa + 36*ai[i];
1477:     vi    =  aj + ai[i];
1478:     nz    =  diag[i] - ai[i];
1479:     idx   =  6*i;
1480:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1481:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1482:     while (nz--) {
1483:       jdx   = 6*(*vi++);
1484:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1485:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1486:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1487:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1488:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1489:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1490:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1491:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1492:       v += 36;
1493:      }
1494:     x[idx]   = s1;
1495:     x[1+idx] = s2;
1496:     x[2+idx] = s3;
1497:     x[3+idx] = s4;
1498:     x[4+idx] = s5;
1499:     x[5+idx] = s6;
1500:   }
1501:   /* backward solve the upper triangular */
1502:   for (i=n-1; i>=0; i--){
1503:     v    = aa + 36*diag[i] + 36;
1504:     vi   = aj + diag[i] + 1;
1505:     nz   = ai[i+1] - diag[i] - 1;
1506:     idt  = 6*i;
1507:     s1 = x[idt];   s2 = x[1+idt];
1508:     s3 = x[2+idt]; s4 = x[3+idt];
1509:     s5 = x[4+idt]; s6 = x[5+idt];
1510:     while (nz--) {
1511:       idx   = 6*(*vi++);
1512:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1513:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1514:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1515:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1516:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1517:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1518:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1519:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1520:       v += 36;
1521:     }
1522:     v        = aa + 36*diag[i];
1523:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1524:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1525:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1526:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1527:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1528:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1529:   }

1531:   VecRestoreArray(bb,&b);
1532:   VecRestoreArray(xx,&x);
1533:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1534:   return(0);
1535: }

1539: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1540: {
1541:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1542:   IS             iscol=a->col,isrow=a->row;
1544:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1545:   PetscInt       *diag = a->diag;
1546:   MatScalar      *aa=a->a,*v;
1547:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;

1550:   VecGetArray(bb,&b);
1551:   VecGetArray(xx,&x);
1552:   t  = a->solve_work;

1554:   ISGetIndices(isrow,&rout); r = rout;
1555:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1557:   /* forward solve the lower triangular */
1558:   idx    = 5*(*r++);
1559:   t[0] = b[idx];   t[1] = b[1+idx];
1560:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1561:   for (i=1; i<n; i++) {
1562:     v     = aa + 25*ai[i];
1563:     vi    = aj + ai[i];
1564:     nz    = diag[i] - ai[i];
1565:     idx   = 5*(*r++);
1566:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1567:     s5  = b[4+idx];
1568:     while (nz--) {
1569:       idx   = 5*(*vi++);
1570:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1571:       x4    = t[3+idx];x5 = t[4+idx];
1572:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1573:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1574:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1575:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1576:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1577:       v += 25;
1578:     }
1579:     idx = 5*i;
1580:     t[idx]   = s1;t[1+idx] = s2;
1581:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1582:   }
1583:   /* backward solve the upper triangular */
1584:   for (i=n-1; i>=0; i--){
1585:     v    = aa + 25*diag[i] + 25;
1586:     vi   = aj + diag[i] + 1;
1587:     nz   = ai[i+1] - diag[i] - 1;
1588:     idt  = 5*i;
1589:     s1 = t[idt];  s2 = t[1+idt];
1590:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1591:     while (nz--) {
1592:       idx   = 5*(*vi++);
1593:       x1    = t[idx];   x2 = t[1+idx];
1594:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1595:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1596:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1597:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1598:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1599:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1600:       v += 25;
1601:     }
1602:     idc = 5*(*c--);
1603:     v   = aa + 25*diag[i];
1604:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1605:                                  v[15]*s4+v[20]*s5;
1606:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1607:                                  v[16]*s4+v[21]*s5;
1608:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1609:                                  v[17]*s4+v[22]*s5;
1610:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1611:                                  v[18]*s4+v[23]*s5;
1612:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1613:                                  v[19]*s4+v[24]*s5;
1614:   }

1616:   ISRestoreIndices(isrow,&rout);
1617:   ISRestoreIndices(iscol,&cout);
1618:   VecRestoreArray(bb,&b);
1619:   VecRestoreArray(xx,&x);
1620:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1621:   return(0);
1622: }

1626: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1627: {
1628:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1629:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1631:   PetscInt       *diag = a->diag,jdx;
1632:   MatScalar      *aa=a->a,*v;
1633:   PetscScalar    *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;

1636:   VecGetArray(bb,&b);
1637:   VecGetArray(xx,&x);
1638:   /* forward solve the lower triangular */
1639:   idx    = 0;
1640:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1641:   for (i=1; i<n; i++) {
1642:     v     =  aa + 25*ai[i];
1643:     vi    =  aj + ai[i];
1644:     nz    =  diag[i] - ai[i];
1645:     idx   =  5*i;
1646:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1647:     while (nz--) {
1648:       jdx   = 5*(*vi++);
1649:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1650:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1651:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1652:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1653:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1654:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1655:       v    += 25;
1656:     }
1657:     x[idx]   = s1;
1658:     x[1+idx] = s2;
1659:     x[2+idx] = s3;
1660:     x[3+idx] = s4;
1661:     x[4+idx] = s5;
1662:   }
1663:   /* backward solve the upper triangular */
1664:   for (i=n-1; i>=0; i--){
1665:     v    = aa + 25*diag[i] + 25;
1666:     vi   = aj + diag[i] + 1;
1667:     nz   = ai[i+1] - diag[i] - 1;
1668:     idt  = 5*i;
1669:     s1 = x[idt];  s2 = x[1+idt];
1670:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1671:     while (nz--) {
1672:       idx   = 5*(*vi++);
1673:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1674:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1675:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1676:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1677:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1678:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1679:       v    += 25;
1680:     }
1681:     v        = aa + 25*diag[i];
1682:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1683:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1684:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1685:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1686:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1687:   }

1689:   VecRestoreArray(bb,&b);
1690:   VecRestoreArray(xx,&x);
1691:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1692:   return(0);
1693: }

1697: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1698: {
1699:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1700:   IS             iscol=a->col,isrow=a->row;
1702:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1703:   PetscInt       *diag = a->diag;
1704:   MatScalar      *aa=a->a,*v;
1705:   PetscScalar    *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;

1708:   VecGetArray(bb,&b);
1709:   VecGetArray(xx,&x);
1710:   t  = a->solve_work;

1712:   ISGetIndices(isrow,&rout); r = rout;
1713:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1715:   /* forward solve the lower triangular */
1716:   idx    = 4*(*r++);
1717:   t[0] = b[idx];   t[1] = b[1+idx];
1718:   t[2] = b[2+idx]; t[3] = b[3+idx];
1719:   for (i=1; i<n; i++) {
1720:     v     = aa + 16*ai[i];
1721:     vi    = aj + ai[i];
1722:     nz    = diag[i] - ai[i];
1723:     idx   = 4*(*r++);
1724:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1725:     while (nz--) {
1726:       idx   = 4*(*vi++);
1727:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1728:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1729:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1730:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1731:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1732:       v    += 16;
1733:     }
1734:     idx        = 4*i;
1735:     t[idx]   = s1;t[1+idx] = s2;
1736:     t[2+idx] = s3;t[3+idx] = s4;
1737:   }
1738:   /* backward solve the upper triangular */
1739:   for (i=n-1; i>=0; i--){
1740:     v    = aa + 16*diag[i] + 16;
1741:     vi   = aj + diag[i] + 1;
1742:     nz   = ai[i+1] - diag[i] - 1;
1743:     idt  = 4*i;
1744:     s1 = t[idt];  s2 = t[1+idt];
1745:     s3 = t[2+idt];s4 = t[3+idt];
1746:     while (nz--) {
1747:       idx   = 4*(*vi++);
1748:       x1    = t[idx];   x2 = t[1+idx];
1749:       x3    = t[2+idx]; x4 = t[3+idx];
1750:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1751:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1752:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1753:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1754:       v += 16;
1755:     }
1756:     idc      = 4*(*c--);
1757:     v        = aa + 16*diag[i];
1758:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1759:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1760:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1761:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1762:   }

1764:   ISRestoreIndices(isrow,&rout);
1765:   ISRestoreIndices(iscol,&cout);
1766:   VecRestoreArray(bb,&b);
1767:   VecRestoreArray(xx,&x);
1768:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1769:   return(0);
1770: }

1774: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1775: {
1776:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1777:   IS             iscol=a->col,isrow=a->row;
1779:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1780:   PetscInt       *diag = a->diag;
1781:   MatScalar      *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1782:   PetscScalar    *x,*b;

1785:   VecGetArray(bb,&b);
1786:   VecGetArray(xx,&x);
1787:   t  = (MatScalar *)a->solve_work;

1789:   ISGetIndices(isrow,&rout); r = rout;
1790:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1792:   /* forward solve the lower triangular */
1793:   idx    = 4*(*r++);
1794:   t[0] = (MatScalar)b[idx];
1795:   t[1] = (MatScalar)b[1+idx];
1796:   t[2] = (MatScalar)b[2+idx];
1797:   t[3] = (MatScalar)b[3+idx];
1798:   for (i=1; i<n; i++) {
1799:     v     = aa + 16*ai[i];
1800:     vi    = aj + ai[i];
1801:     nz    = diag[i] - ai[i];
1802:     idx   = 4*(*r++);
1803:     s1 = (MatScalar)b[idx];
1804:     s2 = (MatScalar)b[1+idx];
1805:     s3 = (MatScalar)b[2+idx];
1806:     s4 = (MatScalar)b[3+idx];
1807:     while (nz--) {
1808:       idx   = 4*(*vi++);
1809:       x1  = t[idx];
1810:       x2  = t[1+idx];
1811:       x3  = t[2+idx];
1812:       x4  = t[3+idx];
1813:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1814:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1815:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1816:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1817:       v    += 16;
1818:     }
1819:     idx        = 4*i;
1820:     t[idx]   = s1;
1821:     t[1+idx] = s2;
1822:     t[2+idx] = s3;
1823:     t[3+idx] = s4;
1824:   }
1825:   /* backward solve the upper triangular */
1826:   for (i=n-1; i>=0; i--){
1827:     v    = aa + 16*diag[i] + 16;
1828:     vi   = aj + diag[i] + 1;
1829:     nz   = ai[i+1] - diag[i] - 1;
1830:     idt  = 4*i;
1831:     s1 = t[idt];
1832:     s2 = t[1+idt];
1833:     s3 = t[2+idt];
1834:     s4 = t[3+idt];
1835:     while (nz--) {
1836:       idx   = 4*(*vi++);
1837:       x1  = t[idx];
1838:       x2  = t[1+idx];
1839:       x3  = t[2+idx];
1840:       x4  = t[3+idx];
1841:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1842:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1843:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1844:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1845:       v += 16;
1846:     }
1847:     idc      = 4*(*c--);
1848:     v        = aa + 16*diag[i];
1849:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1850:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1851:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1852:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1853:     x[idc]   = (PetscScalar)t[idt];
1854:     x[1+idc] = (PetscScalar)t[1+idt];
1855:     x[2+idc] = (PetscScalar)t[2+idt];
1856:     x[3+idc] = (PetscScalar)t[3+idt];
1857:  }

1859:   ISRestoreIndices(isrow,&rout);
1860:   ISRestoreIndices(iscol,&cout);
1861:   VecRestoreArray(bb,&b);
1862:   VecRestoreArray(xx,&x);
1863:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1864:   return(0);
1865: }

1867: #if defined (PETSC_HAVE_SSE)

1869: #include PETSC_HAVE_SSE

1873: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1874: {
1875:   /* 
1876:      Note: This code uses demotion of double
1877:      to float when performing the mixed-mode computation.
1878:      This may not be numerically reasonable for all applications.
1879:   */
1880:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1881:   IS             iscol=a->col,isrow=a->row;
1883:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1884:   PetscInt       *diag = a->diag,ai16;
1885:   MatScalar      *aa=a->a,*v;
1886:   PetscScalar    *x,*b,*t;

1888:   /* Make space in temp stack for 16 Byte Aligned arrays */
1889:   float           ssealignedspace[11],*tmps,*tmpx;
1890:   unsigned long   offset;
1891: 
1893:   SSE_SCOPE_BEGIN;

1895:     offset = (unsigned long)ssealignedspace % 16;
1896:     if (offset) offset = (16 - offset)/4;
1897:     tmps = &ssealignedspace[offset];
1898:     tmpx = &ssealignedspace[offset+4];
1899:     PREFETCH_NTA(aa+16*ai[1]);

1901:     VecGetArray(bb,&b);
1902:     VecGetArray(xx,&x);
1903:     t  = a->solve_work;

1905:     ISGetIndices(isrow,&rout); r = rout;
1906:     ISGetIndices(iscol,&cout); c = cout + (n-1);

1908:     /* forward solve the lower triangular */
1909:     idx  = 4*(*r++);
1910:     t[0] = b[idx];   t[1] = b[1+idx];
1911:     t[2] = b[2+idx]; t[3] = b[3+idx];
1912:     v    =  aa + 16*ai[1];

1914:     for (i=1; i<n;) {
1915:       PREFETCH_NTA(&v[8]);
1916:       vi   =  aj      + ai[i];
1917:       nz   =  diag[i] - ai[i];
1918:       idx  =  4*(*r++);

1920:       /* Demote sum from double to float */
1921:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1922:       LOAD_PS(tmps,XMM7);

1924:       while (nz--) {
1925:         PREFETCH_NTA(&v[16]);
1926:         idx = 4*(*vi++);
1927: 
1928:         /* Demote solution (so far) from double to float */
1929:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

1931:         /* 4x4 Matrix-Vector product with negative accumulation: */
1932:         SSE_INLINE_BEGIN_2(tmpx,v)
1933:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1935:           /* First Column */
1936:           SSE_COPY_PS(XMM0,XMM6)
1937:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1938:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1939:           SSE_SUB_PS(XMM7,XMM0)
1940: 
1941:           /* Second Column */
1942:           SSE_COPY_PS(XMM1,XMM6)
1943:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1944:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1945:           SSE_SUB_PS(XMM7,XMM1)
1946: 
1947:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1948: 
1949:           /* Third Column */
1950:           SSE_COPY_PS(XMM2,XMM6)
1951:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1952:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1953:           SSE_SUB_PS(XMM7,XMM2)

1955:           /* Fourth Column */
1956:           SSE_COPY_PS(XMM3,XMM6)
1957:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1958:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1959:           SSE_SUB_PS(XMM7,XMM3)
1960:         SSE_INLINE_END_2
1961: 
1962:         v  += 16;
1963:       }
1964:       idx = 4*i;
1965:       v   = aa + 16*ai[++i];
1966:       PREFETCH_NTA(v);
1967:       STORE_PS(tmps,XMM7);

1969:       /* Promote result from float to double */
1970:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1971:     }
1972:     /* backward solve the upper triangular */
1973:     idt  = 4*(n-1);
1974:     ai16 = 16*diag[n-1];
1975:     v    = aa + ai16 + 16;
1976:     for (i=n-1; i>=0;){
1977:       PREFETCH_NTA(&v[8]);
1978:       vi = aj + diag[i] + 1;
1979:       nz = ai[i+1] - diag[i] - 1;
1980: 
1981:       /* Demote accumulator from double to float */
1982:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1983:       LOAD_PS(tmps,XMM7);

1985:       while (nz--) {
1986:         PREFETCH_NTA(&v[16]);
1987:         idx = 4*(*vi++);

1989:         /* Demote solution (so far) from double to float */
1990:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

1992:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1993:         SSE_INLINE_BEGIN_2(tmpx,v)
1994:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1996:           /* First Column */
1997:           SSE_COPY_PS(XMM0,XMM6)
1998:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1999:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2000:           SSE_SUB_PS(XMM7,XMM0)

2002:           /* Second Column */
2003:           SSE_COPY_PS(XMM1,XMM6)
2004:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2005:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2006:           SSE_SUB_PS(XMM7,XMM1)

2008:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2009: 
2010:           /* Third Column */
2011:           SSE_COPY_PS(XMM2,XMM6)
2012:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2013:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2014:           SSE_SUB_PS(XMM7,XMM2)

2016:           /* Fourth Column */
2017:           SSE_COPY_PS(XMM3,XMM6)
2018:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2019:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2020:           SSE_SUB_PS(XMM7,XMM3)
2021:         SSE_INLINE_END_2
2022:         v  += 16;
2023:       }
2024:       v    = aa + ai16;
2025:       ai16 = 16*diag[--i];
2026:       PREFETCH_NTA(aa+ai16+16);
2027:       /* 
2028:          Scale the result by the diagonal 4x4 block, 
2029:          which was inverted as part of the factorization
2030:       */
2031:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2032:         /* First Column */
2033:         SSE_COPY_PS(XMM0,XMM7)
2034:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2035:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2037:         /* Second Column */
2038:         SSE_COPY_PS(XMM1,XMM7)
2039:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2040:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2041:         SSE_ADD_PS(XMM0,XMM1)

2043:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2044: 
2045:         /* Third Column */
2046:         SSE_COPY_PS(XMM2,XMM7)
2047:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2048:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2049:         SSE_ADD_PS(XMM0,XMM2)

2051:         /* Fourth Column */
2052:         SSE_COPY_PS(XMM3,XMM7)
2053:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2054:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2055:         SSE_ADD_PS(XMM0,XMM3)
2056: 
2057:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2058:       SSE_INLINE_END_3

2060:       /* Promote solution from float to double */
2061:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

2063:       /* Apply reordering to t and stream into x.    */
2064:       /* This way, x doesn't pollute the cache.      */
2065:       /* Be careful with size: 2 doubles = 4 floats! */
2066:       idc  = 4*(*c--);
2067:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2068:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2069:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2070:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2071:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2072:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2073:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2074:       SSE_INLINE_END_2
2075:       v    = aa + ai16 + 16;
2076:       idt -= 4;
2077:     }

2079:     ISRestoreIndices(isrow,&rout);
2080:     ISRestoreIndices(iscol,&cout);
2081:     VecRestoreArray(bb,&b);
2082:     VecRestoreArray(xx,&x);
2083:     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2084:   SSE_SCOPE_END;
2085:   return(0);
2086: }

2088: #endif


2091: /*
2092:       Special case where the matrix was ILU(0) factored in the natural
2093:    ordering. This eliminates the need for the column and row permutation.
2094: */
2097: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2098: {
2099:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2100:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2102:   PetscInt       *diag = a->diag;
2103:   MatScalar      *aa=a->a;
2104:   PetscScalar    *x,*b;

2107:   VecGetArray(bb,&b);
2108:   VecGetArray(xx,&x);

2110: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2111:   {
2112:     static PetscScalar w[2000]; /* very BAD need to fix */
2113:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2114:   }
2115: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2116:   {
2117:     static PetscScalar w[2000]; /* very BAD need to fix */
2118:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2119:   }
2120: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2121:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2122: #else
2123:   {
2124:     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2125:     MatScalar    *v;
2126:     PetscInt          jdx,idt,idx,nz,*vi,i,ai16;

2128:   /* forward solve the lower triangular */
2129:   idx    = 0;
2130:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2131:   for (i=1; i<n; i++) {
2132:     v     =  aa      + 16*ai[i];
2133:     vi    =  aj      + ai[i];
2134:     nz    =  diag[i] - ai[i];
2135:     idx   +=  4;
2136:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2137:     while (nz--) {
2138:       jdx   = 4*(*vi++);
2139:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2140:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2141:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2142:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2143:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2144:       v    += 16;
2145:     }
2146:     x[idx]   = s1;
2147:     x[1+idx] = s2;
2148:     x[2+idx] = s3;
2149:     x[3+idx] = s4;
2150:   }
2151:   /* backward solve the upper triangular */
2152:   idt = 4*(n-1);
2153:   for (i=n-1; i>=0; i--){
2154:     ai16 = 16*diag[i];
2155:     v    = aa + ai16 + 16;
2156:     vi   = aj + diag[i] + 1;
2157:     nz   = ai[i+1] - diag[i] - 1;
2158:     s1 = x[idt];  s2 = x[1+idt];
2159:     s3 = x[2+idt];s4 = x[3+idt];
2160:     while (nz--) {
2161:       idx   = 4*(*vi++);
2162:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2163:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2164:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2165:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2166:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2167:       v    += 16;
2168:     }
2169:     v        = aa + ai16;
2170:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2171:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2172:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2173:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2174:     idt -= 4;
2175:   }
2176:   }
2177: #endif

2179:   VecRestoreArray(bb,&b);
2180:   VecRestoreArray(xx,&x);
2181:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2182:   return(0);
2183: }

2187: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2188: {
2189:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2190:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2192:   PetscInt       *diag = a->diag;
2193:   MatScalar      *aa=a->a;
2194:   PetscScalar    *x,*b;

2197:   VecGetArray(bb,&b);
2198:   VecGetArray(xx,&x);

2200:   {
2201:     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2202:     MatScalar  *v,*t=(MatScalar *)x;
2203:     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;

2205:     /* forward solve the lower triangular */
2206:     idx  = 0;
2207:     t[0] = (MatScalar)b[0];
2208:     t[1] = (MatScalar)b[1];
2209:     t[2] = (MatScalar)b[2];
2210:     t[3] = (MatScalar)b[3];
2211:     for (i=1; i<n; i++) {
2212:       v     =  aa      + 16*ai[i];
2213:       vi    =  aj      + ai[i];
2214:       nz    =  diag[i] - ai[i];
2215:       idx   +=  4;
2216:       s1 = (MatScalar)b[idx];
2217:       s2 = (MatScalar)b[1+idx];
2218:       s3 = (MatScalar)b[2+idx];
2219:       s4 = (MatScalar)b[3+idx];
2220:       while (nz--) {
2221:         jdx = 4*(*vi++);
2222:         x1  = t[jdx];
2223:         x2  = t[1+jdx];
2224:         x3  = t[2+jdx];
2225:         x4  = t[3+jdx];
2226:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2227:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2228:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2229:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2230:         v    += 16;
2231:       }
2232:       t[idx]   = s1;
2233:       t[1+idx] = s2;
2234:       t[2+idx] = s3;
2235:       t[3+idx] = s4;
2236:     }
2237:     /* backward solve the upper triangular */
2238:     idt = 4*(n-1);
2239:     for (i=n-1; i>=0; i--){
2240:       ai16 = 16*diag[i];
2241:       v    = aa + ai16 + 16;
2242:       vi   = aj + diag[i] + 1;
2243:       nz   = ai[i+1] - diag[i] - 1;
2244:       s1   = t[idt];
2245:       s2   = t[1+idt];
2246:       s3   = t[2+idt];
2247:       s4   = t[3+idt];
2248:       while (nz--) {
2249:         idx = 4*(*vi++);
2250:         x1  = (MatScalar)x[idx];
2251:         x2  = (MatScalar)x[1+idx];
2252:         x3  = (MatScalar)x[2+idx];
2253:         x4  = (MatScalar)x[3+idx];
2254:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2255:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2256:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2257:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2258:         v    += 16;
2259:       }
2260:       v        = aa + ai16;
2261:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2262:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2263:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2264:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2265:       idt -= 4;
2266:     }
2267:   }

2269:   VecRestoreArray(bb,&b);
2270:   VecRestoreArray(xx,&x);
2271:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2272:   return(0);
2273: }

2275: #if defined (PETSC_HAVE_SSE)

2277: #include PETSC_HAVE_SSE
2280: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2281: {
2282:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2283:   unsigned short *aj=(unsigned short *)a->j;
2285:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2286:   MatScalar      *aa=a->a;
2287:   PetscScalar    *x,*b;

2290:   SSE_SCOPE_BEGIN;
2291:   /* 
2292:      Note: This code currently uses demotion of double
2293:      to float when performing the mixed-mode computation.
2294:      This may not be numerically reasonable for all applications.
2295:   */
2296:   PREFETCH_NTA(aa+16*ai[1]);

2298:   VecGetArray(bb,&b);
2299:   VecGetArray(xx,&x);
2300:   {
2301:     /* x will first be computed in single precision then promoted inplace to double */
2302:     MatScalar      *v,*t=(MatScalar *)x;
2303:     int            nz,i,idt,ai16;
2304:     unsigned int   jdx,idx;
2305:     unsigned short *vi;
2306:     /* Forward solve the lower triangular factor. */

2308:     /* First block is the identity. */
2309:     idx  = 0;
2310:     CONVERT_DOUBLE4_FLOAT4(t,b);
2311:     v    =  aa + 16*((unsigned int)ai[1]);

2313:     for (i=1; i<n;) {
2314:       PREFETCH_NTA(&v[8]);
2315:       vi   =  aj      + ai[i];
2316:       nz   =  diag[i] - ai[i];
2317:       idx +=  4;

2319:       /* Demote RHS from double to float. */
2320:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2321:       LOAD_PS(&t[idx],XMM7);

2323:       while (nz--) {
2324:         PREFETCH_NTA(&v[16]);
2325:         jdx = 4*((unsigned int)(*vi++));
2326: 
2327:         /* 4x4 Matrix-Vector product with negative accumulation: */
2328:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2329:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2331:           /* First Column */
2332:           SSE_COPY_PS(XMM0,XMM6)
2333:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2334:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2335:           SSE_SUB_PS(XMM7,XMM0)

2337:           /* Second Column */
2338:           SSE_COPY_PS(XMM1,XMM6)
2339:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2340:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2341:           SSE_SUB_PS(XMM7,XMM1)

2343:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2344: 
2345:           /* Third Column */
2346:           SSE_COPY_PS(XMM2,XMM6)
2347:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2348:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2349:           SSE_SUB_PS(XMM7,XMM2)

2351:           /* Fourth Column */
2352:           SSE_COPY_PS(XMM3,XMM6)
2353:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2354:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2355:           SSE_SUB_PS(XMM7,XMM3)
2356:         SSE_INLINE_END_2
2357: 
2358:         v  += 16;
2359:       }
2360:       v    =  aa + 16*ai[++i];
2361:       PREFETCH_NTA(v);
2362:       STORE_PS(&t[idx],XMM7);
2363:     }

2365:     /* Backward solve the upper triangular factor.*/

2367:     idt  = 4*(n-1);
2368:     ai16 = 16*diag[n-1];
2369:     v    = aa + ai16 + 16;
2370:     for (i=n-1; i>=0;){
2371:       PREFETCH_NTA(&v[8]);
2372:       vi = aj + diag[i] + 1;
2373:       nz = ai[i+1] - diag[i] - 1;
2374: 
2375:       LOAD_PS(&t[idt],XMM7);

2377:       while (nz--) {
2378:         PREFETCH_NTA(&v[16]);
2379:         idx = 4*((unsigned int)(*vi++));

2381:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2382:         SSE_INLINE_BEGIN_2(&t[idx],v)
2383:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2385:           /* First Column */
2386:           SSE_COPY_PS(XMM0,XMM6)
2387:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2388:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2389:           SSE_SUB_PS(XMM7,XMM0)

2391:           /* Second Column */
2392:           SSE_COPY_PS(XMM1,XMM6)
2393:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2394:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2395:           SSE_SUB_PS(XMM7,XMM1)

2397:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2398: 
2399:           /* Third Column */
2400:           SSE_COPY_PS(XMM2,XMM6)
2401:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2402:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2403:           SSE_SUB_PS(XMM7,XMM2)

2405:           /* Fourth Column */
2406:           SSE_COPY_PS(XMM3,XMM6)
2407:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2408:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2409:           SSE_SUB_PS(XMM7,XMM3)
2410:         SSE_INLINE_END_2
2411:         v  += 16;
2412:       }
2413:       v    = aa + ai16;
2414:       ai16 = 16*diag[--i];
2415:       PREFETCH_NTA(aa+ai16+16);
2416:       /* 
2417:          Scale the result by the diagonal 4x4 block, 
2418:          which was inverted as part of the factorization
2419:       */
2420:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2421:         /* First Column */
2422:         SSE_COPY_PS(XMM0,XMM7)
2423:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2424:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2426:         /* Second Column */
2427:         SSE_COPY_PS(XMM1,XMM7)
2428:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2429:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2430:         SSE_ADD_PS(XMM0,XMM1)

2432:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2433: 
2434:         /* Third Column */
2435:         SSE_COPY_PS(XMM2,XMM7)
2436:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2437:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2438:         SSE_ADD_PS(XMM0,XMM2)

2440:         /* Fourth Column */
2441:         SSE_COPY_PS(XMM3,XMM7)
2442:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2443:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2444:         SSE_ADD_PS(XMM0,XMM3)

2446:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2447:       SSE_INLINE_END_3

2449:       v    = aa + ai16 + 16;
2450:       idt -= 4;
2451:     }

2453:     /* Convert t from single precision back to double precision (inplace)*/
2454:     idt = 4*(n-1);
2455:     for (i=n-1;i>=0;i--) {
2456:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2457:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2458:       PetscScalar *xtemp=&x[idt];
2459:       MatScalar   *ttemp=&t[idt];
2460:       xtemp[3] = (PetscScalar)ttemp[3];
2461:       xtemp[2] = (PetscScalar)ttemp[2];
2462:       xtemp[1] = (PetscScalar)ttemp[1];
2463:       xtemp[0] = (PetscScalar)ttemp[0];
2464:       idt -= 4;
2465:     }

2467:   } /* End of artificial scope. */
2468:   VecRestoreArray(bb,&b);
2469:   VecRestoreArray(xx,&x);
2470:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2471:   SSE_SCOPE_END;
2472:   return(0);
2473: }

2477: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2478: {
2479:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2480:   int            *aj=a->j;
2482:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2483:   MatScalar      *aa=a->a;
2484:   PetscScalar    *x,*b;

2487:   SSE_SCOPE_BEGIN;
2488:   /* 
2489:      Note: This code currently uses demotion of double
2490:      to float when performing the mixed-mode computation.
2491:      This may not be numerically reasonable for all applications.
2492:   */
2493:   PREFETCH_NTA(aa+16*ai[1]);

2495:   VecGetArray(bb,&b);
2496:   VecGetArray(xx,&x);
2497:   {
2498:     /* x will first be computed in single precision then promoted inplace to double */
2499:     MatScalar *v,*t=(MatScalar *)x;
2500:     int       nz,i,idt,ai16;
2501:     int       jdx,idx;
2502:     int       *vi;
2503:     /* Forward solve the lower triangular factor. */

2505:     /* First block is the identity. */
2506:     idx  = 0;
2507:     CONVERT_DOUBLE4_FLOAT4(t,b);
2508:     v    =  aa + 16*ai[1];

2510:     for (i=1; i<n;) {
2511:       PREFETCH_NTA(&v[8]);
2512:       vi   =  aj      + ai[i];
2513:       nz   =  diag[i] - ai[i];
2514:       idx +=  4;

2516:       /* Demote RHS from double to float. */
2517:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2518:       LOAD_PS(&t[idx],XMM7);

2520:       while (nz--) {
2521:         PREFETCH_NTA(&v[16]);
2522:         jdx = 4*(*vi++);
2523: /*          jdx = *vi++; */
2524: 
2525:         /* 4x4 Matrix-Vector product with negative accumulation: */
2526:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2527:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2529:           /* First Column */
2530:           SSE_COPY_PS(XMM0,XMM6)
2531:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2532:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2533:           SSE_SUB_PS(XMM7,XMM0)

2535:           /* Second Column */
2536:           SSE_COPY_PS(XMM1,XMM6)
2537:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2538:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2539:           SSE_SUB_PS(XMM7,XMM1)

2541:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2542: 
2543:           /* Third Column */
2544:           SSE_COPY_PS(XMM2,XMM6)
2545:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2546:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2547:           SSE_SUB_PS(XMM7,XMM2)

2549:           /* Fourth Column */
2550:           SSE_COPY_PS(XMM3,XMM6)
2551:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2552:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2553:           SSE_SUB_PS(XMM7,XMM3)
2554:         SSE_INLINE_END_2
2555: 
2556:         v  += 16;
2557:       }
2558:       v    =  aa + 16*ai[++i];
2559:       PREFETCH_NTA(v);
2560:       STORE_PS(&t[idx],XMM7);
2561:     }

2563:     /* Backward solve the upper triangular factor.*/

2565:     idt  = 4*(n-1);
2566:     ai16 = 16*diag[n-1];
2567:     v    = aa + ai16 + 16;
2568:     for (i=n-1; i>=0;){
2569:       PREFETCH_NTA(&v[8]);
2570:       vi = aj + diag[i] + 1;
2571:       nz = ai[i+1] - diag[i] - 1;
2572: 
2573:       LOAD_PS(&t[idt],XMM7);

2575:       while (nz--) {
2576:         PREFETCH_NTA(&v[16]);
2577:         idx = 4*(*vi++);
2578: /*          idx = *vi++; */

2580:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2581:         SSE_INLINE_BEGIN_2(&t[idx],v)
2582:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2584:           /* First Column */
2585:           SSE_COPY_PS(XMM0,XMM6)
2586:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2587:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2588:           SSE_SUB_PS(XMM7,XMM0)

2590:           /* Second Column */
2591:           SSE_COPY_PS(XMM1,XMM6)
2592:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2593:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2594:           SSE_SUB_PS(XMM7,XMM1)

2596:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2597: 
2598:           /* Third Column */
2599:           SSE_COPY_PS(XMM2,XMM6)
2600:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2601:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2602:           SSE_SUB_PS(XMM7,XMM2)

2604:           /* Fourth Column */
2605:           SSE_COPY_PS(XMM3,XMM6)
2606:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2607:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2608:           SSE_SUB_PS(XMM7,XMM3)
2609:         SSE_INLINE_END_2
2610:         v  += 16;
2611:       }
2612:       v    = aa + ai16;
2613:       ai16 = 16*diag[--i];
2614:       PREFETCH_NTA(aa+ai16+16);
2615:       /* 
2616:          Scale the result by the diagonal 4x4 block, 
2617:          which was inverted as part of the factorization
2618:       */
2619:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2620:         /* First Column */
2621:         SSE_COPY_PS(XMM0,XMM7)
2622:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2623:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2625:         /* Second Column */
2626:         SSE_COPY_PS(XMM1,XMM7)
2627:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2628:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2629:         SSE_ADD_PS(XMM0,XMM1)

2631:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2632: 
2633:         /* Third Column */
2634:         SSE_COPY_PS(XMM2,XMM7)
2635:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2636:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2637:         SSE_ADD_PS(XMM0,XMM2)

2639:         /* Fourth Column */
2640:         SSE_COPY_PS(XMM3,XMM7)
2641:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2642:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2643:         SSE_ADD_PS(XMM0,XMM3)

2645:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2646:       SSE_INLINE_END_3

2648:       v    = aa + ai16 + 16;
2649:       idt -= 4;
2650:     }

2652:     /* Convert t from single precision back to double precision (inplace)*/
2653:     idt = 4*(n-1);
2654:     for (i=n-1;i>=0;i--) {
2655:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2656:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2657:       PetscScalar *xtemp=&x[idt];
2658:       MatScalar   *ttemp=&t[idt];
2659:       xtemp[3] = (PetscScalar)ttemp[3];
2660:       xtemp[2] = (PetscScalar)ttemp[2];
2661:       xtemp[1] = (PetscScalar)ttemp[1];
2662:       xtemp[0] = (PetscScalar)ttemp[0];
2663:       idt -= 4;
2664:     }

2666:   } /* End of artificial scope. */
2667:   VecRestoreArray(bb,&b);
2668:   VecRestoreArray(xx,&x);
2669:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2670:   SSE_SCOPE_END;
2671:   return(0);
2672: }

2674: #endif
2677: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2678: {
2679:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2680:   IS             iscol=a->col,isrow=a->row;
2682:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2683:   PetscInt       *diag = a->diag;
2684:   MatScalar      *aa=a->a,*v;
2685:   PetscScalar    *x,*b,s1,s2,s3,x1,x2,x3,*t;

2688:   VecGetArray(bb,&b);
2689:   VecGetArray(xx,&x);
2690:   t  = a->solve_work;

2692:   ISGetIndices(isrow,&rout); r = rout;
2693:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2695:   /* forward solve the lower triangular */
2696:   idx    = 3*(*r++);
2697:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2698:   for (i=1; i<n; i++) {
2699:     v     = aa + 9*ai[i];
2700:     vi    = aj + ai[i];
2701:     nz    = diag[i] - ai[i];
2702:     idx   = 3*(*r++);
2703:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2704:     while (nz--) {
2705:       idx   = 3*(*vi++);
2706:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2707:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2708:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2709:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2710:       v += 9;
2711:     }
2712:     idx = 3*i;
2713:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2714:   }
2715:   /* backward solve the upper triangular */
2716:   for (i=n-1; i>=0; i--){
2717:     v    = aa + 9*diag[i] + 9;
2718:     vi   = aj + diag[i] + 1;
2719:     nz   = ai[i+1] - diag[i] - 1;
2720:     idt  = 3*i;
2721:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2722:     while (nz--) {
2723:       idx   = 3*(*vi++);
2724:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2725:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2726:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2727:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2728:       v += 9;
2729:     }
2730:     idc = 3*(*c--);
2731:     v   = aa + 9*diag[i];
2732:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2733:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2734:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2735:   }
2736:   ISRestoreIndices(isrow,&rout);
2737:   ISRestoreIndices(iscol,&cout);
2738:   VecRestoreArray(bb,&b);
2739:   VecRestoreArray(xx,&x);
2740:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2741:   return(0);
2742: }

2744: /*
2745:       Special case where the matrix was ILU(0) factored in the natural
2746:    ordering. This eliminates the need for the column and row permutation.
2747: */
2750: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2751: {
2752:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2753:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2755:   PetscInt       *diag = a->diag;
2756:   MatScalar      *aa=a->a,*v;
2757:   PetscScalar    *x,*b,s1,s2,s3,x1,x2,x3;
2758:   PetscInt       jdx,idt,idx,nz,*vi,i;

2761:   VecGetArray(bb,&b);
2762:   VecGetArray(xx,&x);


2765:   /* forward solve the lower triangular */
2766:   idx    = 0;
2767:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2768:   for (i=1; i<n; i++) {
2769:     v     =  aa      + 9*ai[i];
2770:     vi    =  aj      + ai[i];
2771:     nz    =  diag[i] - ai[i];
2772:     idx   +=  3;
2773:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2774:     while (nz--) {
2775:       jdx   = 3*(*vi++);
2776:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2777:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2778:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2779:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2780:       v    += 9;
2781:     }
2782:     x[idx]   = s1;
2783:     x[1+idx] = s2;
2784:     x[2+idx] = s3;
2785:   }
2786:   /* backward solve the upper triangular */
2787:   for (i=n-1; i>=0; i--){
2788:     v    = aa + 9*diag[i] + 9;
2789:     vi   = aj + diag[i] + 1;
2790:     nz   = ai[i+1] - diag[i] - 1;
2791:     idt  = 3*i;
2792:     s1 = x[idt];  s2 = x[1+idt];
2793:     s3 = x[2+idt];
2794:     while (nz--) {
2795:       idx   = 3*(*vi++);
2796:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2797:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2798:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2799:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2800:       v    += 9;
2801:     }
2802:     v        = aa +  9*diag[i];
2803:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2804:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2805:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2806:   }

2808:   VecRestoreArray(bb,&b);
2809:   VecRestoreArray(xx,&x);
2810:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2811:   return(0);
2812: }

2816: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2817: {
2818:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2819:   IS             iscol=a->col,isrow=a->row;
2821:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2822:   PetscInt       *diag = a->diag;
2823:   MatScalar      *aa=a->a,*v;
2824:   PetscScalar    *x,*b,s1,s2,x1,x2,*t;

2827:   VecGetArray(bb,&b);
2828:   VecGetArray(xx,&x);
2829:   t  = a->solve_work;

2831:   ISGetIndices(isrow,&rout); r = rout;
2832:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2834:   /* forward solve the lower triangular */
2835:   idx    = 2*(*r++);
2836:   t[0] = b[idx]; t[1] = b[1+idx];
2837:   for (i=1; i<n; i++) {
2838:     v     = aa + 4*ai[i];
2839:     vi    = aj + ai[i];
2840:     nz    = diag[i] - ai[i];
2841:     idx   = 2*(*r++);
2842:     s1  = b[idx]; s2 = b[1+idx];
2843:     while (nz--) {
2844:       idx   = 2*(*vi++);
2845:       x1    = t[idx]; x2 = t[1+idx];
2846:       s1 -= v[0]*x1 + v[2]*x2;
2847:       s2 -= v[1]*x1 + v[3]*x2;
2848:       v += 4;
2849:     }
2850:     idx = 2*i;
2851:     t[idx] = s1; t[1+idx] = s2;
2852:   }
2853:   /* backward solve the upper triangular */
2854:   for (i=n-1; i>=0; i--){
2855:     v    = aa + 4*diag[i] + 4;
2856:     vi   = aj + diag[i] + 1;
2857:     nz   = ai[i+1] - diag[i] - 1;
2858:     idt  = 2*i;
2859:     s1 = t[idt]; s2 = t[1+idt];
2860:     while (nz--) {
2861:       idx   = 2*(*vi++);
2862:       x1    = t[idx]; x2 = t[1+idx];
2863:       s1 -= v[0]*x1 + v[2]*x2;
2864:       s2 -= v[1]*x1 + v[3]*x2;
2865:       v += 4;
2866:     }
2867:     idc = 2*(*c--);
2868:     v   = aa + 4*diag[i];
2869:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2870:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2871:   }
2872:   ISRestoreIndices(isrow,&rout);
2873:   ISRestoreIndices(iscol,&cout);
2874:   VecRestoreArray(bb,&b);
2875:   VecRestoreArray(xx,&x);
2876:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2877:   return(0);
2878: }

2880: /*
2881:       Special case where the matrix was ILU(0) factored in the natural
2882:    ordering. This eliminates the need for the column and row permutation.
2883: */
2886: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2887: {
2888:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2889:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2891:   PetscInt       *diag = a->diag;
2892:   MatScalar      *aa=a->a,*v;
2893:   PetscScalar    *x,*b,s1,s2,x1,x2;
2894:   PetscInt       jdx,idt,idx,nz,*vi,i;

2897:   VecGetArray(bb,&b);
2898:   VecGetArray(xx,&x);

2900:   /* forward solve the lower triangular */
2901:   idx    = 0;
2902:   x[0]   = b[0]; x[1] = b[1];
2903:   for (i=1; i<n; i++) {
2904:     v     =  aa      + 4*ai[i];
2905:     vi    =  aj      + ai[i];
2906:     nz    =  diag[i] - ai[i];
2907:     idx   +=  2;
2908:     s1  =  b[idx];s2 = b[1+idx];
2909:     while (nz--) {
2910:       jdx   = 2*(*vi++);
2911:       x1    = x[jdx];x2 = x[1+jdx];
2912:       s1 -= v[0]*x1 + v[2]*x2;
2913:       s2 -= v[1]*x1 + v[3]*x2;
2914:       v    += 4;
2915:     }
2916:     x[idx]   = s1;
2917:     x[1+idx] = s2;
2918:   }
2919:   /* backward solve the upper triangular */
2920:   for (i=n-1; i>=0; i--){
2921:     v    = aa + 4*diag[i] + 4;
2922:     vi   = aj + diag[i] + 1;
2923:     nz   = ai[i+1] - diag[i] - 1;
2924:     idt  = 2*i;
2925:     s1 = x[idt];  s2 = x[1+idt];
2926:     while (nz--) {
2927:       idx   = 2*(*vi++);
2928:       x1    = x[idx];   x2 = x[1+idx];
2929:       s1 -= v[0]*x1 + v[2]*x2;
2930:       s2 -= v[1]*x1 + v[3]*x2;
2931:       v    += 4;
2932:     }
2933:     v        = aa +  4*diag[i];
2934:     x[idt]   = v[0]*s1 + v[2]*s2;
2935:     x[1+idt] = v[1]*s1 + v[3]*s2;
2936:   }

2938:   VecRestoreArray(bb,&b);
2939:   VecRestoreArray(xx,&x);
2940:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2941:   return(0);
2942: }

2946: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2947: {
2948:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2949:   IS             iscol=a->col,isrow=a->row;
2951:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2952:   PetscInt       *diag = a->diag;
2953:   MatScalar      *aa=a->a,*v;
2954:   PetscScalar    *x,*b,s1,*t;

2957:   if (!n) return(0);

2959:   VecGetArray(bb,&b);
2960:   VecGetArray(xx,&x);
2961:   t  = a->solve_work;

2963:   ISGetIndices(isrow,&rout); r = rout;
2964:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2966:   /* forward solve the lower triangular */
2967:   t[0] = b[*r++];
2968:   for (i=1; i<n; i++) {
2969:     v     = aa + ai[i];
2970:     vi    = aj + ai[i];
2971:     nz    = diag[i] - ai[i];
2972:     s1  = b[*r++];
2973:     while (nz--) {
2974:       s1 -= (*v++)*t[*vi++];
2975:     }
2976:     t[i] = s1;
2977:   }
2978:   /* backward solve the upper triangular */
2979:   for (i=n-1; i>=0; i--){
2980:     v    = aa + diag[i] + 1;
2981:     vi   = aj + diag[i] + 1;
2982:     nz   = ai[i+1] - diag[i] - 1;
2983:     s1 = t[i];
2984:     while (nz--) {
2985:       s1 -= (*v++)*t[*vi++];
2986:     }
2987:     x[*c--] = t[i] = aa[diag[i]]*s1;
2988:   }

2990:   ISRestoreIndices(isrow,&rout);
2991:   ISRestoreIndices(iscol,&cout);
2992:   VecRestoreArray(bb,&b);
2993:   VecRestoreArray(xx,&x);
2994:   PetscLogFlops(2*1*(a->nz) - A->n);
2995:   return(0);
2996: }
2997: /*
2998:       Special case where the matrix was ILU(0) factored in the natural
2999:    ordering. This eliminates the need for the column and row permutation.
3000: */
3003: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3004: {
3005:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3006:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3008:   PetscInt       *diag = a->diag;
3009:   MatScalar      *aa=a->a;
3010:   PetscScalar    *x,*b;
3011:   PetscScalar    s1,x1;
3012:   MatScalar      *v;
3013:   PetscInt       jdx,idt,idx,nz,*vi,i;

3016:   VecGetArray(bb,&b);
3017:   VecGetArray(xx,&x);

3019:   /* forward solve the lower triangular */
3020:   idx    = 0;
3021:   x[0]   = b[0];
3022:   for (i=1; i<n; i++) {
3023:     v     =  aa      + ai[i];
3024:     vi    =  aj      + ai[i];
3025:     nz    =  diag[i] - ai[i];
3026:     idx   +=  1;
3027:     s1  =  b[idx];
3028:     while (nz--) {
3029:       jdx   = *vi++;
3030:       x1    = x[jdx];
3031:       s1 -= v[0]*x1;
3032:       v    += 1;
3033:     }
3034:     x[idx]   = s1;
3035:   }
3036:   /* backward solve the upper triangular */
3037:   for (i=n-1; i>=0; i--){
3038:     v    = aa + diag[i] + 1;
3039:     vi   = aj + diag[i] + 1;
3040:     nz   = ai[i+1] - diag[i] - 1;
3041:     idt  = i;
3042:     s1 = x[idt];
3043:     while (nz--) {
3044:       idx   = *vi++;
3045:       x1    = x[idx];
3046:       s1 -= v[0]*x1;
3047:       v    += 1;
3048:     }
3049:     v        = aa +  diag[i];
3050:     x[idt]   = v[0]*s1;
3051:   }
3052:   VecRestoreArray(bb,&b);
3053:   VecRestoreArray(xx,&x);
3054:   PetscLogFlops(2*(a->nz) - A->n);
3055:   return(0);
3056: }

3058: /* ----------------------------------------------------------------*/
3059: /*
3060:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3061:    except that the data structure of Mat_SeqAIJ is slightly different.
3062:    Not a good example of code reuse.
3063: */
3064: EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat);

3068: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3069: {
3070:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3071:   IS             isicol;
3073:   PetscInt       *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3074:   PetscInt       *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3075:   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3076:   PetscInt       incrlev,nnz,i,bs = A->bs,bs2 = a->bs2,levels,diagonal_fill;
3077:   PetscTruth     col_identity,row_identity;
3078:   PetscReal      f;

3081:   f             = info->fill;
3082:   levels        = (PetscInt)info->levels;
3083:   diagonal_fill = (PetscInt)info->diagonal_fill;
3084:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3085:   ISIdentity(isrow,&row_identity);
3086:   ISIdentity(iscol,&col_identity);

3088:   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3089:     MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3090:     (*fact)->factor = FACTOR_LU;
3091:     b               = (Mat_SeqBAIJ*)(*fact)->data;
3092:     if (!b->diag) {
3093:       MatMarkDiagonal_SeqBAIJ(*fact);
3094:     }
3095:     MatMissingDiagonal_SeqBAIJ(*fact);
3096:     b->row        = isrow;
3097:     b->col        = iscol;
3098:     PetscObjectReference((PetscObject)isrow);
3099:     PetscObjectReference((PetscObject)iscol);
3100:     b->icol       = isicol;
3101:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3102:     PetscMalloc(((*fact)->m+1+(*fact)->bs)*sizeof(PetscScalar),&b->solve_work);
3103:   } else { /* general case perform the symbolic factorization */
3104:     ISGetIndices(isrow,&r);
3105:     ISGetIndices(isicol,&ic);

3107:     /* get new row pointers */
3108:     PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
3109:     ainew[0] = 0;
3110:     /* don't know how many column pointers are needed so estimate */
3111:     jmax = (PetscInt)(f*ai[n] + 1);
3112:     PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
3113:     /* ajfill is level of fill for each fill entry */
3114:     PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
3115:     /* fill is a linked list of nonzeros in active row */
3116:     PetscMalloc((n+1)*sizeof(PetscInt),&fill);
3117:     /* im is level for each filled value */
3118:     PetscMalloc((n+1)*sizeof(PetscInt),&im);
3119:     /* dloc is location of diagonal in factor */
3120:     PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
3121:     dloc[0]  = 0;
3122:     for (prow=0; prow<n; prow++) {

3124:       /* copy prow into linked list */
3125:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3126:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3127:       xi         = aj + ai[r[prow]];
3128:       fill[n]    = n;
3129:       fill[prow] = -1; /* marker for diagonal entry */
3130:       while (nz--) {
3131:         fm  = n;
3132:         idx = ic[*xi++];
3133:         do {
3134:           m  = fm;
3135:           fm = fill[m];
3136:         } while (fm < idx);
3137:         fill[m]   = idx;
3138:         fill[idx] = fm;
3139:         im[idx]   = 0;
3140:       }

3142:       /* make sure diagonal entry is included */
3143:       if (diagonal_fill && fill[prow] == -1) {
3144:         fm = n;
3145:         while (fill[fm] < prow) fm = fill[fm];
3146:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
3147:         fill[fm]   = prow;
3148:         im[prow]   = 0;
3149:         nzf++;
3150:         dcount++;
3151:       }

3153:       nzi = 0;
3154:       row = fill[n];
3155:       while (row < prow) {
3156:         incrlev = im[row] + 1;
3157:         nz      = dloc[row];
3158:         xi      = ajnew  + ainew[row] + nz + 1;
3159:         flev    = ajfill + ainew[row] + nz + 1;
3160:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
3161:         fm      = row;
3162:         while (nnz-- > 0) {
3163:           idx = *xi++;
3164:           if (*flev + incrlev > levels) {
3165:             flev++;
3166:             continue;
3167:           }
3168:           do {
3169:             m  = fm;
3170:             fm = fill[m];
3171:           } while (fm < idx);
3172:           if (fm != idx) {
3173:             im[idx]   = *flev + incrlev;
3174:             fill[m]   = idx;
3175:             fill[idx] = fm;
3176:             fm        = idx;
3177:             nzf++;
3178:           } else {
3179:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3180:           }
3181:           flev++;
3182:         }
3183:         row = fill[row];
3184:         nzi++;
3185:       }
3186:       /* copy new filled row into permanent storage */
3187:       ainew[prow+1] = ainew[prow] + nzf;
3188:       if (ainew[prow+1] > jmax) {

3190:         /* estimate how much additional space we will need */
3191:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3192:         /* just double the memory each time */
3193:         PetscInt maxadd = jmax;
3194:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3195:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3196:         jmax += maxadd;

3198:         /* allocate a longer ajnew and ajfill */
3199:         PetscMalloc(jmax*sizeof(PetscInt),&xi);
3200:         PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));
3201:         PetscFree(ajnew);
3202:         ajnew = xi;
3203:         PetscMalloc(jmax*sizeof(PetscInt),&xi);
3204:         PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));
3205:         PetscFree(ajfill);
3206:         ajfill = xi;
3207:         reallocate++; /* count how many reallocations are needed */
3208:       }
3209:       xi          = ajnew + ainew[prow];
3210:       flev        = ajfill + ainew[prow];
3211:       dloc[prow]  = nzi;
3212:       fm          = fill[n];
3213:       while (nzf--) {
3214:         *xi++   = fm;
3215:         *flev++ = im[fm];
3216:         fm      = fill[fm];
3217:       }
3218:       /* make sure row has diagonal entry */
3219:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3220:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3221:     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3222:       }
3223:     }
3224:     PetscFree(ajfill);
3225:     ISRestoreIndices(isrow,&r);
3226:     ISRestoreIndices(isicol,&ic);
3227:     PetscFree(fill);
3228:     PetscFree(im);

3230:     {
3231:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3232:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocate,f,af);
3233:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3234:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3235:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3236:       if (diagonal_fill) {
3237:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %D missing diagonals",dcount);
3238:       }
3239:     }

3241:     /* put together the new matrix */
3242:     MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);
3243:     MatSetType(*fact,A->type_name);
3244:     MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);
3245:     PetscLogObjectParent(*fact,isicol);
3246:     b = (Mat_SeqBAIJ*)(*fact)->data;
3247:     PetscFree(b->imax);
3248:     b->singlemalloc = PETSC_FALSE;
3249:     len = bs2*ainew[n]*sizeof(MatScalar);
3250:     /* the next line frees the default space generated by the Create() */
3251:     PetscFree(b->a);
3252:     PetscFree(b->ilen);
3253:     PetscMalloc(len,&b->a);
3254:     b->j          = ajnew;
3255:     b->i          = ainew;
3256:     for (i=0; i<n; i++) dloc[i] += ainew[i];
3257:     b->diag       = dloc;
3258:     b->ilen       = 0;
3259:     b->imax       = 0;
3260:     b->row        = isrow;
3261:     b->col        = iscol;
3262:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3263:     PetscObjectReference((PetscObject)isrow);
3264:     PetscObjectReference((PetscObject)iscol);
3265:     b->icol       = isicol;
3266:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3267:     /* In b structure:  Free imax, ilen, old a, old j.  
3268:        Allocate dloc, solve_work, new a, new j */
3269:     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
3270:     b->maxnz          = b->nz = ainew[n];
3271:     (*fact)->factor   = FACTOR_LU;

3273:     (*fact)->info.factor_mallocs    = reallocate;
3274:     (*fact)->info.fill_ratio_given  = f;
3275:     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3276:   }

3278:   if (row_identity && col_identity) {
3279:     MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3280:   }
3281:   return(0);
3282: }

3286: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3287: {
3288:   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3289:   /* int i,*AJ=a->j,nz=a->nz; */
3291:   /* Undo Column scaling */
3292: /*    while (nz--) { */
3293: /*      AJ[i] = AJ[i]/4; */
3294: /*    } */
3295:   /* This should really invoke a push/pop logic, but we don't have that yet. */
3296:   A->ops->setunfactored = PETSC_NULL;
3297:   return(0);
3298: }

3302: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3303: {
3304:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3305:   PetscInt       *AJ=a->j,nz=a->nz;
3306:   unsigned short *aj=(unsigned short *)AJ;
3308:   /* Is this really necessary? */
3309:   while (nz--) {
3310:     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3311:   }
3312:   A->ops->setunfactored = PETSC_NULL;
3313:   return(0);
3314: }

3318: PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3319: {
3320:   /*
3321:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3322:       with natural ordering
3323:   */
3325:   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3326:   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3327:   switch (inA->bs) {
3328:   case 1:
3329:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3330:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3331:     break;
3332:   case 2:
3333:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3334:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3335:     break;
3336:   case 3:
3337:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3338:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3339:     break;
3340:   case 4:
3341: #if defined(PETSC_USE_MAT_SINGLE)
3342:     {
3343:       PetscTruth  sse_enabled_local;
3345:       PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3346:       if (sse_enabled_local) {
3347: #  if defined(PETSC_HAVE_SSE)
3348:         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3349:         if (n==(unsigned short)n) {
3350:           unsigned short *aj=(unsigned short *)AJ;
3351:           for (i=0;i<nz;i++) {
3352:             aj[i] = (unsigned short)AJ[i];
3353:           }
3354:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3355:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3356:           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3357:         } else {
3358:         /* Scale the column indices for easier indexing in MatSolve. */
3359: /*            for (i=0;i<nz;i++) { */
3360: /*              AJ[i] = AJ[i]*4; */
3361: /*            } */
3362:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3363:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3364:           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3365:         }
3366: #  else
3367:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3368:         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3369: #  endif
3370:       } else {
3371:         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3372:         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3373:       }
3374:     }
3375: #else
3376:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3377:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3378: #endif
3379:     break;
3380:   case 5:
3381:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3382:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3383:     break;
3384:   case 6:
3385:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3386:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3387:     break;
3388:   case 7:
3389:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3390:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3391:     break;
3392:   }
3393:   return(0);
3394: }

3398: PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3399: {
3400:   /*
3401:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3402:       with natural ordering
3403:   */
3404:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
3405:   IS             row = a->row, col = a->col;
3406:   PetscTruth     row_identity, col_identity;
3407:   PetscTruth     use_natural;


3412:   use_natural = PETSC_FALSE;
3413:   if (row && col) {
3414:     ISIdentity(row,&row_identity);
3415:     ISIdentity(col,&col_identity);

3417:     if (row_identity && col_identity) {
3418:       use_natural = PETSC_TRUE;
3419:     }
3420:   } else {
3421:     use_natural = PETSC_TRUE;
3422:   }

3424:   switch (A->bs) {
3425:   case 1:
3426:     if (use_natural) {
3427:       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3428:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3429:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3430:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3431:     } else {
3432:       A->ops->solve           = MatSolve_SeqBAIJ_1;
3433:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3434:     }
3435:     break;
3436:   case 2:
3437:     if (use_natural) {
3438:       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3439:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3440:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3441:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3442:     } else {
3443:       A->ops->solve           = MatSolve_SeqBAIJ_2;
3444:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3445:     }
3446:     break;
3447:   case 3:
3448:     if (use_natural) {
3449:       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3450:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3451:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3452:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3453:     } else {
3454:       A->ops->solve           = MatSolve_SeqBAIJ_3;
3455:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3456:     }
3457:     break;
3458:   case 4:
3459:     {
3460:       PetscTruth sse_enabled_local;
3461:       PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3462:       if (use_natural) {
3463: #if defined(PETSC_USE_MAT_SINGLE)
3464:         if (sse_enabled_local) { /* Natural + Single + SSE */
3465: #  if defined(PETSC_HAVE_SSE)
3466:           PetscInt n=a->mbs;
3467:           if (n==(unsigned short)n) {
3468:             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3469:             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3470:           } else {
3471:             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3472:             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3473:           }
3474: #  else
3475:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3476:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3477: #  endif
3478:         } else { /* Natural + Single */
3479:           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3480:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3481:         }
3482: #else
3483:         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3484:         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3485: #endif
3486:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3487:         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3488:       } else { /* Arbitrary ordering */
3489: #if defined(PETSC_USE_MAT_SINGLE)
3490:         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3491: #  if defined(PETSC_HAVE_SSE)
3492:           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3493:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3494: #  else
3495:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3496:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3497: #  endif
3498:         } else { /* Arbitrary + Single */
3499:           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3500:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3501:         }
3502: #else
3503:         A->ops->solve           = MatSolve_SeqBAIJ_4;
3504: #endif
3505:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3506:       }
3507:     }
3508:     break;
3509:   case 5:
3510:     if (use_natural) {
3511:       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3512:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3513:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3514:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3515:     } else {
3516:       A->ops->solve           = MatSolve_SeqBAIJ_5;
3517:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3518:     }
3519:     break;
3520:   case 6:
3521:     if (use_natural) {
3522:       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3523:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3524:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3525:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3526:     } else {
3527:       A->ops->solve           = MatSolve_SeqBAIJ_6;
3528:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3529:     }
3530:     break;
3531:   case 7:
3532:     if (use_natural) {
3533:       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3534:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3535:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3536:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3537:     } else {
3538:       A->ops->solve           = MatSolve_SeqBAIJ_7;
3539:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3540:     }
3541:     break;
3542:   default:
3543:     A->ops->solve             = MatSolve_SeqBAIJ_N;
3544:     break;
3545:   }
3546:   return(0);
3547: }

3551: PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3552: {

3556:   MatSeqBAIJ_UpdateSolvers(A);
3557:   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3558:     (*A->ops->solve)(A,x,y);
3559:   } else {
3560:     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3561:   }
3562:   return(0);
3563: }

3567: PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3568: {

3572:   MatSeqBAIJ_UpdateSolvers(A);
3573:   (*A->ops->solvetranspose)(A,x,y);
3574:   return(0);
3575: }