Actual source code: baijfact2.c

  1: /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
  2: /*
  3:     Factorization code for BAIJ format. 
  4: */

 6:  #include src/mat/impls/baij/seq/baij.h
 7:  #include src/vec/vecimpl.h
 8:  #include src/inline/ilu.h
 9:  #include src/inline/dot.h

 13: int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 14: {
 15:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 16:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 17:   int             *diag = a->diag;
 18:   MatScalar       *aa=a->a,*v;
 19:   PetscScalar     s1,*x,*b;

 22:   VecCopy(bb,xx);
 23:   VecGetArray(bb,&b);
 24:   VecGetArray(xx,&x);
 25: 
 26:   /* forward solve the U^T */
 27:   for (i=0; i<n; i++) {

 29:     v     = aa + diag[i];
 30:     /* multiply by the inverse of the block diagonal */
 31:     s1    = (*v++)*x[i];
 32:     vi    = aj + diag[i] + 1;
 33:     nz    = ai[i+1] - diag[i] - 1;
 34:     while (nz--) {
 35:       x[*vi++]  -= (*v++)*s1;
 36:     }
 37:     x[i]   = s1;
 38:   }
 39:   /* backward solve the L^T */
 40:   for (i=n-1; i>=0; i--){
 41:     v    = aa + diag[i] - 1;
 42:     vi   = aj + diag[i] - 1;
 43:     nz   = diag[i] - ai[i];
 44:     s1   = x[i];
 45:     while (nz--) {
 46:       x[*vi--]   -=  (*v--)*s1;
 47:     }
 48:   }
 49:   VecRestoreArray(bb,&b);
 50:   VecRestoreArray(xx,&x);
 51:   PetscLogFlops(2*(a->nz) - A->n);
 52:   return(0);
 53: }

 57: int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 58: {
 59:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
 60:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 61:   int             *diag = a->diag,oidx;
 62:   MatScalar       *aa=a->a,*v;
 63:   PetscScalar     s1,s2,x1,x2;
 64:   PetscScalar     *x,*b;

 67:   VecCopy(bb,xx);
 68:   VecGetArray(bb,&b);
 69:   VecGetArray(xx,&x);

 71:   /* forward solve the U^T */
 72:   idx = 0;
 73:   for (i=0; i<n; i++) {

 75:     v     = aa + 4*diag[i];
 76:     /* multiply by the inverse of the block diagonal */
 77:     x1 = x[idx];   x2 = x[1+idx];
 78:     s1 = v[0]*x1  +  v[1]*x2;
 79:     s2 = v[2]*x1  +  v[3]*x2;
 80:     v += 4;

 82:     vi    = aj + diag[i] + 1;
 83:     nz    = ai[i+1] - diag[i] - 1;
 84:     while (nz--) {
 85:       oidx = 2*(*vi++);
 86:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 87:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 88:       v  += 4;
 89:     }
 90:     x[idx]   = s1;x[1+idx] = s2;
 91:     idx += 2;
 92:   }
 93:   /* backward solve the L^T */
 94:   for (i=n-1; i>=0; i--){
 95:     v    = aa + 4*diag[i] - 4;
 96:     vi   = aj + diag[i] - 1;
 97:     nz   = diag[i] - ai[i];
 98:     idt  = 2*i;
 99:     s1   = x[idt];  s2 = x[1+idt];
100:     while (nz--) {
101:       idx   = 2*(*vi--);
102:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104:       v -= 4;
105:     }
106:   }
107:   VecRestoreArray(bb,&b);
108:   VecRestoreArray(xx,&x);
109:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110:   return(0);
111: }

115: int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116: {
117:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119:   int             *diag = a->diag,oidx;
120:   MatScalar       *aa=a->a,*v;
121:   PetscScalar     s1,s2,s3,x1,x2,x3;
122:   PetscScalar     *x,*b;

125:   VecCopy(bb,xx);
126:   VecGetArray(bb,&b);
127:   VecGetArray(xx,&x);

129:   /* forward solve the U^T */
130:   idx = 0;
131:   for (i=0; i<n; i++) {

133:     v     = aa + 9*diag[i];
134:     /* multiply by the inverse of the block diagonal */
135:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
136:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
137:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
138:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
139:     v += 9;

141:     vi    = aj + diag[i] + 1;
142:     nz    = ai[i+1] - diag[i] - 1;
143:     while (nz--) {
144:       oidx = 3*(*vi++);
145:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
146:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
147:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148:       v  += 9;
149:     }
150:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
151:     idx += 3;
152:   }
153:   /* backward solve the L^T */
154:   for (i=n-1; i>=0; i--){
155:     v    = aa + 9*diag[i] - 9;
156:     vi   = aj + diag[i] - 1;
157:     nz   = diag[i] - ai[i];
158:     idt  = 3*i;
159:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
160:     while (nz--) {
161:       idx   = 3*(*vi--);
162:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
163:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
164:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165:       v -= 9;
166:     }
167:   }
168:   VecRestoreArray(bb,&b);
169:   VecRestoreArray(xx,&x);
170:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
171:   return(0);
172: }

176: int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177: {
178:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
179:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180:   int             *diag = a->diag,oidx;
181:   MatScalar       *aa=a->a,*v;
182:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
183:   PetscScalar     *x,*b;

186:   VecCopy(bb,xx);
187:   VecGetArray(bb,&b);
188:   VecGetArray(xx,&x);

190:   /* forward solve the U^T */
191:   idx = 0;
192:   for (i=0; i<n; i++) {

194:     v     = aa + 16*diag[i];
195:     /* multiply by the inverse of the block diagonal */
196:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
197:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
198:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
199:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
200:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201:     v += 16;

203:     vi    = aj + diag[i] + 1;
204:     nz    = ai[i+1] - diag[i] - 1;
205:     while (nz--) {
206:       oidx = 4*(*vi++);
207:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
208:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
209:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211:       v  += 16;
212:     }
213:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214:     idx += 4;
215:   }
216:   /* backward solve the L^T */
217:   for (i=n-1; i>=0; i--){
218:     v    = aa + 16*diag[i] - 16;
219:     vi   = aj + diag[i] - 1;
220:     nz   = diag[i] - ai[i];
221:     idt  = 4*i;
222:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223:     while (nz--) {
224:       idx   = 4*(*vi--);
225:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
226:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
227:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229:       v -= 16;
230:     }
231:   }
232:   VecRestoreArray(bb,&b);
233:   VecRestoreArray(xx,&x);
234:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
235:   return(0);
236: }

240: int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241: {
242:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
243:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244:   int             *diag = a->diag,oidx;
245:   MatScalar       *aa=a->a,*v;
246:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
247:   PetscScalar     *x,*b;

250:   VecCopy(bb,xx);
251:   VecGetArray(bb,&b);
252:   VecGetArray(xx,&x);

254:   /* forward solve the U^T */
255:   idx = 0;
256:   for (i=0; i<n; i++) {

258:     v     = aa + 25*diag[i];
259:     /* multiply by the inverse of the block diagonal */
260:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
262:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
263:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266:     v += 25;

268:     vi    = aj + diag[i] + 1;
269:     nz    = ai[i+1] - diag[i] - 1;
270:     while (nz--) {
271:       oidx = 5*(*vi++);
272:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
273:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
274:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277:       v  += 25;
278:     }
279:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280:     idx += 5;
281:   }
282:   /* backward solve the L^T */
283:   for (i=n-1; i>=0; i--){
284:     v    = aa + 25*diag[i] - 25;
285:     vi   = aj + diag[i] - 1;
286:     nz   = diag[i] - ai[i];
287:     idt  = 5*i;
288:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289:     while (nz--) {
290:       idx   = 5*(*vi--);
291:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
292:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
293:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296:       v -= 25;
297:     }
298:   }
299:   VecRestoreArray(bb,&b);
300:   VecRestoreArray(xx,&x);
301:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
302:   return(0);
303: }

307: int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308: {
309:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
310:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311:   int             *diag = a->diag,oidx;
312:   MatScalar       *aa=a->a,*v;
313:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
314:   PetscScalar     *x,*b;

317:   VecCopy(bb,xx);
318:   VecGetArray(bb,&b);
319:   VecGetArray(xx,&x);

321:   /* forward solve the U^T */
322:   idx = 0;
323:   for (i=0; i<n; i++) {

325:     v     = aa + 36*diag[i];
326:     /* multiply by the inverse of the block diagonal */
327:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328:     x6    = x[5+idx];
329:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
330:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
331:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335:     v += 36;

337:     vi    = aj + diag[i] + 1;
338:     nz    = ai[i+1] - diag[i] - 1;
339:     while (nz--) {
340:       oidx = 6*(*vi++);
341:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
342:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
343:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347:       v  += 36;
348:     }
349:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350:     x[5+idx] = s6;
351:     idx += 6;
352:   }
353:   /* backward solve the L^T */
354:   for (i=n-1; i>=0; i--){
355:     v    = aa + 36*diag[i] - 36;
356:     vi   = aj + diag[i] - 1;
357:     nz   = diag[i] - ai[i];
358:     idt  = 6*i;
359:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360:     s6 = x[5+idt];
361:     while (nz--) {
362:       idx   = 6*(*vi--);
363:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
364:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
365:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369:       v -= 36;
370:     }
371:   }
372:   VecRestoreArray(bb,&b);
373:   VecRestoreArray(xx,&x);
374:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
375:   return(0);
376: }

380: int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381: {
382:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
383:   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384:   int             *diag = a->diag,oidx;
385:   MatScalar       *aa=a->a,*v;
386:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
387:   PetscScalar     *x,*b;

390:   VecCopy(bb,xx);
391:   VecGetArray(bb,&b);
392:   VecGetArray(xx,&x);

394:   /* forward solve the U^T */
395:   idx = 0;
396:   for (i=0; i<n; i++) {

398:     v     = aa + 49*diag[i];
399:     /* multiply by the inverse of the block diagonal */
400:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401:     x6    = x[5+idx]; x7 = x[6+idx];
402:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
403:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409:     v += 49;

411:     vi    = aj + diag[i] + 1;
412:     nz    = ai[i+1] - diag[i] - 1;
413:     while (nz--) {
414:       oidx = 7*(*vi++);
415:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
416:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422:       v  += 49;
423:     }
424:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425:     x[5+idx] = s6;x[6+idx] = s7;
426:     idx += 7;
427:   }
428:   /* backward solve the L^T */
429:   for (i=n-1; i>=0; i--){
430:     v    = aa + 49*diag[i] - 49;
431:     vi   = aj + diag[i] - 1;
432:     nz   = diag[i] - ai[i];
433:     idt  = 7*i;
434:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435:     s6 = x[5+idt];s7 = x[6+idt];
436:     while (nz--) {
437:       idx   = 7*(*vi--);
438:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
439:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445:       v -= 49;
446:     }
447:   }
448:   VecRestoreArray(bb,&b);
449:   VecRestoreArray(xx,&x);
450:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
451:   return(0);
452: }

454: /*---------------------------------------------------------------------------------------------*/
457: int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458: {
459:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
460:   IS              iscol=a->col,isrow=a->row;
461:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462:   int             *diag = a->diag;
463:   MatScalar       *aa=a->a,*v;
464:   PetscScalar     s1,*x,*b,*t;

467:   VecGetArray(bb,&b);
468:   VecGetArray(xx,&x);
469:   t  = a->solve_work;

471:   ISGetIndices(isrow,&rout); r = rout;
472:   ISGetIndices(iscol,&cout); c = cout;

474:   /* copy the b into temp work space according to permutation */
475:   for (i=0; i<n; i++) {
476:     t[i] = b[c[i]];
477:   }

479:   /* forward solve the U^T */
480:   for (i=0; i<n; i++) {

482:     v     = aa + diag[i];
483:     /* multiply by the inverse of the block diagonal */
484:     s1    = (*v++)*t[i];
485:     vi    = aj + diag[i] + 1;
486:     nz    = ai[i+1] - diag[i] - 1;
487:     while (nz--) {
488:       t[*vi++]  -= (*v++)*s1;
489:     }
490:     t[i]   = s1;
491:   }
492:   /* backward solve the L^T */
493:   for (i=n-1; i>=0; i--){
494:     v    = aa + diag[i] - 1;
495:     vi   = aj + diag[i] - 1;
496:     nz   = diag[i] - ai[i];
497:     s1   = t[i];
498:     while (nz--) {
499:       t[*vi--]   -=  (*v--)*s1;
500:     }
501:   }

503:   /* copy t into x according to permutation */
504:   for (i=0; i<n; i++) {
505:     x[r[i]]   = t[i];
506:   }

508:   ISRestoreIndices(isrow,&rout);
509:   ISRestoreIndices(iscol,&cout);
510:   VecRestoreArray(bb,&b);
511:   VecRestoreArray(xx,&x);
512:   PetscLogFlops(2*(a->nz) - A->n);
513:   return(0);
514: }

518: int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519: {
520:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
521:   IS              iscol=a->col,isrow=a->row;
522:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523:   int             *diag = a->diag,ii,ic,ir,oidx;
524:   MatScalar       *aa=a->a,*v;
525:   PetscScalar     s1,s2,x1,x2;
526:   PetscScalar     *x,*b,*t;

529:   VecGetArray(bb,&b);
530:   VecGetArray(xx,&x);
531:   t  = a->solve_work;

533:   ISGetIndices(isrow,&rout); r = rout;
534:   ISGetIndices(iscol,&cout); c = cout;

536:   /* copy the b into temp work space according to permutation */
537:   ii = 0;
538:   for (i=0; i<n; i++) {
539:     ic      = 2*c[i];
540:     t[ii]   = b[ic];
541:     t[ii+1] = b[ic+1];
542:     ii += 2;
543:   }

545:   /* forward solve the U^T */
546:   idx = 0;
547:   for (i=0; i<n; i++) {

549:     v     = aa + 4*diag[i];
550:     /* multiply by the inverse of the block diagonal */
551:     x1    = t[idx];   x2 = t[1+idx];
552:     s1 = v[0]*x1  +  v[1]*x2;
553:     s2 = v[2]*x1  +  v[3]*x2;
554:     v += 4;

556:     vi    = aj + diag[i] + 1;
557:     nz    = ai[i+1] - diag[i] - 1;
558:     while (nz--) {
559:       oidx = 2*(*vi++);
560:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
561:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
562:       v  += 4;
563:     }
564:     t[idx]   = s1;t[1+idx] = s2;
565:     idx += 2;
566:   }
567:   /* backward solve the L^T */
568:   for (i=n-1; i>=0; i--){
569:     v    = aa + 4*diag[i] - 4;
570:     vi   = aj + diag[i] - 1;
571:     nz   = diag[i] - ai[i];
572:     idt  = 2*i;
573:     s1 = t[idt];  s2 = t[1+idt];
574:     while (nz--) {
575:       idx   = 2*(*vi--);
576:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
577:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
578:       v -= 4;
579:     }
580:   }

582:   /* copy t into x according to permutation */
583:   ii = 0;
584:   for (i=0; i<n; i++) {
585:     ir      = 2*r[i];
586:     x[ir]   = t[ii];
587:     x[ir+1] = t[ii+1];
588:     ii += 2;
589:   }

591:   ISRestoreIndices(isrow,&rout);
592:   ISRestoreIndices(iscol,&cout);
593:   VecRestoreArray(bb,&b);
594:   VecRestoreArray(xx,&x);
595:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
596:   return(0);
597: }

601: int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602: {
603:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
604:   IS              iscol=a->col,isrow=a->row;
605:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606:   int             *diag = a->diag,ii,ic,ir,oidx;
607:   MatScalar       *aa=a->a,*v;
608:   PetscScalar     s1,s2,s3,x1,x2,x3;
609:   PetscScalar     *x,*b,*t;

612:   VecGetArray(bb,&b);
613:   VecGetArray(xx,&x);
614:   t  = a->solve_work;

616:   ISGetIndices(isrow,&rout); r = rout;
617:   ISGetIndices(iscol,&cout); c = cout;

619:   /* copy the b into temp work space according to permutation */
620:   ii = 0;
621:   for (i=0; i<n; i++) {
622:     ic      = 3*c[i];
623:     t[ii]   = b[ic];
624:     t[ii+1] = b[ic+1];
625:     t[ii+2] = b[ic+2];
626:     ii += 3;
627:   }

629:   /* forward solve the U^T */
630:   idx = 0;
631:   for (i=0; i<n; i++) {

633:     v     = aa + 9*diag[i];
634:     /* multiply by the inverse of the block diagonal */
635:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
636:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
637:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
638:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
639:     v += 9;

641:     vi    = aj + diag[i] + 1;
642:     nz    = ai[i+1] - diag[i] - 1;
643:     while (nz--) {
644:       oidx = 3*(*vi++);
645:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
646:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
647:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648:       v  += 9;
649:     }
650:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
651:     idx += 3;
652:   }
653:   /* backward solve the L^T */
654:   for (i=n-1; i>=0; i--){
655:     v    = aa + 9*diag[i] - 9;
656:     vi   = aj + diag[i] - 1;
657:     nz   = diag[i] - ai[i];
658:     idt  = 3*i;
659:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
660:     while (nz--) {
661:       idx   = 3*(*vi--);
662:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
663:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
664:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665:       v -= 9;
666:     }
667:   }

669:   /* copy t into x according to permutation */
670:   ii = 0;
671:   for (i=0; i<n; i++) {
672:     ir      = 3*r[i];
673:     x[ir]   = t[ii];
674:     x[ir+1] = t[ii+1];
675:     x[ir+2] = t[ii+2];
676:     ii += 3;
677:   }

679:   ISRestoreIndices(isrow,&rout);
680:   ISRestoreIndices(iscol,&cout);
681:   VecRestoreArray(bb,&b);
682:   VecRestoreArray(xx,&x);
683:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
684:   return(0);
685: }

689: int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690: {
691:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
692:   IS              iscol=a->col,isrow=a->row;
693:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694:   int             *diag = a->diag,ii,ic,ir,oidx;
695:   MatScalar       *aa=a->a,*v;
696:   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
697:   PetscScalar     *x,*b,*t;

700:   VecGetArray(bb,&b);
701:   VecGetArray(xx,&x);
702:   t  = a->solve_work;

704:   ISGetIndices(isrow,&rout); r = rout;
705:   ISGetIndices(iscol,&cout); c = cout;

707:   /* copy the b into temp work space according to permutation */
708:   ii = 0;
709:   for (i=0; i<n; i++) {
710:     ic      = 4*c[i];
711:     t[ii]   = b[ic];
712:     t[ii+1] = b[ic+1];
713:     t[ii+2] = b[ic+2];
714:     t[ii+3] = b[ic+3];
715:     ii += 4;
716:   }

718:   /* forward solve the U^T */
719:   idx = 0;
720:   for (i=0; i<n; i++) {

722:     v     = aa + 16*diag[i];
723:     /* multiply by the inverse of the block diagonal */
724:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
725:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
726:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
727:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
728:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729:     v += 16;

731:     vi    = aj + diag[i] + 1;
732:     nz    = ai[i+1] - diag[i] - 1;
733:     while (nz--) {
734:       oidx = 4*(*vi++);
735:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
736:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
737:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739:       v  += 16;
740:     }
741:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742:     idx += 4;
743:   }
744:   /* backward solve the L^T */
745:   for (i=n-1; i>=0; i--){
746:     v    = aa + 16*diag[i] - 16;
747:     vi   = aj + diag[i] - 1;
748:     nz   = diag[i] - ai[i];
749:     idt  = 4*i;
750:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751:     while (nz--) {
752:       idx   = 4*(*vi--);
753:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
754:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
755:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757:       v -= 16;
758:     }
759:   }

761:   /* copy t into x according to permutation */
762:   ii = 0;
763:   for (i=0; i<n; i++) {
764:     ir      = 4*r[i];
765:     x[ir]   = t[ii];
766:     x[ir+1] = t[ii+1];
767:     x[ir+2] = t[ii+2];
768:     x[ir+3] = t[ii+3];
769:     ii += 4;
770:   }

772:   ISRestoreIndices(isrow,&rout);
773:   ISRestoreIndices(iscol,&cout);
774:   VecRestoreArray(bb,&b);
775:   VecRestoreArray(xx,&x);
776:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
777:   return(0);
778: }

782: int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783: {
784:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
785:   IS              iscol=a->col,isrow=a->row;
786:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787:   int             *diag = a->diag,ii,ic,ir,oidx;
788:   MatScalar       *aa=a->a,*v;
789:   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
790:   PetscScalar     *x,*b,*t;

793:   VecGetArray(bb,&b);
794:   VecGetArray(xx,&x);
795:   t  = a->solve_work;

797:   ISGetIndices(isrow,&rout); r = rout;
798:   ISGetIndices(iscol,&cout); c = cout;

800:   /* copy the b into temp work space according to permutation */
801:   ii = 0;
802:   for (i=0; i<n; i++) {
803:     ic      = 5*c[i];
804:     t[ii]   = b[ic];
805:     t[ii+1] = b[ic+1];
806:     t[ii+2] = b[ic+2];
807:     t[ii+3] = b[ic+3];
808:     t[ii+4] = b[ic+4];
809:     ii += 5;
810:   }

812:   /* forward solve the U^T */
813:   idx = 0;
814:   for (i=0; i<n; i++) {

816:     v     = aa + 25*diag[i];
817:     /* multiply by the inverse of the block diagonal */
818:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
820:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
821:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824:     v += 25;

826:     vi    = aj + diag[i] + 1;
827:     nz    = ai[i+1] - diag[i] - 1;
828:     while (nz--) {
829:       oidx = 5*(*vi++);
830:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
831:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
832:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835:       v  += 25;
836:     }
837:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838:     idx += 5;
839:   }
840:   /* backward solve the L^T */
841:   for (i=n-1; i>=0; i--){
842:     v    = aa + 25*diag[i] - 25;
843:     vi   = aj + diag[i] - 1;
844:     nz   = diag[i] - ai[i];
845:     idt  = 5*i;
846:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847:     while (nz--) {
848:       idx   = 5*(*vi--);
849:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854:       v -= 25;
855:     }
856:   }

858:   /* copy t into x according to permutation */
859:   ii = 0;
860:   for (i=0; i<n; i++) {
861:     ir      = 5*r[i];
862:     x[ir]   = t[ii];
863:     x[ir+1] = t[ii+1];
864:     x[ir+2] = t[ii+2];
865:     x[ir+3] = t[ii+3];
866:     x[ir+4] = t[ii+4];
867:     ii += 5;
868:   }

870:   ISRestoreIndices(isrow,&rout);
871:   ISRestoreIndices(iscol,&cout);
872:   VecRestoreArray(bb,&b);
873:   VecRestoreArray(xx,&x);
874:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
875:   return(0);
876: }

880: int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881: {
882:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
883:   IS              iscol=a->col,isrow=a->row;
884:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885:   int             *diag = a->diag,ii,ic,ir,oidx;
886:   MatScalar       *aa=a->a,*v;
887:   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
888:   PetscScalar     *x,*b,*t;

891:   VecGetArray(bb,&b);
892:   VecGetArray(xx,&x);
893:   t  = a->solve_work;

895:   ISGetIndices(isrow,&rout); r = rout;
896:   ISGetIndices(iscol,&cout); c = cout;

898:   /* copy the b into temp work space according to permutation */
899:   ii = 0;
900:   for (i=0; i<n; i++) {
901:     ic      = 6*c[i];
902:     t[ii]   = b[ic];
903:     t[ii+1] = b[ic+1];
904:     t[ii+2] = b[ic+2];
905:     t[ii+3] = b[ic+3];
906:     t[ii+4] = b[ic+4];
907:     t[ii+5] = b[ic+5];
908:     ii += 6;
909:   }

911:   /* forward solve the U^T */
912:   idx = 0;
913:   for (i=0; i<n; i++) {

915:     v     = aa + 36*diag[i];
916:     /* multiply by the inverse of the block diagonal */
917:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918:     x6    = t[5+idx];
919:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
920:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
921:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925:     v += 36;

927:     vi    = aj + diag[i] + 1;
928:     nz    = ai[i+1] - diag[i] - 1;
929:     while (nz--) {
930:       oidx = 6*(*vi++);
931:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
932:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
933:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937:       v  += 36;
938:     }
939:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940:     t[5+idx] = s6;
941:     idx += 6;
942:   }
943:   /* backward solve the L^T */
944:   for (i=n-1; i>=0; i--){
945:     v    = aa + 36*diag[i] - 36;
946:     vi   = aj + diag[i] - 1;
947:     nz   = diag[i] - ai[i];
948:     idt  = 6*i;
949:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950:     s6 = t[5+idt];
951:     while (nz--) {
952:       idx   = 6*(*vi--);
953:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
954:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
955:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959:       v -= 36;
960:     }
961:   }

963:   /* copy t into x according to permutation */
964:   ii = 0;
965:   for (i=0; i<n; i++) {
966:     ir      = 6*r[i];
967:     x[ir]   = t[ii];
968:     x[ir+1] = t[ii+1];
969:     x[ir+2] = t[ii+2];
970:     x[ir+3] = t[ii+3];
971:     x[ir+4] = t[ii+4];
972:     x[ir+5] = t[ii+5];
973:     ii += 6;
974:   }

976:   ISRestoreIndices(isrow,&rout);
977:   ISRestoreIndices(iscol,&cout);
978:   VecRestoreArray(bb,&b);
979:   VecRestoreArray(xx,&x);
980:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
981:   return(0);
982: }

986: int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987: {
988:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
989:   IS              iscol=a->col,isrow=a->row;
990:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991:   int             *diag = a->diag,ii,ic,ir,oidx;
992:   MatScalar       *aa=a->a,*v;
993:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
994:   PetscScalar     *x,*b,*t;

997:   VecGetArray(bb,&b);
998:   VecGetArray(xx,&x);
999:   t  = a->solve_work;

1001:   ISGetIndices(isrow,&rout); r = rout;
1002:   ISGetIndices(iscol,&cout); c = cout;

1004:   /* copy the b into temp work space according to permutation */
1005:   ii = 0;
1006:   for (i=0; i<n; i++) {
1007:     ic      = 7*c[i];
1008:     t[ii]   = b[ic];
1009:     t[ii+1] = b[ic+1];
1010:     t[ii+2] = b[ic+2];
1011:     t[ii+3] = b[ic+3];
1012:     t[ii+4] = b[ic+4];
1013:     t[ii+5] = b[ic+5];
1014:     t[ii+6] = b[ic+6];
1015:     ii += 7;
1016:   }

1018:   /* forward solve the U^T */
1019:   idx = 0;
1020:   for (i=0; i<n; i++) {

1022:     v     = aa + 49*diag[i];
1023:     /* multiply by the inverse of the block diagonal */
1024:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025:     x6    = t[5+idx]; x7 = t[6+idx];
1026:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1027:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033:     v += 49;

1035:     vi    = aj + diag[i] + 1;
1036:     nz    = ai[i+1] - diag[i] - 1;
1037:     while (nz--) {
1038:       oidx = 7*(*vi++);
1039:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1040:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046:       v  += 49;
1047:     }
1048:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049:     t[5+idx] = s6;t[6+idx] = s7;
1050:     idx += 7;
1051:   }
1052:   /* backward solve the L^T */
1053:   for (i=n-1; i>=0; i--){
1054:     v    = aa + 49*diag[i] - 49;
1055:     vi   = aj + diag[i] - 1;
1056:     nz   = diag[i] - ai[i];
1057:     idt  = 7*i;
1058:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059:     s6 = t[5+idt];s7 = t[6+idt];
1060:     while (nz--) {
1061:       idx   = 7*(*vi--);
1062:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069:       v -= 49;
1070:     }
1071:   }

1073:   /* copy t into x according to permutation */
1074:   ii = 0;
1075:   for (i=0; i<n; i++) {
1076:     ir      = 7*r[i];
1077:     x[ir]   = t[ii];
1078:     x[ir+1] = t[ii+1];
1079:     x[ir+2] = t[ii+2];
1080:     x[ir+3] = t[ii+3];
1081:     x[ir+4] = t[ii+4];
1082:     x[ir+5] = t[ii+5];
1083:     x[ir+6] = t[ii+6];
1084:     ii += 7;
1085:   }

1087:   ISRestoreIndices(isrow,&rout);
1088:   ISRestoreIndices(iscol,&cout);
1089:   VecRestoreArray(bb,&b);
1090:   VecRestoreArray(xx,&x);
1091:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092:   return(0);
1093: }

1095: /* ----------------------------------------------------------- */
1098: int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1099: {
1100:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1101:   IS              iscol=a->col,isrow=a->row;
1102:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1103:   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
1104:   MatScalar       *aa=a->a,*v;
1105:   PetscScalar     *x,*b,*s,*t,*ls;

1108:   VecGetArray(bb,&b);
1109:   VecGetArray(xx,&x);
1110:   t  = a->solve_work;

1112:   ISGetIndices(isrow,&rout); r = rout;
1113:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1115:   /* forward solve the lower triangular */
1116:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1117:   for (i=1; i<n; i++) {
1118:     v   = aa + bs2*ai[i];
1119:     vi  = aj + ai[i];
1120:     nz  = a->diag[i] - ai[i];
1121:     s = t + bs*i;
1122:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1123:     while (nz--) {
1124:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1125:       v += bs2;
1126:     }
1127:   }
1128:   /* backward solve the upper triangular */
1129:   ls = a->solve_work + A->n;
1130:   for (i=n-1; i>=0; i--){
1131:     v   = aa + bs2*(a->diag[i] + 1);
1132:     vi  = aj + a->diag[i] + 1;
1133:     nz  = ai[i+1] - a->diag[i] - 1;
1134:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1135:     while (nz--) {
1136:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1137:       v += bs2;
1138:     }
1139:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1140:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1141:   }

1143:   ISRestoreIndices(isrow,&rout);
1144:   ISRestoreIndices(iscol,&cout);
1145:   VecRestoreArray(bb,&b);
1146:   VecRestoreArray(xx,&x);
1147:   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
1148:   return(0);
1149: }

1153: int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1154: {
1155:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1156:   IS              iscol=a->col,isrow=a->row;
1157:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1158:   int             *diag = a->diag;
1159:   MatScalar       *aa=a->a,*v;
1160:   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1161:   PetscScalar     *x,*b,*t;

1164:   VecGetArray(bb,&b);
1165:   VecGetArray(xx,&x);
1166:   t  = a->solve_work;

1168:   ISGetIndices(isrow,&rout); r = rout;
1169:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1171:   /* forward solve the lower triangular */
1172:   idx    = 7*(*r++);
1173:   t[0] = b[idx];   t[1] = b[1+idx];
1174:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175:   t[5] = b[5+idx]; t[6] = b[6+idx];

1177:   for (i=1; i<n; i++) {
1178:     v     = aa + 49*ai[i];
1179:     vi    = aj + ai[i];
1180:     nz    = diag[i] - ai[i];
1181:     idx   = 7*(*r++);
1182:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1184:     while (nz--) {
1185:       idx   = 7*(*vi++);
1186:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1187:       x4    = t[3+idx];x5 = t[4+idx];
1188:       x6    = t[5+idx];x7 = t[6+idx];
1189:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1196:       v += 49;
1197:     }
1198:     idx = 7*i;
1199:     t[idx]   = s1;t[1+idx] = s2;
1200:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201:     t[5+idx] = s6;t[6+idx] = s7;
1202:   }
1203:   /* backward solve the upper triangular */
1204:   for (i=n-1; i>=0; i--){
1205:     v    = aa + 49*diag[i] + 49;
1206:     vi   = aj + diag[i] + 1;
1207:     nz   = ai[i+1] - diag[i] - 1;
1208:     idt  = 7*i;
1209:     s1 = t[idt];  s2 = t[1+idt];
1210:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211:     s6 = t[5+idt];s7 = t[6+idt];
1212:     while (nz--) {
1213:       idx   = 7*(*vi++);
1214:       x1    = t[idx];   x2 = t[1+idx];
1215:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216:       x6    = t[5+idx]; x7 = t[6+idx];
1217:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1224:       v += 49;
1225:     }
1226:     idc = 7*(*c--);
1227:     v   = aa + 49*diag[i];
1228:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1229:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1242:   }

1244:   ISRestoreIndices(isrow,&rout);
1245:   ISRestoreIndices(iscol,&cout);
1246:   VecRestoreArray(bb,&b);
1247:   VecRestoreArray(xx,&x);
1248:   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1249:   return(0);
1250: }

1254: int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1255: {
1256:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1257:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1258:   int             ierr,*diag = a->diag,jdx;
1259:   MatScalar       *aa=a->a,*v;
1260:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

1263:   VecGetArray(bb,&b);
1264:   VecGetArray(xx,&x);
1265:   /* forward solve the lower triangular */
1266:   idx    = 0;
1267:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1268:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1269:   x[6] = b[6+idx];
1270:   for (i=1; i<n; i++) {
1271:     v     =  aa + 49*ai[i];
1272:     vi    =  aj + ai[i];
1273:     nz    =  diag[i] - ai[i];
1274:     idx   =  7*i;
1275:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1276:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277:     s7  =  b[6+idx];
1278:     while (nz--) {
1279:       jdx   = 7*(*vi++);
1280:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1281:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1282:       x7    = x[6+jdx];
1283:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1290:       v += 49;
1291:      }
1292:     x[idx]   = s1;
1293:     x[1+idx] = s2;
1294:     x[2+idx] = s3;
1295:     x[3+idx] = s4;
1296:     x[4+idx] = s5;
1297:     x[5+idx] = s6;
1298:     x[6+idx] = s7;
1299:   }
1300:   /* backward solve the upper triangular */
1301:   for (i=n-1; i>=0; i--){
1302:     v    = aa + 49*diag[i] + 49;
1303:     vi   = aj + diag[i] + 1;
1304:     nz   = ai[i+1] - diag[i] - 1;
1305:     idt  = 7*i;
1306:     s1 = x[idt];   s2 = x[1+idt];
1307:     s3 = x[2+idt]; s4 = x[3+idt];
1308:     s5 = x[4+idt]; s6 = x[5+idt];
1309:     s7 = x[6+idt];
1310:     while (nz--) {
1311:       idx   = 7*(*vi++);
1312:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1313:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1314:       x7    = x[6+idx];
1315:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1322:       v += 49;
1323:     }
1324:     v        = aa + 49*diag[i];
1325:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1326:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1328:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1330:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1332:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1334:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1336:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1338:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1339:   }

1341:   VecRestoreArray(bb,&b);
1342:   VecRestoreArray(xx,&x);
1343:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1344:   return(0);
1345: }

1349: int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1350: {
1351:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1352:   IS              iscol=a->col,isrow=a->row;
1353:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1354:   int             *diag = a->diag;
1355:   MatScalar       *aa=a->a,*v;
1356:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;

1359:   VecGetArray(bb,&b);
1360:   VecGetArray(xx,&x);
1361:   t  = a->solve_work;

1363:   ISGetIndices(isrow,&rout); r = rout;
1364:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1366:   /* forward solve the lower triangular */
1367:   idx    = 6*(*r++);
1368:   t[0] = b[idx];   t[1] = b[1+idx];
1369:   t[2] = b[2+idx]; t[3] = b[3+idx];
1370:   t[4] = b[4+idx]; t[5] = b[5+idx];
1371:   for (i=1; i<n; i++) {
1372:     v     = aa + 36*ai[i];
1373:     vi    = aj + ai[i];
1374:     nz    = diag[i] - ai[i];
1375:     idx   = 6*(*r++);
1376:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377:     s5  = b[4+idx]; s6 = b[5+idx];
1378:     while (nz--) {
1379:       idx   = 6*(*vi++);
1380:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1381:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1388:       v += 36;
1389:     }
1390:     idx = 6*i;
1391:     t[idx]   = s1;t[1+idx] = s2;
1392:     t[2+idx] = s3;t[3+idx] = s4;
1393:     t[4+idx] = s5;t[5+idx] = s6;
1394:   }
1395:   /* backward solve the upper triangular */
1396:   for (i=n-1; i>=0; i--){
1397:     v    = aa + 36*diag[i] + 36;
1398:     vi   = aj + diag[i] + 1;
1399:     nz   = ai[i+1] - diag[i] - 1;
1400:     idt  = 6*i;
1401:     s1 = t[idt];  s2 = t[1+idt];
1402:     s3 = t[2+idt];s4 = t[3+idt];
1403:     s5 = t[4+idt];s6 = t[5+idt];
1404:     while (nz--) {
1405:       idx   = 6*(*vi++);
1406:       x1    = t[idx];   x2 = t[1+idx];
1407:       x3    = t[2+idx]; x4 = t[3+idx];
1408:       x5    = t[4+idx]; x6 = t[5+idx];
1409:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1415:       v += 36;
1416:     }
1417:     idc = 6*(*c--);
1418:     v   = aa + 36*diag[i];
1419:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1420:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1421:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1423:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1425:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1427:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1429:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1431:   }

1433:   ISRestoreIndices(isrow,&rout);
1434:   ISRestoreIndices(iscol,&cout);
1435:   VecRestoreArray(bb,&b);
1436:   VecRestoreArray(xx,&x);
1437:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1438:   return(0);
1439: }

1443: int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1444: {
1445:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1446:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1447:   int             ierr,*diag = a->diag,jdx;
1448:   MatScalar       *aa=a->a,*v;
1449:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

1452:   VecGetArray(bb,&b);
1453:   VecGetArray(xx,&x);
1454:   /* forward solve the lower triangular */
1455:   idx    = 0;
1456:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1457:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1458:   for (i=1; i<n; i++) {
1459:     v     =  aa + 36*ai[i];
1460:     vi    =  aj + ai[i];
1461:     nz    =  diag[i] - ai[i];
1462:     idx   =  6*i;
1463:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1464:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1465:     while (nz--) {
1466:       jdx   = 6*(*vi++);
1467:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1468:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1475:       v += 36;
1476:      }
1477:     x[idx]   = s1;
1478:     x[1+idx] = s2;
1479:     x[2+idx] = s3;
1480:     x[3+idx] = s4;
1481:     x[4+idx] = s5;
1482:     x[5+idx] = s6;
1483:   }
1484:   /* backward solve the upper triangular */
1485:   for (i=n-1; i>=0; i--){
1486:     v    = aa + 36*diag[i] + 36;
1487:     vi   = aj + diag[i] + 1;
1488:     nz   = ai[i+1] - diag[i] - 1;
1489:     idt  = 6*i;
1490:     s1 = x[idt];   s2 = x[1+idt];
1491:     s3 = x[2+idt]; s4 = x[3+idt];
1492:     s5 = x[4+idt]; s6 = x[5+idt];
1493:     while (nz--) {
1494:       idx   = 6*(*vi++);
1495:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1496:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1503:       v += 36;
1504:     }
1505:     v        = aa + 36*diag[i];
1506:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1512:   }

1514:   VecRestoreArray(bb,&b);
1515:   VecRestoreArray(xx,&x);
1516:   PetscLogFlops(2*36*(a->nz) - 6*A->n);
1517:   return(0);
1518: }

1522: int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1523: {
1524:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1525:   IS              iscol=a->col,isrow=a->row;
1526:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1527:   int             *diag = a->diag;
1528:   MatScalar       *aa=a->a,*v;
1529:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;

1532:   VecGetArray(bb,&b);
1533:   VecGetArray(xx,&x);
1534:   t  = a->solve_work;

1536:   ISGetIndices(isrow,&rout); r = rout;
1537:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1539:   /* forward solve the lower triangular */
1540:   idx    = 5*(*r++);
1541:   t[0] = b[idx];   t[1] = b[1+idx];
1542:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1543:   for (i=1; i<n; i++) {
1544:     v     = aa + 25*ai[i];
1545:     vi    = aj + ai[i];
1546:     nz    = diag[i] - ai[i];
1547:     idx   = 5*(*r++);
1548:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549:     s5  = b[4+idx];
1550:     while (nz--) {
1551:       idx   = 5*(*vi++);
1552:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1553:       x4    = t[3+idx];x5 = t[4+idx];
1554:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1559:       v += 25;
1560:     }
1561:     idx = 5*i;
1562:     t[idx]   = s1;t[1+idx] = s2;
1563:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1564:   }
1565:   /* backward solve the upper triangular */
1566:   for (i=n-1; i>=0; i--){
1567:     v    = aa + 25*diag[i] + 25;
1568:     vi   = aj + diag[i] + 1;
1569:     nz   = ai[i+1] - diag[i] - 1;
1570:     idt  = 5*i;
1571:     s1 = t[idt];  s2 = t[1+idt];
1572:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1573:     while (nz--) {
1574:       idx   = 5*(*vi++);
1575:       x1    = t[idx];   x2 = t[1+idx];
1576:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582:       v += 25;
1583:     }
1584:     idc = 5*(*c--);
1585:     v   = aa + 25*diag[i];
1586:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1587:                                  v[15]*s4+v[20]*s5;
1588:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589:                                  v[16]*s4+v[21]*s5;
1590:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591:                                  v[17]*s4+v[22]*s5;
1592:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593:                                  v[18]*s4+v[23]*s5;
1594:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595:                                  v[19]*s4+v[24]*s5;
1596:   }

1598:   ISRestoreIndices(isrow,&rout);
1599:   ISRestoreIndices(iscol,&cout);
1600:   VecRestoreArray(bb,&b);
1601:   VecRestoreArray(xx,&x);
1602:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1603:   return(0);
1604: }

1608: int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1609: {
1610:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1611:   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1612:   int             ierr,*diag = a->diag,jdx;
1613:   MatScalar       *aa=a->a,*v;
1614:   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;

1617:   VecGetArray(bb,&b);
1618:   VecGetArray(xx,&x);
1619:   /* forward solve the lower triangular */
1620:   idx    = 0;
1621:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1622:   for (i=1; i<n; i++) {
1623:     v     =  aa + 25*ai[i];
1624:     vi    =  aj + ai[i];
1625:     nz    =  diag[i] - ai[i];
1626:     idx   =  5*i;
1627:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1628:     while (nz--) {
1629:       jdx   = 5*(*vi++);
1630:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1632:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1633:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1634:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1635:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1636:       v    += 25;
1637:     }
1638:     x[idx]   = s1;
1639:     x[1+idx] = s2;
1640:     x[2+idx] = s3;
1641:     x[3+idx] = s4;
1642:     x[4+idx] = s5;
1643:   }
1644:   /* backward solve the upper triangular */
1645:   for (i=n-1; i>=0; i--){
1646:     v    = aa + 25*diag[i] + 25;
1647:     vi   = aj + diag[i] + 1;
1648:     nz   = ai[i+1] - diag[i] - 1;
1649:     idt  = 5*i;
1650:     s1 = x[idt];  s2 = x[1+idt];
1651:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1652:     while (nz--) {
1653:       idx   = 5*(*vi++);
1654:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1656:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1657:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1658:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1659:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1660:       v    += 25;
1661:     }
1662:     v        = aa + 25*diag[i];
1663:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1664:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1665:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1666:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1667:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1668:   }

1670:   VecRestoreArray(bb,&b);
1671:   VecRestoreArray(xx,&x);
1672:   PetscLogFlops(2*25*(a->nz) - 5*A->n);
1673:   return(0);
1674: }

1678: int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1679: {
1680:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1681:   IS              iscol=a->col,isrow=a->row;
1682:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1683:   int             *diag = a->diag;
1684:   MatScalar       *aa=a->a,*v;
1685:   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;

1688:   VecGetArray(bb,&b);
1689:   VecGetArray(xx,&x);
1690:   t  = a->solve_work;

1692:   ISGetIndices(isrow,&rout); r = rout;
1693:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1695:   /* forward solve the lower triangular */
1696:   idx    = 4*(*r++);
1697:   t[0] = b[idx];   t[1] = b[1+idx];
1698:   t[2] = b[2+idx]; t[3] = b[3+idx];
1699:   for (i=1; i<n; i++) {
1700:     v     = aa + 16*ai[i];
1701:     vi    = aj + ai[i];
1702:     nz    = diag[i] - ai[i];
1703:     idx   = 4*(*r++);
1704:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1705:     while (nz--) {
1706:       idx   = 4*(*vi++);
1707:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1709:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1710:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1712:       v    += 16;
1713:     }
1714:     idx        = 4*i;
1715:     t[idx]   = s1;t[1+idx] = s2;
1716:     t[2+idx] = s3;t[3+idx] = s4;
1717:   }
1718:   /* backward solve the upper triangular */
1719:   for (i=n-1; i>=0; i--){
1720:     v    = aa + 16*diag[i] + 16;
1721:     vi   = aj + diag[i] + 1;
1722:     nz   = ai[i+1] - diag[i] - 1;
1723:     idt  = 4*i;
1724:     s1 = t[idt];  s2 = t[1+idt];
1725:     s3 = t[2+idt];s4 = t[3+idt];
1726:     while (nz--) {
1727:       idx   = 4*(*vi++);
1728:       x1    = t[idx];   x2 = t[1+idx];
1729:       x3    = t[2+idx]; x4 = t[3+idx];
1730:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1731:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1732:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1733:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1734:       v += 16;
1735:     }
1736:     idc      = 4*(*c--);
1737:     v        = aa + 16*diag[i];
1738:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1742:   }

1744:   ISRestoreIndices(isrow,&rout);
1745:   ISRestoreIndices(iscol,&cout);
1746:   VecRestoreArray(bb,&b);
1747:   VecRestoreArray(xx,&x);
1748:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1749:   return(0);
1750: }

1754: int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1755: {
1756:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1757:   IS              iscol=a->col,isrow=a->row;
1758:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1759:   int             *diag = a->diag;
1760:   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1761:   PetscScalar     *x,*b;

1764:   VecGetArray(bb,&b);
1765:   VecGetArray(xx,&x);
1766:   t  = (MatScalar *)a->solve_work;

1768:   ISGetIndices(isrow,&rout); r = rout;
1769:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1771:   /* forward solve the lower triangular */
1772:   idx    = 4*(*r++);
1773:   t[0] = (MatScalar)b[idx];
1774:   t[1] = (MatScalar)b[1+idx];
1775:   t[2] = (MatScalar)b[2+idx];
1776:   t[3] = (MatScalar)b[3+idx];
1777:   for (i=1; i<n; i++) {
1778:     v     = aa + 16*ai[i];
1779:     vi    = aj + ai[i];
1780:     nz    = diag[i] - ai[i];
1781:     idx   = 4*(*r++);
1782:     s1 = (MatScalar)b[idx];
1783:     s2 = (MatScalar)b[1+idx];
1784:     s3 = (MatScalar)b[2+idx];
1785:     s4 = (MatScalar)b[3+idx];
1786:     while (nz--) {
1787:       idx   = 4*(*vi++);
1788:       x1  = t[idx];
1789:       x2  = t[1+idx];
1790:       x3  = t[2+idx];
1791:       x4  = t[3+idx];
1792:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1793:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1794:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1795:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1796:       v    += 16;
1797:     }
1798:     idx        = 4*i;
1799:     t[idx]   = s1;
1800:     t[1+idx] = s2;
1801:     t[2+idx] = s3;
1802:     t[3+idx] = s4;
1803:   }
1804:   /* backward solve the upper triangular */
1805:   for (i=n-1; i>=0; i--){
1806:     v    = aa + 16*diag[i] + 16;
1807:     vi   = aj + diag[i] + 1;
1808:     nz   = ai[i+1] - diag[i] - 1;
1809:     idt  = 4*i;
1810:     s1 = t[idt];
1811:     s2 = t[1+idt];
1812:     s3 = t[2+idt];
1813:     s4 = t[3+idt];
1814:     while (nz--) {
1815:       idx   = 4*(*vi++);
1816:       x1  = t[idx];
1817:       x2  = t[1+idx];
1818:       x3  = t[2+idx];
1819:       x4  = t[3+idx];
1820:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1821:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1822:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1823:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1824:       v += 16;
1825:     }
1826:     idc      = 4*(*c--);
1827:     v        = aa + 16*diag[i];
1828:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1829:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1830:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1831:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1832:     x[idc]   = (PetscScalar)t[idt];
1833:     x[1+idc] = (PetscScalar)t[1+idt];
1834:     x[2+idc] = (PetscScalar)t[2+idt];
1835:     x[3+idc] = (PetscScalar)t[3+idt];
1836:  }

1838:   ISRestoreIndices(isrow,&rout);
1839:   ISRestoreIndices(iscol,&cout);
1840:   VecRestoreArray(bb,&b);
1841:   VecRestoreArray(xx,&x);
1842:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1843:   return(0);
1844: }

1846: #if defined (PETSC_HAVE_SSE)

1848: #include PETSC_HAVE_SSE

1852: int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1853: {
1854:   /* 
1855:      Note: This code uses demotion of double
1856:      to float when performing the mixed-mode computation.
1857:      This may not be numerically reasonable for all applications.
1858:   */
1859:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1860:   IS              iscol=a->col,isrow=a->row;
1861:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1862:   int             *diag = a->diag,ai16;
1863:   MatScalar       *aa=a->a,*v;
1864:   PetscScalar     *x,*b,*t;

1866:   /* Make space in temp stack for 16 Byte Aligned arrays */
1867:   float           ssealignedspace[11],*tmps,*tmpx;
1868:   unsigned long   offset;
1869: 
1871:   SSE_SCOPE_BEGIN;

1873:     offset = (unsigned long)ssealignedspace % 16;
1874:     if (offset) offset = (16 - offset)/4;
1875:     tmps = &ssealignedspace[offset];
1876:     tmpx = &ssealignedspace[offset+4];
1877:     PREFETCH_NTA(aa+16*ai[1]);

1879:     VecGetArray(bb,&b);
1880:     VecGetArray(xx,&x);
1881:     t  = a->solve_work;

1883:     ISGetIndices(isrow,&rout); r = rout;
1884:     ISGetIndices(iscol,&cout); c = cout + (n-1);

1886:     /* forward solve the lower triangular */
1887:     idx  = 4*(*r++);
1888:     t[0] = b[idx];   t[1] = b[1+idx];
1889:     t[2] = b[2+idx]; t[3] = b[3+idx];
1890:     v    =  aa + 16*ai[1];

1892:     for (i=1; i<n;) {
1893:       PREFETCH_NTA(&v[8]);
1894:       vi   =  aj      + ai[i];
1895:       nz   =  diag[i] - ai[i];
1896:       idx  =  4*(*r++);

1898:       /* Demote sum from double to float */
1899:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1900:       LOAD_PS(tmps,XMM7);

1902:       while (nz--) {
1903:         PREFETCH_NTA(&v[16]);
1904:         idx = 4*(*vi++);
1905: 
1906:         /* Demote solution (so far) from double to float */
1907:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

1909:         /* 4x4 Matrix-Vector product with negative accumulation: */
1910:         SSE_INLINE_BEGIN_2(tmpx,v)
1911:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1913:           /* First Column */
1914:           SSE_COPY_PS(XMM0,XMM6)
1915:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1916:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1917:           SSE_SUB_PS(XMM7,XMM0)
1918: 
1919:           /* Second Column */
1920:           SSE_COPY_PS(XMM1,XMM6)
1921:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1922:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1923:           SSE_SUB_PS(XMM7,XMM1)
1924: 
1925:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1926: 
1927:           /* Third Column */
1928:           SSE_COPY_PS(XMM2,XMM6)
1929:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1930:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1931:           SSE_SUB_PS(XMM7,XMM2)

1933:           /* Fourth Column */
1934:           SSE_COPY_PS(XMM3,XMM6)
1935:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1936:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1937:           SSE_SUB_PS(XMM7,XMM3)
1938:         SSE_INLINE_END_2
1939: 
1940:         v  += 16;
1941:       }
1942:       idx = 4*i;
1943:       v   = aa + 16*ai[++i];
1944:       PREFETCH_NTA(v);
1945:       STORE_PS(tmps,XMM7);

1947:       /* Promote result from float to double */
1948:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1949:     }
1950:     /* backward solve the upper triangular */
1951:     idt  = 4*(n-1);
1952:     ai16 = 16*diag[n-1];
1953:     v    = aa + ai16 + 16;
1954:     for (i=n-1; i>=0;){
1955:       PREFETCH_NTA(&v[8]);
1956:       vi = aj + diag[i] + 1;
1957:       nz = ai[i+1] - diag[i] - 1;
1958: 
1959:       /* Demote accumulator from double to float */
1960:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1961:       LOAD_PS(tmps,XMM7);

1963:       while (nz--) {
1964:         PREFETCH_NTA(&v[16]);
1965:         idx = 4*(*vi++);

1967:         /* Demote solution (so far) from double to float */
1968:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

1970:         /* 4x4 Matrix-Vector Product with negative accumulation: */
1971:         SSE_INLINE_BEGIN_2(tmpx,v)
1972:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1974:           /* First Column */
1975:           SSE_COPY_PS(XMM0,XMM6)
1976:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1977:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1978:           SSE_SUB_PS(XMM7,XMM0)

1980:           /* Second Column */
1981:           SSE_COPY_PS(XMM1,XMM6)
1982:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1983:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1984:           SSE_SUB_PS(XMM7,XMM1)

1986:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1987: 
1988:           /* Third Column */
1989:           SSE_COPY_PS(XMM2,XMM6)
1990:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1991:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1992:           SSE_SUB_PS(XMM7,XMM2)

1994:           /* Fourth Column */
1995:           SSE_COPY_PS(XMM3,XMM6)
1996:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1997:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1998:           SSE_SUB_PS(XMM7,XMM3)
1999:         SSE_INLINE_END_2
2000:         v  += 16;
2001:       }
2002:       v    = aa + ai16;
2003:       ai16 = 16*diag[--i];
2004:       PREFETCH_NTA(aa+ai16+16);
2005:       /* 
2006:          Scale the result by the diagonal 4x4 block, 
2007:          which was inverted as part of the factorization
2008:       */
2009:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2010:         /* First Column */
2011:         SSE_COPY_PS(XMM0,XMM7)
2012:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2013:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2015:         /* Second Column */
2016:         SSE_COPY_PS(XMM1,XMM7)
2017:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2018:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2019:         SSE_ADD_PS(XMM0,XMM1)

2021:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2022: 
2023:         /* Third Column */
2024:         SSE_COPY_PS(XMM2,XMM7)
2025:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2026:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2027:         SSE_ADD_PS(XMM0,XMM2)

2029:         /* Fourth Column */
2030:         SSE_COPY_PS(XMM3,XMM7)
2031:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2032:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2033:         SSE_ADD_PS(XMM0,XMM3)
2034: 
2035:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2036:       SSE_INLINE_END_3

2038:       /* Promote solution from float to double */
2039:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

2041:       /* Apply reordering to t and stream into x.    */
2042:       /* This way, x doesn't pollute the cache.      */
2043:       /* Be careful with size: 2 doubles = 4 floats! */
2044:       idc  = 4*(*c--);
2045:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2046:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2047:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2048:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2049:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2050:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2051:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2052:       SSE_INLINE_END_2
2053:       v    = aa + ai16 + 16;
2054:       idt -= 4;
2055:     }

2057:     ISRestoreIndices(isrow,&rout);
2058:     ISRestoreIndices(iscol,&cout);
2059:     VecRestoreArray(bb,&b);
2060:     VecRestoreArray(xx,&x);
2061:     PetscLogFlops(2*16*(a->nz) - 4*A->n);
2062:   SSE_SCOPE_END;
2063:   return(0);
2064: }

2066: #endif


2069: /*
2070:       Special case where the matrix was ILU(0) factored in the natural
2071:    ordering. This eliminates the need for the column and row permutation.
2072: */
2075: int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2076: {
2077:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2078:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2079:   int             ierr,*diag = a->diag;
2080:   MatScalar       *aa=a->a;
2081:   PetscScalar     *x,*b;

2084:   VecGetArray(bb,&b);
2085:   VecGetArray(xx,&x);

2087: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2088:   {
2089:     static PetscScalar w[2000]; /* very BAD need to fix */
2090:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2091:   }
2092: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2093:   {
2094:     static PetscScalar w[2000]; /* very BAD need to fix */
2095:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2096:   }
2097: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2098:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2099: #else
2100:   {
2101:     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2102:     MatScalar    *v;
2103:     int          jdx,idt,idx,nz,*vi,i,ai16;

2105:   /* forward solve the lower triangular */
2106:   idx    = 0;
2107:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2108:   for (i=1; i<n; i++) {
2109:     v     =  aa      + 16*ai[i];
2110:     vi    =  aj      + ai[i];
2111:     nz    =  diag[i] - ai[i];
2112:     idx   +=  4;
2113:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2114:     while (nz--) {
2115:       jdx   = 4*(*vi++);
2116:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2117:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2118:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2119:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2120:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2121:       v    += 16;
2122:     }
2123:     x[idx]   = s1;
2124:     x[1+idx] = s2;
2125:     x[2+idx] = s3;
2126:     x[3+idx] = s4;
2127:   }
2128:   /* backward solve the upper triangular */
2129:   idt = 4*(n-1);
2130:   for (i=n-1; i>=0; i--){
2131:     ai16 = 16*diag[i];
2132:     v    = aa + ai16 + 16;
2133:     vi   = aj + diag[i] + 1;
2134:     nz   = ai[i+1] - diag[i] - 1;
2135:     s1 = x[idt];  s2 = x[1+idt];
2136:     s3 = x[2+idt];s4 = x[3+idt];
2137:     while (nz--) {
2138:       idx   = 4*(*vi++);
2139:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2140:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2141:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2142:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2143:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2144:       v    += 16;
2145:     }
2146:     v        = aa + ai16;
2147:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2148:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2149:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2150:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2151:     idt -= 4;
2152:   }
2153:   }
2154: #endif

2156:   VecRestoreArray(bb,&b);
2157:   VecRestoreArray(xx,&x);
2158:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2159:   return(0);
2160: }

2164: int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2165: {
2166:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2167:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2168:   int             ierr,*diag = a->diag;
2169:   MatScalar       *aa=a->a;
2170:   PetscScalar     *x,*b;

2173:   VecGetArray(bb,&b);
2174:   VecGetArray(xx,&x);

2176:   {
2177:     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2178:     MatScalar  *v,*t=(MatScalar *)x;
2179:     int        jdx,idt,idx,nz,*vi,i,ai16;

2181:     /* forward solve the lower triangular */
2182:     idx  = 0;
2183:     t[0] = (MatScalar)b[0];
2184:     t[1] = (MatScalar)b[1];
2185:     t[2] = (MatScalar)b[2];
2186:     t[3] = (MatScalar)b[3];
2187:     for (i=1; i<n; i++) {
2188:       v     =  aa      + 16*ai[i];
2189:       vi    =  aj      + ai[i];
2190:       nz    =  diag[i] - ai[i];
2191:       idx   +=  4;
2192:       s1 = (MatScalar)b[idx];
2193:       s2 = (MatScalar)b[1+idx];
2194:       s3 = (MatScalar)b[2+idx];
2195:       s4 = (MatScalar)b[3+idx];
2196:       while (nz--) {
2197:         jdx = 4*(*vi++);
2198:         x1  = t[jdx];
2199:         x2  = t[1+jdx];
2200:         x3  = t[2+jdx];
2201:         x4  = t[3+jdx];
2202:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2203:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2204:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2205:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2206:         v    += 16;
2207:       }
2208:       t[idx]   = s1;
2209:       t[1+idx] = s2;
2210:       t[2+idx] = s3;
2211:       t[3+idx] = s4;
2212:     }
2213:     /* backward solve the upper triangular */
2214:     idt = 4*(n-1);
2215:     for (i=n-1; i>=0; i--){
2216:       ai16 = 16*diag[i];
2217:       v    = aa + ai16 + 16;
2218:       vi   = aj + diag[i] + 1;
2219:       nz   = ai[i+1] - diag[i] - 1;
2220:       s1   = t[idt];
2221:       s2   = t[1+idt];
2222:       s3   = t[2+idt];
2223:       s4   = t[3+idt];
2224:       while (nz--) {
2225:         idx = 4*(*vi++);
2226:         x1  = (MatScalar)x[idx];
2227:         x2  = (MatScalar)x[1+idx];
2228:         x3  = (MatScalar)x[2+idx];
2229:         x4  = (MatScalar)x[3+idx];
2230:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2231:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2232:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2234:         v    += 16;
2235:       }
2236:       v        = aa + ai16;
2237:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2238:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2239:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2240:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2241:       idt -= 4;
2242:     }
2243:   }

2245:   VecRestoreArray(bb,&b);
2246:   VecRestoreArray(xx,&x);
2247:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2248:   return(0);
2249: }

2251: #if defined (PETSC_HAVE_SSE)

2253: #include PETSC_HAVE_SSE
2254: #include "src/vec/vecimpl.h" /* to allow VecGetArrayFast() */
2257: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2258: {
2259:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2260:   unsigned short *aj=(unsigned short *)a->j;
2261:   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2262:   MatScalar      *aa=a->a;
2263:   PetscScalar    *x,*b;

2266:   SSE_SCOPE_BEGIN;
2267:   /* 
2268:      Note: This code currently uses demotion of double
2269:      to float when performing the mixed-mode computation.
2270:      This may not be numerically reasonable for all applications.
2271:   */
2272:   PREFETCH_NTA(aa+16*ai[1]);

2274:   VecGetArrayFast(bb,&b);
2275:   VecGetArrayFast(xx,&x);
2276:   {
2277:     /* x will first be computed in single precision then promoted inplace to double */
2278:     MatScalar      *v,*t=(MatScalar *)x;
2279:     int            nz,i,idt,ai16;
2280:     unsigned int   jdx,idx;
2281:     unsigned short *vi;
2282:     /* Forward solve the lower triangular factor. */

2284:     /* First block is the identity. */
2285:     idx  = 0;
2286:     CONVERT_DOUBLE4_FLOAT4(t,b);
2287:     v    =  aa + 16*((unsigned int)ai[1]);

2289:     for (i=1; i<n;) {
2290:       PREFETCH_NTA(&v[8]);
2291:       vi   =  aj      + ai[i];
2292:       nz   =  diag[i] - ai[i];
2293:       idx +=  4;

2295:       /* Demote RHS from double to float. */
2296:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2297:       LOAD_PS(&t[idx],XMM7);

2299:       while (nz--) {
2300:         PREFETCH_NTA(&v[16]);
2301:         jdx = 4*((unsigned int)(*vi++));
2302: 
2303:         /* 4x4 Matrix-Vector product with negative accumulation: */
2304:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2305:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2307:           /* First Column */
2308:           SSE_COPY_PS(XMM0,XMM6)
2309:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2310:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2311:           SSE_SUB_PS(XMM7,XMM0)

2313:           /* Second Column */
2314:           SSE_COPY_PS(XMM1,XMM6)
2315:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2316:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2317:           SSE_SUB_PS(XMM7,XMM1)

2319:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2320: 
2321:           /* Third Column */
2322:           SSE_COPY_PS(XMM2,XMM6)
2323:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2324:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2325:           SSE_SUB_PS(XMM7,XMM2)

2327:           /* Fourth Column */
2328:           SSE_COPY_PS(XMM3,XMM6)
2329:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2330:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2331:           SSE_SUB_PS(XMM7,XMM3)
2332:         SSE_INLINE_END_2
2333: 
2334:         v  += 16;
2335:       }
2336:       v    =  aa + 16*ai[++i];
2337:       PREFETCH_NTA(v);
2338:       STORE_PS(&t[idx],XMM7);
2339:     }

2341:     /* Backward solve the upper triangular factor.*/

2343:     idt  = 4*(n-1);
2344:     ai16 = 16*diag[n-1];
2345:     v    = aa + ai16 + 16;
2346:     for (i=n-1; i>=0;){
2347:       PREFETCH_NTA(&v[8]);
2348:       vi = aj + diag[i] + 1;
2349:       nz = ai[i+1] - diag[i] - 1;
2350: 
2351:       LOAD_PS(&t[idt],XMM7);

2353:       while (nz--) {
2354:         PREFETCH_NTA(&v[16]);
2355:         idx = 4*((unsigned int)(*vi++));

2357:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2358:         SSE_INLINE_BEGIN_2(&t[idx],v)
2359:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2361:           /* First Column */
2362:           SSE_COPY_PS(XMM0,XMM6)
2363:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2364:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2365:           SSE_SUB_PS(XMM7,XMM0)

2367:           /* Second Column */
2368:           SSE_COPY_PS(XMM1,XMM6)
2369:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2370:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2371:           SSE_SUB_PS(XMM7,XMM1)

2373:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2374: 
2375:           /* Third Column */
2376:           SSE_COPY_PS(XMM2,XMM6)
2377:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2378:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2379:           SSE_SUB_PS(XMM7,XMM2)

2381:           /* Fourth Column */
2382:           SSE_COPY_PS(XMM3,XMM6)
2383:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2384:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2385:           SSE_SUB_PS(XMM7,XMM3)
2386:         SSE_INLINE_END_2
2387:         v  += 16;
2388:       }
2389:       v    = aa + ai16;
2390:       ai16 = 16*diag[--i];
2391:       PREFETCH_NTA(aa+ai16+16);
2392:       /* 
2393:          Scale the result by the diagonal 4x4 block, 
2394:          which was inverted as part of the factorization
2395:       */
2396:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2397:         /* First Column */
2398:         SSE_COPY_PS(XMM0,XMM7)
2399:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2400:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2402:         /* Second Column */
2403:         SSE_COPY_PS(XMM1,XMM7)
2404:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2405:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2406:         SSE_ADD_PS(XMM0,XMM1)

2408:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2409: 
2410:         /* Third Column */
2411:         SSE_COPY_PS(XMM2,XMM7)
2412:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2413:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2414:         SSE_ADD_PS(XMM0,XMM2)

2416:         /* Fourth Column */
2417:         SSE_COPY_PS(XMM3,XMM7)
2418:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2419:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2420:         SSE_ADD_PS(XMM0,XMM3)

2422:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2423:       SSE_INLINE_END_3

2425:       v    = aa + ai16 + 16;
2426:       idt -= 4;
2427:     }

2429:     /* Convert t from single precision back to double precision (inplace)*/
2430:     idt = 4*(n-1);
2431:     for (i=n-1;i>=0;i--) {
2432:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2433:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2434:       PetscScalar *xtemp=&x[idt];
2435:       MatScalar   *ttemp=&t[idt];
2436:       xtemp[3] = (PetscScalar)ttemp[3];
2437:       xtemp[2] = (PetscScalar)ttemp[2];
2438:       xtemp[1] = (PetscScalar)ttemp[1];
2439:       xtemp[0] = (PetscScalar)ttemp[0];
2440:       idt -= 4;
2441:     }

2443:   } /* End of artificial scope. */
2444:   VecRestoreArrayFast(bb,&b);
2445:   VecRestoreArrayFast(xx,&x);
2446:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2447:   SSE_SCOPE_END;
2448:   return(0);
2449: }

2453: int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2454: {
2455:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2456:   int            *aj=a->j;
2457:   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
2458:   MatScalar      *aa=a->a;
2459:   PetscScalar    *x,*b;

2462:   SSE_SCOPE_BEGIN;
2463:   /* 
2464:      Note: This code currently uses demotion of double
2465:      to float when performing the mixed-mode computation.
2466:      This may not be numerically reasonable for all applications.
2467:   */
2468:   PREFETCH_NTA(aa+16*ai[1]);

2470:   VecGetArrayFast(bb,&b);
2471:   VecGetArrayFast(xx,&x);
2472:   {
2473:     /* x will first be computed in single precision then promoted inplace to double */
2474:     MatScalar *v,*t=(MatScalar *)x;
2475:     int       nz,i,idt,ai16;
2476:     int       jdx,idx;
2477:     int       *vi;
2478:     /* Forward solve the lower triangular factor. */

2480:     /* First block is the identity. */
2481:     idx  = 0;
2482:     CONVERT_DOUBLE4_FLOAT4(t,b);
2483:     v    =  aa + 16*ai[1];

2485:     for (i=1; i<n;) {
2486:       PREFETCH_NTA(&v[8]);
2487:       vi   =  aj      + ai[i];
2488:       nz   =  diag[i] - ai[i];
2489:       idx +=  4;

2491:       /* Demote RHS from double to float. */
2492:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2493:       LOAD_PS(&t[idx],XMM7);

2495:       while (nz--) {
2496:         PREFETCH_NTA(&v[16]);
2497:         jdx = 4*(*vi++);
2498: /*          jdx = *vi++; */
2499: 
2500:         /* 4x4 Matrix-Vector product with negative accumulation: */
2501:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2502:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2504:           /* First Column */
2505:           SSE_COPY_PS(XMM0,XMM6)
2506:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2507:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2508:           SSE_SUB_PS(XMM7,XMM0)

2510:           /* Second Column */
2511:           SSE_COPY_PS(XMM1,XMM6)
2512:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2513:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2514:           SSE_SUB_PS(XMM7,XMM1)

2516:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2517: 
2518:           /* Third Column */
2519:           SSE_COPY_PS(XMM2,XMM6)
2520:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2521:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2522:           SSE_SUB_PS(XMM7,XMM2)

2524:           /* Fourth Column */
2525:           SSE_COPY_PS(XMM3,XMM6)
2526:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2527:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2528:           SSE_SUB_PS(XMM7,XMM3)
2529:         SSE_INLINE_END_2
2530: 
2531:         v  += 16;
2532:       }
2533:       v    =  aa + 16*ai[++i];
2534:       PREFETCH_NTA(v);
2535:       STORE_PS(&t[idx],XMM7);
2536:     }

2538:     /* Backward solve the upper triangular factor.*/

2540:     idt  = 4*(n-1);
2541:     ai16 = 16*diag[n-1];
2542:     v    = aa + ai16 + 16;
2543:     for (i=n-1; i>=0;){
2544:       PREFETCH_NTA(&v[8]);
2545:       vi = aj + diag[i] + 1;
2546:       nz = ai[i+1] - diag[i] - 1;
2547: 
2548:       LOAD_PS(&t[idt],XMM7);

2550:       while (nz--) {
2551:         PREFETCH_NTA(&v[16]);
2552:         idx = 4*(*vi++);
2553: /*          idx = *vi++; */

2555:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2556:         SSE_INLINE_BEGIN_2(&t[idx],v)
2557:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2559:           /* First Column */
2560:           SSE_COPY_PS(XMM0,XMM6)
2561:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2562:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2563:           SSE_SUB_PS(XMM7,XMM0)

2565:           /* Second Column */
2566:           SSE_COPY_PS(XMM1,XMM6)
2567:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2568:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2569:           SSE_SUB_PS(XMM7,XMM1)

2571:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2572: 
2573:           /* Third Column */
2574:           SSE_COPY_PS(XMM2,XMM6)
2575:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2576:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2577:           SSE_SUB_PS(XMM7,XMM2)

2579:           /* Fourth Column */
2580:           SSE_COPY_PS(XMM3,XMM6)
2581:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2582:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2583:           SSE_SUB_PS(XMM7,XMM3)
2584:         SSE_INLINE_END_2
2585:         v  += 16;
2586:       }
2587:       v    = aa + ai16;
2588:       ai16 = 16*diag[--i];
2589:       PREFETCH_NTA(aa+ai16+16);
2590:       /* 
2591:          Scale the result by the diagonal 4x4 block, 
2592:          which was inverted as part of the factorization
2593:       */
2594:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2595:         /* First Column */
2596:         SSE_COPY_PS(XMM0,XMM7)
2597:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2598:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2600:         /* Second Column */
2601:         SSE_COPY_PS(XMM1,XMM7)
2602:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2603:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2604:         SSE_ADD_PS(XMM0,XMM1)

2606:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2607: 
2608:         /* Third Column */
2609:         SSE_COPY_PS(XMM2,XMM7)
2610:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2611:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2612:         SSE_ADD_PS(XMM0,XMM2)

2614:         /* Fourth Column */
2615:         SSE_COPY_PS(XMM3,XMM7)
2616:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2617:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2618:         SSE_ADD_PS(XMM0,XMM3)

2620:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2621:       SSE_INLINE_END_3

2623:       v    = aa + ai16 + 16;
2624:       idt -= 4;
2625:     }

2627:     /* Convert t from single precision back to double precision (inplace)*/
2628:     idt = 4*(n-1);
2629:     for (i=n-1;i>=0;i--) {
2630:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2631:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2632:       PetscScalar *xtemp=&x[idt];
2633:       MatScalar   *ttemp=&t[idt];
2634:       xtemp[3] = (PetscScalar)ttemp[3];
2635:       xtemp[2] = (PetscScalar)ttemp[2];
2636:       xtemp[1] = (PetscScalar)ttemp[1];
2637:       xtemp[0] = (PetscScalar)ttemp[0];
2638:       idt -= 4;
2639:     }

2641:   } /* End of artificial scope. */
2642:   VecRestoreArrayFast(bb,&b);
2643:   VecRestoreArrayFast(xx,&x);
2644:   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2645:   SSE_SCOPE_END;
2646:   return(0);
2647: }

2649: #endif
2652: int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2653: {
2654:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2655:   IS              iscol=a->col,isrow=a->row;
2656:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2657:   int             *diag = a->diag;
2658:   MatScalar       *aa=a->a,*v;
2659:   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;

2662:   VecGetArray(bb,&b);
2663:   VecGetArray(xx,&x);
2664:   t  = a->solve_work;

2666:   ISGetIndices(isrow,&rout); r = rout;
2667:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2669:   /* forward solve the lower triangular */
2670:   idx    = 3*(*r++);
2671:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2672:   for (i=1; i<n; i++) {
2673:     v     = aa + 9*ai[i];
2674:     vi    = aj + ai[i];
2675:     nz    = diag[i] - ai[i];
2676:     idx   = 3*(*r++);
2677:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2678:     while (nz--) {
2679:       idx   = 3*(*vi++);
2680:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2681:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2682:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2683:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2684:       v += 9;
2685:     }
2686:     idx = 3*i;
2687:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2688:   }
2689:   /* backward solve the upper triangular */
2690:   for (i=n-1; i>=0; i--){
2691:     v    = aa + 9*diag[i] + 9;
2692:     vi   = aj + diag[i] + 1;
2693:     nz   = ai[i+1] - diag[i] - 1;
2694:     idt  = 3*i;
2695:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2696:     while (nz--) {
2697:       idx   = 3*(*vi++);
2698:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2699:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2700:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2701:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2702:       v += 9;
2703:     }
2704:     idc = 3*(*c--);
2705:     v   = aa + 9*diag[i];
2706:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2707:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2708:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2709:   }
2710:   ISRestoreIndices(isrow,&rout);
2711:   ISRestoreIndices(iscol,&cout);
2712:   VecRestoreArray(bb,&b);
2713:   VecRestoreArray(xx,&x);
2714:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2715:   return(0);
2716: }

2718: /*
2719:       Special case where the matrix was ILU(0) factored in the natural
2720:    ordering. This eliminates the need for the column and row permutation.
2721: */
2724: int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2725: {
2726:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2727:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2728:   int             ierr,*diag = a->diag;
2729:   MatScalar       *aa=a->a,*v;
2730:   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
2731:   int             jdx,idt,idx,nz,*vi,i;

2734:   VecGetArray(bb,&b);
2735:   VecGetArray(xx,&x);


2738:   /* forward solve the lower triangular */
2739:   idx    = 0;
2740:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2741:   for (i=1; i<n; i++) {
2742:     v     =  aa      + 9*ai[i];
2743:     vi    =  aj      + ai[i];
2744:     nz    =  diag[i] - ai[i];
2745:     idx   +=  3;
2746:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2747:     while (nz--) {
2748:       jdx   = 3*(*vi++);
2749:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2750:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2751:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2752:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2753:       v    += 9;
2754:     }
2755:     x[idx]   = s1;
2756:     x[1+idx] = s2;
2757:     x[2+idx] = s3;
2758:   }
2759:   /* backward solve the upper triangular */
2760:   for (i=n-1; i>=0; i--){
2761:     v    = aa + 9*diag[i] + 9;
2762:     vi   = aj + diag[i] + 1;
2763:     nz   = ai[i+1] - diag[i] - 1;
2764:     idt  = 3*i;
2765:     s1 = x[idt];  s2 = x[1+idt];
2766:     s3 = x[2+idt];
2767:     while (nz--) {
2768:       idx   = 3*(*vi++);
2769:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2770:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2771:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2772:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2773:       v    += 9;
2774:     }
2775:     v        = aa +  9*diag[i];
2776:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2777:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2778:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2779:   }

2781:   VecRestoreArray(bb,&b);
2782:   VecRestoreArray(xx,&x);
2783:   PetscLogFlops(2*9*(a->nz) - 3*A->n);
2784:   return(0);
2785: }

2789: int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2790: {
2791:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2792:   IS              iscol=a->col,isrow=a->row;
2793:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2794:   int             *diag = a->diag;
2795:   MatScalar       *aa=a->a,*v;
2796:   PetscScalar     *x,*b,s1,s2,x1,x2,*t;

2799:   VecGetArray(bb,&b);
2800:   VecGetArray(xx,&x);
2801:   t  = a->solve_work;

2803:   ISGetIndices(isrow,&rout); r = rout;
2804:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2806:   /* forward solve the lower triangular */
2807:   idx    = 2*(*r++);
2808:   t[0] = b[idx]; t[1] = b[1+idx];
2809:   for (i=1; i<n; i++) {
2810:     v     = aa + 4*ai[i];
2811:     vi    = aj + ai[i];
2812:     nz    = diag[i] - ai[i];
2813:     idx   = 2*(*r++);
2814:     s1  = b[idx]; s2 = b[1+idx];
2815:     while (nz--) {
2816:       idx   = 2*(*vi++);
2817:       x1    = t[idx]; x2 = t[1+idx];
2818:       s1 -= v[0]*x1 + v[2]*x2;
2819:       s2 -= v[1]*x1 + v[3]*x2;
2820:       v += 4;
2821:     }
2822:     idx = 2*i;
2823:     t[idx] = s1; t[1+idx] = s2;
2824:   }
2825:   /* backward solve the upper triangular */
2826:   for (i=n-1; i>=0; i--){
2827:     v    = aa + 4*diag[i] + 4;
2828:     vi   = aj + diag[i] + 1;
2829:     nz   = ai[i+1] - diag[i] - 1;
2830:     idt  = 2*i;
2831:     s1 = t[idt]; s2 = t[1+idt];
2832:     while (nz--) {
2833:       idx   = 2*(*vi++);
2834:       x1    = t[idx]; x2 = t[1+idx];
2835:       s1 -= v[0]*x1 + v[2]*x2;
2836:       s2 -= v[1]*x1 + v[3]*x2;
2837:       v += 4;
2838:     }
2839:     idc = 2*(*c--);
2840:     v   = aa + 4*diag[i];
2841:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2842:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2843:   }
2844:   ISRestoreIndices(isrow,&rout);
2845:   ISRestoreIndices(iscol,&cout);
2846:   VecRestoreArray(bb,&b);
2847:   VecRestoreArray(xx,&x);
2848:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2849:   return(0);
2850: }

2852: /*
2853:       Special case where the matrix was ILU(0) factored in the natural
2854:    ordering. This eliminates the need for the column and row permutation.
2855: */
2858: int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2859: {
2860:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2861:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2862:   int             ierr,*diag = a->diag;
2863:   MatScalar       *aa=a->a,*v;
2864:   PetscScalar     *x,*b,s1,s2,x1,x2;
2865:   int             jdx,idt,idx,nz,*vi,i;

2868:   VecGetArray(bb,&b);
2869:   VecGetArray(xx,&x);

2871:   /* forward solve the lower triangular */
2872:   idx    = 0;
2873:   x[0]   = b[0]; x[1] = b[1];
2874:   for (i=1; i<n; i++) {
2875:     v     =  aa      + 4*ai[i];
2876:     vi    =  aj      + ai[i];
2877:     nz    =  diag[i] - ai[i];
2878:     idx   +=  2;
2879:     s1  =  b[idx];s2 = b[1+idx];
2880:     while (nz--) {
2881:       jdx   = 2*(*vi++);
2882:       x1    = x[jdx];x2 = x[1+jdx];
2883:       s1 -= v[0]*x1 + v[2]*x2;
2884:       s2 -= v[1]*x1 + v[3]*x2;
2885:       v    += 4;
2886:     }
2887:     x[idx]   = s1;
2888:     x[1+idx] = s2;
2889:   }
2890:   /* backward solve the upper triangular */
2891:   for (i=n-1; i>=0; i--){
2892:     v    = aa + 4*diag[i] + 4;
2893:     vi   = aj + diag[i] + 1;
2894:     nz   = ai[i+1] - diag[i] - 1;
2895:     idt  = 2*i;
2896:     s1 = x[idt];  s2 = x[1+idt];
2897:     while (nz--) {
2898:       idx   = 2*(*vi++);
2899:       x1    = x[idx];   x2 = x[1+idx];
2900:       s1 -= v[0]*x1 + v[2]*x2;
2901:       s2 -= v[1]*x1 + v[3]*x2;
2902:       v    += 4;
2903:     }
2904:     v        = aa +  4*diag[i];
2905:     x[idt]   = v[0]*s1 + v[2]*s2;
2906:     x[1+idt] = v[1]*s1 + v[3]*s2;
2907:   }

2909:   VecRestoreArray(bb,&b);
2910:   VecRestoreArray(xx,&x);
2911:   PetscLogFlops(2*4*(a->nz) - 2*A->n);
2912:   return(0);
2913: }

2917: int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2918: {
2919:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
2920:   IS              iscol=a->col,isrow=a->row;
2921:   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2922:   int             *diag = a->diag;
2923:   MatScalar       *aa=a->a,*v;
2924:   PetscScalar     *x,*b,s1,*t;

2927:   if (!n) return(0);

2929:   VecGetArray(bb,&b);
2930:   VecGetArray(xx,&x);
2931:   t  = a->solve_work;

2933:   ISGetIndices(isrow,&rout); r = rout;
2934:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2936:   /* forward solve the lower triangular */
2937:   t[0] = b[*r++];
2938:   for (i=1; i<n; i++) {
2939:     v     = aa + ai[i];
2940:     vi    = aj + ai[i];
2941:     nz    = diag[i] - ai[i];
2942:     s1  = b[*r++];
2943:     while (nz--) {
2944:       s1 -= (*v++)*t[*vi++];
2945:     }
2946:     t[i] = s1;
2947:   }
2948:   /* backward solve the upper triangular */
2949:   for (i=n-1; i>=0; i--){
2950:     v    = aa + diag[i] + 1;
2951:     vi   = aj + diag[i] + 1;
2952:     nz   = ai[i+1] - diag[i] - 1;
2953:     s1 = t[i];
2954:     while (nz--) {
2955:       s1 -= (*v++)*t[*vi++];
2956:     }
2957:     x[*c--] = t[i] = aa[diag[i]]*s1;
2958:   }

2960:   ISRestoreIndices(isrow,&rout);
2961:   ISRestoreIndices(iscol,&cout);
2962:   VecRestoreArray(bb,&b);
2963:   VecRestoreArray(xx,&x);
2964:   PetscLogFlops(2*1*(a->nz) - A->n);
2965:   return(0);
2966: }
2967: /*
2968:       Special case where the matrix was ILU(0) factored in the natural
2969:    ordering. This eliminates the need for the column and row permutation.
2970: */
2973: int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2974: {
2975:   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2976:   int             n=a->mbs,*ai=a->i,*aj=a->j;
2977:   int             ierr,*diag = a->diag;
2978:   MatScalar       *aa=a->a;
2979:   PetscScalar     *x,*b;
2980:   PetscScalar     s1,x1;
2981:   MatScalar       *v;
2982:   int             jdx,idt,idx,nz,*vi,i;

2985:   VecGetArray(bb,&b);
2986:   VecGetArray(xx,&x);

2988:   /* forward solve the lower triangular */
2989:   idx    = 0;
2990:   x[0]   = b[0];
2991:   for (i=1; i<n; i++) {
2992:     v     =  aa      + ai[i];
2993:     vi    =  aj      + ai[i];
2994:     nz    =  diag[i] - ai[i];
2995:     idx   +=  1;
2996:     s1  =  b[idx];
2997:     while (nz--) {
2998:       jdx   = *vi++;
2999:       x1    = x[jdx];
3000:       s1 -= v[0]*x1;
3001:       v    += 1;
3002:     }
3003:     x[idx]   = s1;
3004:   }
3005:   /* backward solve the upper triangular */
3006:   for (i=n-1; i>=0; i--){
3007:     v    = aa + diag[i] + 1;
3008:     vi   = aj + diag[i] + 1;
3009:     nz   = ai[i+1] - diag[i] - 1;
3010:     idt  = i;
3011:     s1 = x[idt];
3012:     while (nz--) {
3013:       idx   = *vi++;
3014:       x1    = x[idx];
3015:       s1 -= v[0]*x1;
3016:       v    += 1;
3017:     }
3018:     v        = aa +  diag[i];
3019:     x[idt]   = v[0]*s1;
3020:   }
3021:   VecRestoreArray(bb,&b);
3022:   VecRestoreArray(xx,&x);
3023:   PetscLogFlops(2*(a->nz) - A->n);
3024:   return(0);
3025: }

3027: /* ----------------------------------------------------------------*/
3028: /*
3029:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3030:    except that the data structure of Mat_SeqAIJ is slightly different.
3031:    Not a good example of code reuse.
3032: */
3033: EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);

3037: int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3038: {
3039:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3040:   IS          isicol;
3041:   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3042:   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3043:   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3044:   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
3045:   PetscTruth  col_identity,row_identity;
3046:   PetscReal   f;

3049:   f             = info->fill;
3050:   levels        = (int)info->levels;
3051:   diagonal_fill = (int)info->diagonal_fill;
3052:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3053:   ISIdentity(isrow,&row_identity);
3054:   ISIdentity(iscol,&col_identity);

3056:   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3057:     MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3058:     (*fact)->factor = FACTOR_LU;
3059:     b               = (Mat_SeqBAIJ*)(*fact)->data;
3060:     if (!b->diag) {
3061:       MatMarkDiagonal_SeqBAIJ(*fact);
3062:     }
3063:     MatMissingDiagonal_SeqBAIJ(*fact);
3064:     b->row        = isrow;
3065:     b->col        = iscol;
3066:     PetscObjectReference((PetscObject)isrow);
3067:     PetscObjectReference((PetscObject)iscol);
3068:     b->icol       = isicol;
3069:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3070:     PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);
3071:   } else { /* general case perform the symbolic factorization */
3072:     ISGetIndices(isrow,&r);
3073:     ISGetIndices(isicol,&ic);

3075:     /* get new row pointers */
3076:     PetscMalloc((n+1)*sizeof(int),&ainew);
3077:     ainew[0] = 0;
3078:     /* don't know how many column pointers are needed so estimate */
3079:     jmax = (int)(f*ai[n] + 1);
3080:     PetscMalloc((jmax)*sizeof(int),&ajnew);
3081:     /* ajfill is level of fill for each fill entry */
3082:     PetscMalloc((jmax)*sizeof(int),&ajfill);
3083:     /* fill is a linked list of nonzeros in active row */
3084:     PetscMalloc((n+1)*sizeof(int),&fill);
3085:     /* im is level for each filled value */
3086:     PetscMalloc((n+1)*sizeof(int),&im);
3087:     /* dloc is location of diagonal in factor */
3088:     PetscMalloc((n+1)*sizeof(int),&dloc);
3089:     dloc[0]  = 0;
3090:     for (prow=0; prow<n; prow++) {

3092:       /* copy prow into linked list */
3093:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3094:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3095:       xi         = aj + ai[r[prow]];
3096:       fill[n]    = n;
3097:       fill[prow] = -1; /* marker for diagonal entry */
3098:       while (nz--) {
3099:         fm  = n;
3100:         idx = ic[*xi++];
3101:         do {
3102:           m  = fm;
3103:           fm = fill[m];
3104:         } while (fm < idx);
3105:         fill[m]   = idx;
3106:         fill[idx] = fm;
3107:         im[idx]   = 0;
3108:       }

3110:       /* make sure diagonal entry is included */
3111:       if (diagonal_fill && fill[prow] == -1) {
3112:         fm = n;
3113:         while (fill[fm] < prow) fm = fill[fm];
3114:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
3115:         fill[fm]   = prow;
3116:         im[prow]   = 0;
3117:         nzf++;
3118:         dcount++;
3119:       }

3121:       nzi = 0;
3122:       row = fill[n];
3123:       while (row < prow) {
3124:         incrlev = im[row] + 1;
3125:         nz      = dloc[row];
3126:         xi      = ajnew  + ainew[row] + nz + 1;
3127:         flev    = ajfill + ainew[row] + nz + 1;
3128:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
3129:         fm      = row;
3130:         while (nnz-- > 0) {
3131:           idx = *xi++;
3132:           if (*flev + incrlev > levels) {
3133:             flev++;
3134:             continue;
3135:           }
3136:           do {
3137:             m  = fm;
3138:             fm = fill[m];
3139:           } while (fm < idx);
3140:           if (fm != idx) {
3141:             im[idx]   = *flev + incrlev;
3142:             fill[m]   = idx;
3143:             fill[idx] = fm;
3144:             fm        = idx;
3145:             nzf++;
3146:           } else {
3147:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3148:           }
3149:           flev++;
3150:         }
3151:         row = fill[row];
3152:         nzi++;
3153:       }
3154:       /* copy new filled row into permanent storage */
3155:       ainew[prow+1] = ainew[prow] + nzf;
3156:       if (ainew[prow+1] > jmax) {

3158:         /* estimate how much additional space we will need */
3159:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3160:         /* just double the memory each time */
3161:         int maxadd = jmax;
3162:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3163:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3164:         jmax += maxadd;

3166:         /* allocate a longer ajnew and ajfill */
3167:         PetscMalloc(jmax*sizeof(int),&xi);
3168:         PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
3169:         PetscFree(ajnew);
3170:         ajnew = xi;
3171:         PetscMalloc(jmax*sizeof(int),&xi);
3172:         PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
3173:         PetscFree(ajfill);
3174:         ajfill = xi;
3175:         reallocate++; /* count how many reallocations are needed */
3176:       }
3177:       xi          = ajnew + ainew[prow];
3178:       flev        = ajfill + ainew[prow];
3179:       dloc[prow]  = nzi;
3180:       fm          = fill[n];
3181:       while (nzf--) {
3182:         *xi++   = fm;
3183:         *flev++ = im[fm];
3184:         fm      = fill[fm];
3185:       }
3186:       /* make sure row has diagonal entry */
3187:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3188:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3189:     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3190:       }
3191:     }
3192:     PetscFree(ajfill);
3193:     ISRestoreIndices(isrow,&r);
3194:     ISRestoreIndices(isicol,&ic);
3195:     PetscFree(fill);
3196:     PetscFree(im);

3198:     {
3199:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3200:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3201:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3202:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3203:       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3204:       if (diagonal_fill) {
3205:         PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3206:       }
3207:     }

3209:     /* put together the new matrix */
3210:     MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);
3211:     PetscLogObjectParent(*fact,isicol);
3212:     b = (Mat_SeqBAIJ*)(*fact)->data;
3213:     PetscFree(b->imax);
3214:     b->singlemalloc = PETSC_FALSE;
3215:     len = bs2*ainew[n]*sizeof(MatScalar);
3216:     /* the next line frees the default space generated by the Create() */
3217:     PetscFree(b->a);
3218:     PetscFree(b->ilen);
3219:     PetscMalloc(len,&b->a);
3220:     b->j          = ajnew;
3221:     b->i          = ainew;
3222:     for (i=0; i<n; i++) dloc[i] += ainew[i];
3223:     b->diag       = dloc;
3224:     b->ilen       = 0;
3225:     b->imax       = 0;
3226:     b->row        = isrow;
3227:     b->col        = iscol;
3228:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3229:     PetscObjectReference((PetscObject)isrow);
3230:     PetscObjectReference((PetscObject)iscol);
3231:     b->icol       = isicol;
3232:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3233:     /* In b structure:  Free imax, ilen, old a, old j.  
3234:        Allocate dloc, solve_work, new a, new j */
3235:     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
3236:     b->maxnz          = b->nz = ainew[n];
3237:     (*fact)->factor   = FACTOR_LU;

3239:     (*fact)->info.factor_mallocs    = reallocate;
3240:     (*fact)->info.fill_ratio_given  = f;
3241:     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3242:   }

3244:   if (row_identity && col_identity) {
3245:     MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3246:   }
3247:   return(0);
3248: }

3252: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3253: {
3254:   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3255:   /* int i,*AJ=a->j,nz=a->nz; */
3257:   /* Undo Column scaling */
3258: /*    while (nz--) { */
3259: /*      AJ[i] = AJ[i]/4; */
3260: /*    } */
3261:   /* This should really invoke a push/pop logic, but we don't have that yet. */
3262:   A->ops->setunfactored = PETSC_NULL;
3263:   return(0);
3264: }

3268: int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3269: {
3270:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3271:   int *AJ=a->j,nz=a->nz;
3272:   unsigned short *aj=(unsigned short *)AJ;
3274:   /* Is this really necessary? */
3275:   while (nz--) {
3276:     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3277:   }
3278:   A->ops->setunfactored = PETSC_NULL;
3279:   return(0);
3280: }

3284: int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3285: {
3286:   /*
3287:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3288:       with natural ordering
3289:   */
3290:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;

3293:   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3294:   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3295:   switch (a->bs) {
3296:   case 1:
3297:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3298:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3299:     break;
3300:   case 2:
3301:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3302:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3303:     break;
3304:   case 3:
3305:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3306:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3307:     break;
3308:   case 4:
3309: #if defined(PETSC_USE_MAT_SINGLE)
3310:     {
3311:       PetscTruth  sse_enabled_local;
3312:       int         ierr;
3313:       PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3314:       if (sse_enabled_local) {
3315: #  if defined(PETSC_HAVE_SSE)
3316:         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3317:         if (n==(unsigned short)n) {
3318:           unsigned short *aj=(unsigned short *)AJ;
3319:           for (i=0;i<nz;i++) {
3320:             aj[i] = (unsigned short)AJ[i];
3321:           }
3322:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3323:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3324:           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3325:         } else {
3326:         /* Scale the column indices for easier indexing in MatSolve. */
3327: /*            for (i=0;i<nz;i++) { */
3328: /*              AJ[i] = AJ[i]*4; */
3329: /*            } */
3330:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3331:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3332:           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3333:         }
3334: #  else
3335:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3336:         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3337: #  endif
3338:       } else {
3339:         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3340:         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3341:       }
3342:     }
3343: #else
3344:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3345:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3346: #endif
3347:     break;
3348:   case 5:
3349:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3350:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3351:     break;
3352:   case 6:
3353:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3354:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3355:     break;
3356:   case 7:
3357:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3358:     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3359:     break;
3360:   }
3361:   return(0);
3362: }

3366: int MatSeqBAIJ_UpdateSolvers(Mat A)
3367: {
3368:   /*
3369:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3370:       with natural ordering
3371:   */
3372:   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3373:   IS          row = a->row, col = a->col;
3374:   PetscTruth  row_identity, col_identity;
3375:   PetscTruth  use_natural;
3376:   int         ierr;


3380:   use_natural = PETSC_FALSE;

3382:   ISIdentity(row,&row_identity);
3383:   ISIdentity(col,&col_identity);

3385:   if (row_identity && col_identity) {
3386:     use_natural = PETSC_TRUE;
3387:   } else {
3388:     use_natural = PETSC_FALSE;
3389:   }
3390:   switch (a->bs) {
3391:   case 1:
3392:     if (use_natural) {
3393:       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3394:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3395:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3396:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3397:     } else {
3398:       A->ops->solve           = MatSolve_SeqBAIJ_1;
3399:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3400:     }
3401:     break;
3402:   case 2:
3403:     if (use_natural) {
3404:       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3405:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3406:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3407:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3408:     } else {
3409:       A->ops->solve           = MatSolve_SeqBAIJ_2;
3410:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3411:     }
3412:     break;
3413:   case 3:
3414:     if (use_natural) {
3415:       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3416:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3417:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3418:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3419:     } else {
3420:       A->ops->solve           = MatSolve_SeqBAIJ_3;
3421:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3422:     }
3423:     break;
3424:   case 4:
3425:     {
3426:       PetscTruth sse_enabled_local;
3427:       PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3428:       if (use_natural) {
3429: #if defined(PETSC_USE_MAT_SINGLE)
3430:         if (sse_enabled_local) { /* Natural + Single + SSE */
3431: #  if defined(PETSC_HAVE_SSE)
3432:           int n=a->mbs;
3433:           if (n==(unsigned short)n) {
3434:             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3435:             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3436:           } else {
3437:             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3438:             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3439:           }
3440: #  else
3441:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3442:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3443: #  endif
3444:         } else { /* Natural + Single */
3445:           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3446:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3447:         }
3448: #else
3449:         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3450:         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3451: #endif
3452:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3453:         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3454:       } else { /* Arbitrary ordering */
3455: #if defined(PETSC_USE_MAT_SINGLE)
3456:         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3457: #  if defined(PETSC_HAVE_SSE)
3458:           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3459:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3460: #  else
3461:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3462:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3463: #  endif
3464:         } else { /* Arbitrary + Single */
3465:           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3466:           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3467:         }
3468: #else
3469:         A->ops->solve           = MatSolve_SeqBAIJ_4;
3470: #endif
3471:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3472:       }
3473:     }
3474:     break;
3475:   case 5:
3476:     if (use_natural) {
3477:       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3478:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3479:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3480:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3481:     } else {
3482:       A->ops->solve           = MatSolve_SeqBAIJ_5;
3483:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3484:     }
3485:     break;
3486:   case 6:
3487:     if (use_natural) {
3488:       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3489:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3490:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3491:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3492:     } else {
3493:       A->ops->solve           = MatSolve_SeqBAIJ_6;
3494:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3495:     }
3496:     break;
3497:   case 7:
3498:     if (use_natural) {
3499:       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3500:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3501:       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3502:       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3503:     } else {
3504:       A->ops->solve           = MatSolve_SeqBAIJ_7;
3505:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3506:     }
3507:     break;
3508:   default:
3509:     A->ops->solve             = MatSolve_SeqBAIJ_N;
3510:     break;
3511:   }
3512:   return(0);
3513: }

3517: int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {

3521:   MatSeqBAIJ_UpdateSolvers(A);
3522:   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3523:     (*A->ops->solve)(A,x,y);
3524:   } else {
3525:     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3526:   }
3527:   return(0);
3528: }

3532: int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {

3536:   MatSeqBAIJ_UpdateSolvers(A);
3537:   (*A->ops->solvetranspose)(A,x,y);
3538:   return(0);
3539: }