Actual source code: baijfact2.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for BAIJ format. 
  5: */

 7:  #include src/mat/impls/baij/seq/baij.h
 8:  #include src/inline/ilu.h
 9:  #include src/inline/dot.h

 13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
 14: {
 15:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
 17:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
 18:   PetscInt       *diag = a->diag;
 19:   MatScalar      *aa=a->a,*v;
 20:   PetscScalar    s1,*x,*b;

 23:   VecCopy(bb,xx);
 24:   VecGetArray(bb,&b);
 25:   VecGetArray(xx,&x);
 26: 
 27:   /* forward solve the U^T */
 28:   for (i=0; i<n; i++) {

 30:     v     = aa + diag[i];
 31:     /* multiply by the inverse of the block diagonal */
 32:     s1    = (*v++)*x[i];
 33:     vi    = aj + diag[i] + 1;
 34:     nz    = ai[i+1] - diag[i] - 1;
 35:     while (nz--) {
 36:       x[*vi++]  -= (*v++)*s1;
 37:     }
 38:     x[i]   = s1;
 39:   }
 40:   /* backward solve the L^T */
 41:   for (i=n-1; i>=0; i--){
 42:     v    = aa + diag[i] - 1;
 43:     vi   = aj + diag[i] - 1;
 44:     nz   = diag[i] - ai[i];
 45:     s1   = x[i];
 46:     while (nz--) {
 47:       x[*vi--]   -=  (*v--)*s1;
 48:     }
 49:   }
 50:   VecRestoreArray(bb,&b);
 51:   VecRestoreArray(xx,&x);
 52:   PetscLogFlops(2*(a->nz) - A->cmap.n);
 53:   return(0);
 54: }

 58: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
 59: {
 60:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
 62:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
 63:   PetscInt       *diag = a->diag,oidx;
 64:   MatScalar      *aa=a->a,*v;
 65:   PetscScalar    s1,s2,x1,x2;
 66:   PetscScalar    *x,*b;

 69:   VecCopy(bb,xx);
 70:   VecGetArray(bb,&b);
 71:   VecGetArray(xx,&x);

 73:   /* forward solve the U^T */
 74:   idx = 0;
 75:   for (i=0; i<n; i++) {

 77:     v     = aa + 4*diag[i];
 78:     /* multiply by the inverse of the block diagonal */
 79:     x1 = x[idx];   x2 = x[1+idx];
 80:     s1 = v[0]*x1  +  v[1]*x2;
 81:     s2 = v[2]*x1  +  v[3]*x2;
 82:     v += 4;

 84:     vi    = aj + diag[i] + 1;
 85:     nz    = ai[i+1] - diag[i] - 1;
 86:     while (nz--) {
 87:       oidx = 2*(*vi++);
 88:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 89:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 90:       v  += 4;
 91:     }
 92:     x[idx]   = s1;x[1+idx] = s2;
 93:     idx += 2;
 94:   }
 95:   /* backward solve the L^T */
 96:   for (i=n-1; i>=0; i--){
 97:     v    = aa + 4*diag[i] - 4;
 98:     vi   = aj + diag[i] - 1;
 99:     nz   = diag[i] - ai[i];
100:     idt  = 2*i;
101:     s1   = x[idt];  s2 = x[1+idt];
102:     while (nz--) {
103:       idx   = 2*(*vi--);
104:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106:       v -= 4;
107:     }
108:   }
109:   VecRestoreArray(bb,&b);
110:   VecRestoreArray(xx,&x);
111:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
112:   return(0);
113: }

117: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118: {
119:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
121:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122:   PetscInt       *diag = a->diag,oidx;
123:   MatScalar      *aa=a->a,*v;
124:   PetscScalar    s1,s2,s3,x1,x2,x3;
125:   PetscScalar    *x,*b;

128:   VecCopy(bb,xx);
129:   VecGetArray(bb,&b);
130:   VecGetArray(xx,&x);

132:   /* forward solve the U^T */
133:   idx = 0;
134:   for (i=0; i<n; i++) {

136:     v     = aa + 9*diag[i];
137:     /* multiply by the inverse of the block diagonal */
138:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142:     v += 9;

144:     vi    = aj + diag[i] + 1;
145:     nz    = ai[i+1] - diag[i] - 1;
146:     while (nz--) {
147:       oidx = 3*(*vi++);
148:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151:       v  += 9;
152:     }
153:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154:     idx += 3;
155:   }
156:   /* backward solve the L^T */
157:   for (i=n-1; i>=0; i--){
158:     v    = aa + 9*diag[i] - 9;
159:     vi   = aj + diag[i] - 1;
160:     nz   = diag[i] - ai[i];
161:     idt  = 3*i;
162:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163:     while (nz--) {
164:       idx   = 3*(*vi--);
165:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168:       v -= 9;
169:     }
170:   }
171:   VecRestoreArray(bb,&b);
172:   VecRestoreArray(xx,&x);
173:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
174:   return(0);
175: }

179: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180: {
181:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
183:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184:   PetscInt       *diag = a->diag,oidx;
185:   MatScalar      *aa=a->a,*v;
186:   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187:   PetscScalar    *x,*b;

190:   VecCopy(bb,xx);
191:   VecGetArray(bb,&b);
192:   VecGetArray(xx,&x);

194:   /* forward solve the U^T */
195:   idx = 0;
196:   for (i=0; i<n; i++) {

198:     v     = aa + 16*diag[i];
199:     /* multiply by the inverse of the block diagonal */
200:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205:     v += 16;

207:     vi    = aj + diag[i] + 1;
208:     nz    = ai[i+1] - diag[i] - 1;
209:     while (nz--) {
210:       oidx = 4*(*vi++);
211:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215:       v  += 16;
216:     }
217:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218:     idx += 4;
219:   }
220:   /* backward solve the L^T */
221:   for (i=n-1; i>=0; i--){
222:     v    = aa + 16*diag[i] - 16;
223:     vi   = aj + diag[i] - 1;
224:     nz   = diag[i] - ai[i];
225:     idt  = 4*i;
226:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227:     while (nz--) {
228:       idx   = 4*(*vi--);
229:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233:       v -= 16;
234:     }
235:   }
236:   VecRestoreArray(bb,&b);
237:   VecRestoreArray(xx,&x);
238:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
239:   return(0);
240: }

244: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245: {
246:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
248:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249:   PetscInt       *diag = a->diag,oidx;
250:   MatScalar      *aa=a->a,*v;
251:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252:   PetscScalar    *x,*b;

255:   VecCopy(bb,xx);
256:   VecGetArray(bb,&b);
257:   VecGetArray(xx,&x);

259:   /* forward solve the U^T */
260:   idx = 0;
261:   for (i=0; i<n; i++) {

263:     v     = aa + 25*diag[i];
264:     /* multiply by the inverse of the block diagonal */
265:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271:     v += 25;

273:     vi    = aj + diag[i] + 1;
274:     nz    = ai[i+1] - diag[i] - 1;
275:     while (nz--) {
276:       oidx = 5*(*vi++);
277:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282:       v  += 25;
283:     }
284:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285:     idx += 5;
286:   }
287:   /* backward solve the L^T */
288:   for (i=n-1; i>=0; i--){
289:     v    = aa + 25*diag[i] - 25;
290:     vi   = aj + diag[i] - 1;
291:     nz   = diag[i] - ai[i];
292:     idt  = 5*i;
293:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294:     while (nz--) {
295:       idx   = 5*(*vi--);
296:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301:       v -= 25;
302:     }
303:   }
304:   VecRestoreArray(bb,&b);
305:   VecRestoreArray(xx,&x);
306:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
307:   return(0);
308: }

312: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313: {
314:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
316:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317:   PetscInt       *diag = a->diag,oidx;
318:   MatScalar      *aa=a->a,*v;
319:   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320:   PetscScalar    *x,*b;

323:   VecCopy(bb,xx);
324:   VecGetArray(bb,&b);
325:   VecGetArray(xx,&x);

327:   /* forward solve the U^T */
328:   idx = 0;
329:   for (i=0; i<n; i++) {

331:     v     = aa + 36*diag[i];
332:     /* multiply by the inverse of the block diagonal */
333:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334:     x6    = x[5+idx];
335:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341:     v += 36;

343:     vi    = aj + diag[i] + 1;
344:     nz    = ai[i+1] - diag[i] - 1;
345:     while (nz--) {
346:       oidx = 6*(*vi++);
347:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353:       v  += 36;
354:     }
355:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356:     x[5+idx] = s6;
357:     idx += 6;
358:   }
359:   /* backward solve the L^T */
360:   for (i=n-1; i>=0; i--){
361:     v    = aa + 36*diag[i] - 36;
362:     vi   = aj + diag[i] - 1;
363:     nz   = diag[i] - ai[i];
364:     idt  = 6*i;
365:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366:     s6 = x[5+idt];
367:     while (nz--) {
368:       idx   = 6*(*vi--);
369:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375:       v -= 36;
376:     }
377:   }
378:   VecRestoreArray(bb,&b);
379:   VecRestoreArray(xx,&x);
380:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
381:   return(0);
382: }

386: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387: {
388:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
390:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391:   PetscInt       *diag = a->diag,oidx;
392:   MatScalar      *aa=a->a,*v;
393:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394:   PetscScalar    *x,*b;

397:   VecCopy(bb,xx);
398:   VecGetArray(bb,&b);
399:   VecGetArray(xx,&x);

401:   /* forward solve the U^T */
402:   idx = 0;
403:   for (i=0; i<n; i++) {

405:     v     = aa + 49*diag[i];
406:     /* multiply by the inverse of the block diagonal */
407:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408:     x6    = x[5+idx]; x7 = x[6+idx];
409:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416:     v += 49;

418:     vi    = aj + diag[i] + 1;
419:     nz    = ai[i+1] - diag[i] - 1;
420:     while (nz--) {
421:       oidx = 7*(*vi++);
422:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429:       v  += 49;
430:     }
431:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432:     x[5+idx] = s6;x[6+idx] = s7;
433:     idx += 7;
434:   }
435:   /* backward solve the L^T */
436:   for (i=n-1; i>=0; i--){
437:     v    = aa + 49*diag[i] - 49;
438:     vi   = aj + diag[i] - 1;
439:     nz   = diag[i] - ai[i];
440:     idt  = 7*i;
441:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442:     s6 = x[5+idt];s7 = x[6+idt];
443:     while (nz--) {
444:       idx   = 7*(*vi--);
445:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452:       v -= 49;
453:     }
454:   }
455:   VecRestoreArray(bb,&b);
456:   VecRestoreArray(xx,&x);
457:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
458:   return(0);
459: }

461: /*---------------------------------------------------------------------------------------------*/
464: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465: {
466:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467:   IS             iscol=a->col,isrow=a->row;
469:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
470:   PetscInt       *diag = a->diag;
471:   MatScalar      *aa=a->a,*v;
472:   PetscScalar    s1,*x,*b,*t;

475:   VecGetArray(bb,&b);
476:   VecGetArray(xx,&x);
477:   t  = a->solve_work;

479:   ISGetIndices(isrow,&rout); r = rout;
480:   ISGetIndices(iscol,&cout); c = cout;

482:   /* copy the b into temp work space according to permutation */
483:   for (i=0; i<n; i++) {
484:     t[i] = b[c[i]];
485:   }

487:   /* forward solve the U^T */
488:   for (i=0; i<n; i++) {

490:     v     = aa + diag[i];
491:     /* multiply by the inverse of the block diagonal */
492:     s1    = (*v++)*t[i];
493:     vi    = aj + diag[i] + 1;
494:     nz    = ai[i+1] - diag[i] - 1;
495:     while (nz--) {
496:       t[*vi++]  -= (*v++)*s1;
497:     }
498:     t[i]   = s1;
499:   }
500:   /* backward solve the L^T */
501:   for (i=n-1; i>=0; i--){
502:     v    = aa + diag[i] - 1;
503:     vi   = aj + diag[i] - 1;
504:     nz   = diag[i] - ai[i];
505:     s1   = t[i];
506:     while (nz--) {
507:       t[*vi--]   -=  (*v--)*s1;
508:     }
509:   }

511:   /* copy t into x according to permutation */
512:   for (i=0; i<n; i++) {
513:     x[r[i]]   = t[i];
514:   }

516:   ISRestoreIndices(isrow,&rout);
517:   ISRestoreIndices(iscol,&cout);
518:   VecRestoreArray(bb,&b);
519:   VecRestoreArray(xx,&x);
520:   PetscLogFlops(2*(a->nz) - A->cmap.n);
521:   return(0);
522: }

526: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527: {
528:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
529:   IS             iscol=a->col,isrow=a->row;
531:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
532:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
533:   MatScalar      *aa=a->a,*v;
534:   PetscScalar    s1,s2,x1,x2;
535:   PetscScalar    *x,*b,*t;

538:   VecGetArray(bb,&b);
539:   VecGetArray(xx,&x);
540:   t  = a->solve_work;

542:   ISGetIndices(isrow,&rout); r = rout;
543:   ISGetIndices(iscol,&cout); c = cout;

545:   /* copy the b into temp work space according to permutation */
546:   ii = 0;
547:   for (i=0; i<n; i++) {
548:     ic      = 2*c[i];
549:     t[ii]   = b[ic];
550:     t[ii+1] = b[ic+1];
551:     ii += 2;
552:   }

554:   /* forward solve the U^T */
555:   idx = 0;
556:   for (i=0; i<n; i++) {

558:     v     = aa + 4*diag[i];
559:     /* multiply by the inverse of the block diagonal */
560:     x1    = t[idx];   x2 = t[1+idx];
561:     s1 = v[0]*x1  +  v[1]*x2;
562:     s2 = v[2]*x1  +  v[3]*x2;
563:     v += 4;

565:     vi    = aj + diag[i] + 1;
566:     nz    = ai[i+1] - diag[i] - 1;
567:     while (nz--) {
568:       oidx = 2*(*vi++);
569:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
570:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
571:       v  += 4;
572:     }
573:     t[idx]   = s1;t[1+idx] = s2;
574:     idx += 2;
575:   }
576:   /* backward solve the L^T */
577:   for (i=n-1; i>=0; i--){
578:     v    = aa + 4*diag[i] - 4;
579:     vi   = aj + diag[i] - 1;
580:     nz   = diag[i] - ai[i];
581:     idt  = 2*i;
582:     s1 = t[idt];  s2 = t[1+idt];
583:     while (nz--) {
584:       idx   = 2*(*vi--);
585:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
586:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
587:       v -= 4;
588:     }
589:   }

591:   /* copy t into x according to permutation */
592:   ii = 0;
593:   for (i=0; i<n; i++) {
594:     ir      = 2*r[i];
595:     x[ir]   = t[ii];
596:     x[ir+1] = t[ii+1];
597:     ii += 2;
598:   }

600:   ISRestoreIndices(isrow,&rout);
601:   ISRestoreIndices(iscol,&cout);
602:   VecRestoreArray(bb,&b);
603:   VecRestoreArray(xx,&x);
604:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
605:   return(0);
606: }

610: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
611: {
612:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
613:   IS             iscol=a->col,isrow=a->row;
615:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
616:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
617:   MatScalar      *aa=a->a,*v;
618:   PetscScalar    s1,s2,s3,x1,x2,x3;
619:   PetscScalar    *x,*b,*t;

622:   VecGetArray(bb,&b);
623:   VecGetArray(xx,&x);
624:   t  = a->solve_work;

626:   ISGetIndices(isrow,&rout); r = rout;
627:   ISGetIndices(iscol,&cout); c = cout;

629:   /* copy the b into temp work space according to permutation */
630:   ii = 0;
631:   for (i=0; i<n; i++) {
632:     ic      = 3*c[i];
633:     t[ii]   = b[ic];
634:     t[ii+1] = b[ic+1];
635:     t[ii+2] = b[ic+2];
636:     ii += 3;
637:   }

639:   /* forward solve the U^T */
640:   idx = 0;
641:   for (i=0; i<n; i++) {

643:     v     = aa + 9*diag[i];
644:     /* multiply by the inverse of the block diagonal */
645:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
646:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
647:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
648:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
649:     v += 9;

651:     vi    = aj + diag[i] + 1;
652:     nz    = ai[i+1] - diag[i] - 1;
653:     while (nz--) {
654:       oidx = 3*(*vi++);
655:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
656:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
657:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658:       v  += 9;
659:     }
660:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
661:     idx += 3;
662:   }
663:   /* backward solve the L^T */
664:   for (i=n-1; i>=0; i--){
665:     v    = aa + 9*diag[i] - 9;
666:     vi   = aj + diag[i] - 1;
667:     nz   = diag[i] - ai[i];
668:     idt  = 3*i;
669:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
670:     while (nz--) {
671:       idx   = 3*(*vi--);
672:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
673:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
674:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
675:       v -= 9;
676:     }
677:   }

679:   /* copy t into x according to permutation */
680:   ii = 0;
681:   for (i=0; i<n; i++) {
682:     ir      = 3*r[i];
683:     x[ir]   = t[ii];
684:     x[ir+1] = t[ii+1];
685:     x[ir+2] = t[ii+2];
686:     ii += 3;
687:   }

689:   ISRestoreIndices(isrow,&rout);
690:   ISRestoreIndices(iscol,&cout);
691:   VecRestoreArray(bb,&b);
692:   VecRestoreArray(xx,&x);
693:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
694:   return(0);
695: }

699: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
700: {
701:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
702:   IS             iscol=a->col,isrow=a->row;
704:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
705:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
706:   MatScalar      *aa=a->a,*v;
707:   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
708:   PetscScalar    *x,*b,*t;

711:   VecGetArray(bb,&b);
712:   VecGetArray(xx,&x);
713:   t  = a->solve_work;

715:   ISGetIndices(isrow,&rout); r = rout;
716:   ISGetIndices(iscol,&cout); c = cout;

718:   /* copy the b into temp work space according to permutation */
719:   ii = 0;
720:   for (i=0; i<n; i++) {
721:     ic      = 4*c[i];
722:     t[ii]   = b[ic];
723:     t[ii+1] = b[ic+1];
724:     t[ii+2] = b[ic+2];
725:     t[ii+3] = b[ic+3];
726:     ii += 4;
727:   }

729:   /* forward solve the U^T */
730:   idx = 0;
731:   for (i=0; i<n; i++) {

733:     v     = aa + 16*diag[i];
734:     /* multiply by the inverse of the block diagonal */
735:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
736:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
737:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
738:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
739:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
740:     v += 16;

742:     vi    = aj + diag[i] + 1;
743:     nz    = ai[i+1] - diag[i] - 1;
744:     while (nz--) {
745:       oidx = 4*(*vi++);
746:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750:       v  += 16;
751:     }
752:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
753:     idx += 4;
754:   }
755:   /* backward solve the L^T */
756:   for (i=n-1; i>=0; i--){
757:     v    = aa + 16*diag[i] - 16;
758:     vi   = aj + diag[i] - 1;
759:     nz   = diag[i] - ai[i];
760:     idt  = 4*i;
761:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
762:     while (nz--) {
763:       idx   = 4*(*vi--);
764:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
765:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
766:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
767:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
768:       v -= 16;
769:     }
770:   }

772:   /* copy t into x according to permutation */
773:   ii = 0;
774:   for (i=0; i<n; i++) {
775:     ir      = 4*r[i];
776:     x[ir]   = t[ii];
777:     x[ir+1] = t[ii+1];
778:     x[ir+2] = t[ii+2];
779:     x[ir+3] = t[ii+3];
780:     ii += 4;
781:   }

783:   ISRestoreIndices(isrow,&rout);
784:   ISRestoreIndices(iscol,&cout);
785:   VecRestoreArray(bb,&b);
786:   VecRestoreArray(xx,&x);
787:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
788:   return(0);
789: }

793: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
794: {
795:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796:   IS             iscol=a->col,isrow=a->row;
798:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
799:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
800:   MatScalar      *aa=a->a,*v;
801:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
802:   PetscScalar    *x,*b,*t;

805:   VecGetArray(bb,&b);
806:   VecGetArray(xx,&x);
807:   t  = a->solve_work;

809:   ISGetIndices(isrow,&rout); r = rout;
810:   ISGetIndices(iscol,&cout); c = cout;

812:   /* copy the b into temp work space according to permutation */
813:   ii = 0;
814:   for (i=0; i<n; i++) {
815:     ic      = 5*c[i];
816:     t[ii]   = b[ic];
817:     t[ii+1] = b[ic+1];
818:     t[ii+2] = b[ic+2];
819:     t[ii+3] = b[ic+3];
820:     t[ii+4] = b[ic+4];
821:     ii += 5;
822:   }

824:   /* forward solve the U^T */
825:   idx = 0;
826:   for (i=0; i<n; i++) {

828:     v     = aa + 25*diag[i];
829:     /* multiply by the inverse of the block diagonal */
830:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
831:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
832:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
833:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
834:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
835:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
836:     v += 25;

838:     vi    = aj + diag[i] + 1;
839:     nz    = ai[i+1] - diag[i] - 1;
840:     while (nz--) {
841:       oidx = 5*(*vi++);
842:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847:       v  += 25;
848:     }
849:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
850:     idx += 5;
851:   }
852:   /* backward solve the L^T */
853:   for (i=n-1; i>=0; i--){
854:     v    = aa + 25*diag[i] - 25;
855:     vi   = aj + diag[i] - 1;
856:     nz   = diag[i] - ai[i];
857:     idt  = 5*i;
858:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
859:     while (nz--) {
860:       idx   = 5*(*vi--);
861:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
862:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
863:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
864:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
865:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
866:       v -= 25;
867:     }
868:   }

870:   /* copy t into x according to permutation */
871:   ii = 0;
872:   for (i=0; i<n; i++) {
873:     ir      = 5*r[i];
874:     x[ir]   = t[ii];
875:     x[ir+1] = t[ii+1];
876:     x[ir+2] = t[ii+2];
877:     x[ir+3] = t[ii+3];
878:     x[ir+4] = t[ii+4];
879:     ii += 5;
880:   }

882:   ISRestoreIndices(isrow,&rout);
883:   ISRestoreIndices(iscol,&cout);
884:   VecRestoreArray(bb,&b);
885:   VecRestoreArray(xx,&x);
886:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
887:   return(0);
888: }

892: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
893: {
894:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
895:   IS             iscol=a->col,isrow=a->row;
897:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
898:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
899:   MatScalar      *aa=a->a,*v;
900:   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
901:   PetscScalar    *x,*b,*t;

904:   VecGetArray(bb,&b);
905:   VecGetArray(xx,&x);
906:   t  = a->solve_work;

908:   ISGetIndices(isrow,&rout); r = rout;
909:   ISGetIndices(iscol,&cout); c = cout;

911:   /* copy the b into temp work space according to permutation */
912:   ii = 0;
913:   for (i=0; i<n; i++) {
914:     ic      = 6*c[i];
915:     t[ii]   = b[ic];
916:     t[ii+1] = b[ic+1];
917:     t[ii+2] = b[ic+2];
918:     t[ii+3] = b[ic+3];
919:     t[ii+4] = b[ic+4];
920:     t[ii+5] = b[ic+5];
921:     ii += 6;
922:   }

924:   /* forward solve the U^T */
925:   idx = 0;
926:   for (i=0; i<n; i++) {

928:     v     = aa + 36*diag[i];
929:     /* multiply by the inverse of the block diagonal */
930:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
931:     x6    = t[5+idx];
932:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
933:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
934:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
935:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
936:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
937:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
938:     v += 36;

940:     vi    = aj + diag[i] + 1;
941:     nz    = ai[i+1] - diag[i] - 1;
942:     while (nz--) {
943:       oidx = 6*(*vi++);
944:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
945:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
946:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
947:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
948:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
949:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
950:       v  += 36;
951:     }
952:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
953:     t[5+idx] = s6;
954:     idx += 6;
955:   }
956:   /* backward solve the L^T */
957:   for (i=n-1; i>=0; i--){
958:     v    = aa + 36*diag[i] - 36;
959:     vi   = aj + diag[i] - 1;
960:     nz   = diag[i] - ai[i];
961:     idt  = 6*i;
962:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
963:     s6 = t[5+idt];
964:     while (nz--) {
965:       idx   = 6*(*vi--);
966:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
967:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
968:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
969:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
970:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
971:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
972:       v -= 36;
973:     }
974:   }

976:   /* copy t into x according to permutation */
977:   ii = 0;
978:   for (i=0; i<n; i++) {
979:     ir      = 6*r[i];
980:     x[ir]   = t[ii];
981:     x[ir+1] = t[ii+1];
982:     x[ir+2] = t[ii+2];
983:     x[ir+3] = t[ii+3];
984:     x[ir+4] = t[ii+4];
985:     x[ir+5] = t[ii+5];
986:     ii += 6;
987:   }

989:   ISRestoreIndices(isrow,&rout);
990:   ISRestoreIndices(iscol,&cout);
991:   VecRestoreArray(bb,&b);
992:   VecRestoreArray(xx,&x);
993:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
994:   return(0);
995: }

999: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1000: {
1001:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1002:   IS             iscol=a->col,isrow=a->row;
1004:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1005:   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1006:   MatScalar      *aa=a->a,*v;
1007:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1008:   PetscScalar    *x,*b,*t;

1011:   VecGetArray(bb,&b);
1012:   VecGetArray(xx,&x);
1013:   t  = a->solve_work;

1015:   ISGetIndices(isrow,&rout); r = rout;
1016:   ISGetIndices(iscol,&cout); c = cout;

1018:   /* copy the b into temp work space according to permutation */
1019:   ii = 0;
1020:   for (i=0; i<n; i++) {
1021:     ic      = 7*c[i];
1022:     t[ii]   = b[ic];
1023:     t[ii+1] = b[ic+1];
1024:     t[ii+2] = b[ic+2];
1025:     t[ii+3] = b[ic+3];
1026:     t[ii+4] = b[ic+4];
1027:     t[ii+5] = b[ic+5];
1028:     t[ii+6] = b[ic+6];
1029:     ii += 7;
1030:   }

1032:   /* forward solve the U^T */
1033:   idx = 0;
1034:   for (i=0; i<n; i++) {

1036:     v     = aa + 49*diag[i];
1037:     /* multiply by the inverse of the block diagonal */
1038:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1039:     x6    = t[5+idx]; x7 = t[6+idx];
1040:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1041:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1042:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1043:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1044:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1045:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1046:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1047:     v += 49;

1049:     vi    = aj + diag[i] + 1;
1050:     nz    = ai[i+1] - diag[i] - 1;
1051:     while (nz--) {
1052:       oidx = 7*(*vi++);
1053:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1054:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1055:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1056:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1057:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1058:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1059:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1060:       v  += 49;
1061:     }
1062:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1063:     t[5+idx] = s6;t[6+idx] = s7;
1064:     idx += 7;
1065:   }
1066:   /* backward solve the L^T */
1067:   for (i=n-1; i>=0; i--){
1068:     v    = aa + 49*diag[i] - 49;
1069:     vi   = aj + diag[i] - 1;
1070:     nz   = diag[i] - ai[i];
1071:     idt  = 7*i;
1072:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1073:     s6 = t[5+idt];s7 = t[6+idt];
1074:     while (nz--) {
1075:       idx   = 7*(*vi--);
1076:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1077:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1078:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1079:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1080:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1081:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1082:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1083:       v -= 49;
1084:     }
1085:   }

1087:   /* copy t into x according to permutation */
1088:   ii = 0;
1089:   for (i=0; i<n; i++) {
1090:     ir      = 7*r[i];
1091:     x[ir]   = t[ii];
1092:     x[ir+1] = t[ii+1];
1093:     x[ir+2] = t[ii+2];
1094:     x[ir+3] = t[ii+3];
1095:     x[ir+4] = t[ii+4];
1096:     x[ir+5] = t[ii+5];
1097:     x[ir+6] = t[ii+6];
1098:     ii += 7;
1099:   }

1101:   ISRestoreIndices(isrow,&rout);
1102:   ISRestoreIndices(iscol,&cout);
1103:   VecRestoreArray(bb,&b);
1104:   VecRestoreArray(xx,&x);
1105:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
1106:   return(0);
1107: }

1109: /* ----------------------------------------------------------- */
1112: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1113: {
1114:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1115:   IS             iscol=a->col,isrow=a->row;
1117:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1118:   PetscInt       nz,bs=A->rmap.bs,bs2=a->bs2,*rout,*cout;
1119:   MatScalar      *aa=a->a,*v;
1120:   PetscScalar    *x,*b,*s,*t,*ls;

1123:   VecGetArray(bb,&b);
1124:   VecGetArray(xx,&x);
1125:   t  = a->solve_work;

1127:   ISGetIndices(isrow,&rout); r = rout;
1128:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1130:   /* forward solve the lower triangular */
1131:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1132:   for (i=1; i<n; i++) {
1133:     v   = aa + bs2*ai[i];
1134:     vi  = aj + ai[i];
1135:     nz  = a->diag[i] - ai[i];
1136:     s = t + bs*i;
1137:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1138:     while (nz--) {
1139:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1140:       v += bs2;
1141:     }
1142:   }
1143:   /* backward solve the upper triangular */
1144:   ls = a->solve_work + A->cmap.n;
1145:   for (i=n-1; i>=0; i--){
1146:     v   = aa + bs2*(a->diag[i] + 1);
1147:     vi  = aj + a->diag[i] + 1;
1148:     nz  = ai[i+1] - a->diag[i] - 1;
1149:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1150:     while (nz--) {
1151:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1152:       v += bs2;
1153:     }
1154:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1155:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1156:   }

1158:   ISRestoreIndices(isrow,&rout);
1159:   ISRestoreIndices(iscol,&cout);
1160:   VecRestoreArray(bb,&b);
1161:   VecRestoreArray(xx,&x);
1162:   PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap.bs*A->cmap.n);
1163:   return(0);
1164: }

1168: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1169: {
1170:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1171:   IS             iscol=a->col,isrow=a->row;
1173:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1174:   PetscInt       *diag = a->diag;
1175:   MatScalar      *aa=a->a,*v;
1176:   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1177:   PetscScalar    *x,*b,*t;

1180:   VecGetArray(bb,&b);
1181:   VecGetArray(xx,&x);
1182:   t  = a->solve_work;

1184:   ISGetIndices(isrow,&rout); r = rout;
1185:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1187:   /* forward solve the lower triangular */
1188:   idx    = 7*(*r++);
1189:   t[0] = b[idx];   t[1] = b[1+idx];
1190:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1191:   t[5] = b[5+idx]; t[6] = b[6+idx];

1193:   for (i=1; i<n; i++) {
1194:     v     = aa + 49*ai[i];
1195:     vi    = aj + ai[i];
1196:     nz    = diag[i] - ai[i];
1197:     idx   = 7*(*r++);
1198:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1199:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1200:     while (nz--) {
1201:       idx   = 7*(*vi++);
1202:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1203:       x4    = t[3+idx];x5 = t[4+idx];
1204:       x6    = t[5+idx];x7 = t[6+idx];
1205:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1206:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1207:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1208:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1209:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1210:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1211:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1212:       v += 49;
1213:     }
1214:     idx = 7*i;
1215:     t[idx]   = s1;t[1+idx] = s2;
1216:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1217:     t[5+idx] = s6;t[6+idx] = s7;
1218:   }
1219:   /* backward solve the upper triangular */
1220:   for (i=n-1; i>=0; i--){
1221:     v    = aa + 49*diag[i] + 49;
1222:     vi   = aj + diag[i] + 1;
1223:     nz   = ai[i+1] - diag[i] - 1;
1224:     idt  = 7*i;
1225:     s1 = t[idt];  s2 = t[1+idt];
1226:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1227:     s6 = t[5+idt];s7 = t[6+idt];
1228:     while (nz--) {
1229:       idx   = 7*(*vi++);
1230:       x1    = t[idx];   x2 = t[1+idx];
1231:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1232:       x6    = t[5+idx]; x7 = t[6+idx];
1233:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1234:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1235:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1236:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1237:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1238:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1239:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1240:       v += 49;
1241:     }
1242:     idc = 7*(*c--);
1243:     v   = aa + 49*diag[i];
1244:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1245:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1246:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1247:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1248:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1249:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1250:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1251:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1252:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1253:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1254:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1255:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1256:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1257:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1258:   }

1260:   ISRestoreIndices(isrow,&rout);
1261:   ISRestoreIndices(iscol,&cout);
1262:   VecRestoreArray(bb,&b);
1263:   VecRestoreArray(xx,&x);
1264:   PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
1265:   return(0);
1266: }

1270: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1271: {
1272:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1273:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1274:   PetscErrorCode    ierr;
1275:   PetscInt          *diag = a->diag,jdx;
1276:   const MatScalar   *aa=a->a,*v;
1277:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1278:   const PetscScalar *b;

1281:   VecGetArray(bb,(PetscScalar**)&b);
1282:   VecGetArray(xx,&x);
1283:   /* forward solve the lower triangular */
1284:   idx    = 0;
1285:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1286:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1287:   x[6] = b[6+idx];
1288:   for (i=1; i<n; i++) {
1289:     v     =  aa + 49*ai[i];
1290:     vi    =  aj + ai[i];
1291:     nz    =  diag[i] - ai[i];
1292:     idx   =  7*i;
1293:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1294:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1295:     s7  =  b[6+idx];
1296:     while (nz--) {
1297:       jdx   = 7*(*vi++);
1298:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1299:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1300:       x7    = x[6+jdx];
1301:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1302:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1303:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1304:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1305:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1306:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1307:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1308:       v += 49;
1309:      }
1310:     x[idx]   = s1;
1311:     x[1+idx] = s2;
1312:     x[2+idx] = s3;
1313:     x[3+idx] = s4;
1314:     x[4+idx] = s5;
1315:     x[5+idx] = s6;
1316:     x[6+idx] = s7;
1317:   }
1318:   /* backward solve the upper triangular */
1319:   for (i=n-1; i>=0; i--){
1320:     v    = aa + 49*diag[i] + 49;
1321:     vi   = aj + diag[i] + 1;
1322:     nz   = ai[i+1] - diag[i] - 1;
1323:     idt  = 7*i;
1324:     s1 = x[idt];   s2 = x[1+idt];
1325:     s3 = x[2+idt]; s4 = x[3+idt];
1326:     s5 = x[4+idt]; s6 = x[5+idt];
1327:     s7 = x[6+idt];
1328:     while (nz--) {
1329:       idx   = 7*(*vi++);
1330:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1331:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1332:       x7    = x[6+idx];
1333:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1334:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1335:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1336:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1337:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1338:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1339:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1340:       v += 49;
1341:     }
1342:     v        = aa + 49*diag[i];
1343:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1344:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1345:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1346:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1347:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1348:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1349:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1350:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1351:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1352:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1353:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1354:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1355:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1356:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1357:   }

1359:   VecRestoreArray(bb,(PetscScalar**)&b);
1360:   VecRestoreArray(xx,&x);
1361:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1362:   return(0);
1363: }

1367: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1368: {
1369:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1370:   IS                iscol=a->col,isrow=a->row;
1371:   PetscErrorCode    ierr;
1372:   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1373:   PetscInt          *diag = a->diag;
1374:   const MatScalar   *aa=a->a,*v;
1375:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1376:   const PetscScalar *b;
1378:   VecGetArray(bb,(PetscScalar**)&b);
1379:   VecGetArray(xx,&x);
1380:   t  = a->solve_work;

1382:   ISGetIndices(isrow,&rout); r = rout;
1383:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1385:   /* forward solve the lower triangular */
1386:   idx    = 6*(*r++);
1387:   t[0] = b[idx];   t[1] = b[1+idx];
1388:   t[2] = b[2+idx]; t[3] = b[3+idx];
1389:   t[4] = b[4+idx]; t[5] = b[5+idx];
1390:   for (i=1; i<n; i++) {
1391:     v     = aa + 36*ai[i];
1392:     vi    = aj + ai[i];
1393:     nz    = diag[i] - ai[i];
1394:     idx   = 6*(*r++);
1395:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1396:     s5  = b[4+idx]; s6 = b[5+idx];
1397:     while (nz--) {
1398:       idx   = 6*(*vi++);
1399:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1400:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1401:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1402:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1403:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1404:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1405:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1406:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1407:       v += 36;
1408:     }
1409:     idx = 6*i;
1410:     t[idx]   = s1;t[1+idx] = s2;
1411:     t[2+idx] = s3;t[3+idx] = s4;
1412:     t[4+idx] = s5;t[5+idx] = s6;
1413:   }
1414:   /* backward solve the upper triangular */
1415:   for (i=n-1; i>=0; i--){
1416:     v    = aa + 36*diag[i] + 36;
1417:     vi   = aj + diag[i] + 1;
1418:     nz   = ai[i+1] - diag[i] - 1;
1419:     idt  = 6*i;
1420:     s1 = t[idt];  s2 = t[1+idt];
1421:     s3 = t[2+idt];s4 = t[3+idt];
1422:     s5 = t[4+idt];s6 = t[5+idt];
1423:     while (nz--) {
1424:       idx   = 6*(*vi++);
1425:       x1    = t[idx];   x2 = t[1+idx];
1426:       x3    = t[2+idx]; x4 = t[3+idx];
1427:       x5    = t[4+idx]; x6 = t[5+idx];
1428:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1429:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1430:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1431:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1432:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1433:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1434:       v += 36;
1435:     }
1436:     idc = 6*(*c--);
1437:     v   = aa + 36*diag[i];
1438:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1439:                                  v[18]*s4+v[24]*s5+v[30]*s6;
1440:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1441:                                  v[19]*s4+v[25]*s5+v[31]*s6;
1442:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1443:                                  v[20]*s4+v[26]*s5+v[32]*s6;
1444:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1445:                                  v[21]*s4+v[27]*s5+v[33]*s6;
1446:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1447:                                  v[22]*s4+v[28]*s5+v[34]*s6;
1448:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1449:                                  v[23]*s4+v[29]*s5+v[35]*s6;
1450:   }

1452:   ISRestoreIndices(isrow,&rout);
1453:   ISRestoreIndices(iscol,&cout);
1454:   VecRestoreArray(bb,(PetscScalar**)&b);
1455:   VecRestoreArray(xx,&x);
1456:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1457:   return(0);
1458: }

1462: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1463: {
1464:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1465:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1466:   PetscErrorCode    ierr;
1467:   PetscInt          *diag = a->diag,jdx;
1468:   const MatScalar   *aa=a->a,*v;
1469:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1470:   const PetscScalar *b;

1473:   VecGetArray(bb,(PetscScalar**)&b);
1474:   VecGetArray(xx,&x);
1475:   /* forward solve the lower triangular */
1476:   idx    = 0;
1477:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1478:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1479:   for (i=1; i<n; i++) {
1480:     v     =  aa + 36*ai[i];
1481:     vi    =  aj + ai[i];
1482:     nz    =  diag[i] - ai[i];
1483:     idx   =  6*i;
1484:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1485:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1486:     while (nz--) {
1487:       jdx   = 6*(*vi++);
1488:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1489:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1490:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1496:       v += 36;
1497:      }
1498:     x[idx]   = s1;
1499:     x[1+idx] = s2;
1500:     x[2+idx] = s3;
1501:     x[3+idx] = s4;
1502:     x[4+idx] = s5;
1503:     x[5+idx] = s6;
1504:   }
1505:   /* backward solve the upper triangular */
1506:   for (i=n-1; i>=0; i--){
1507:     v    = aa + 36*diag[i] + 36;
1508:     vi   = aj + diag[i] + 1;
1509:     nz   = ai[i+1] - diag[i] - 1;
1510:     idt  = 6*i;
1511:     s1 = x[idt];   s2 = x[1+idt];
1512:     s3 = x[2+idt]; s4 = x[3+idt];
1513:     s5 = x[4+idt]; s6 = x[5+idt];
1514:     while (nz--) {
1515:       idx   = 6*(*vi++);
1516:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1517:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1518:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1519:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1520:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1521:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1522:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1523:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1524:       v += 36;
1525:     }
1526:     v        = aa + 36*diag[i];
1527:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1528:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1529:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1530:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1531:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1532:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1533:   }

1535:   VecRestoreArray(bb,(PetscScalar**)&b);
1536:   VecRestoreArray(xx,&x);
1537:   PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1538:   return(0);
1539: }

1543: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1544: {
1545:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1546:   IS                iscol=a->col,isrow=a->row;
1547:   PetscErrorCode    ierr;
1548:   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1549:   PetscInt          *diag = a->diag;
1550:   const MatScalar   *aa=a->a,*v;
1551:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1552:   const PetscScalar *b;

1555:   VecGetArray(bb,(PetscScalar**)&b);
1556:   VecGetArray(xx,&x);
1557:   t  = a->solve_work;

1559:   ISGetIndices(isrow,&rout); r = rout;
1560:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1562:   /* forward solve the lower triangular */
1563:   idx    = 5*(*r++);
1564:   t[0] = b[idx];   t[1] = b[1+idx];
1565:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1566:   for (i=1; i<n; i++) {
1567:     v     = aa + 25*ai[i];
1568:     vi    = aj + ai[i];
1569:     nz    = diag[i] - ai[i];
1570:     idx   = 5*(*r++);
1571:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1572:     s5  = b[4+idx];
1573:     while (nz--) {
1574:       idx   = 5*(*vi++);
1575:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1576:       x4    = t[3+idx];x5 = t[4+idx];
1577:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1582:       v += 25;
1583:     }
1584:     idx = 5*i;
1585:     t[idx]   = s1;t[1+idx] = s2;
1586:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1587:   }
1588:   /* backward solve the upper triangular */
1589:   for (i=n-1; i>=0; i--){
1590:     v    = aa + 25*diag[i] + 25;
1591:     vi   = aj + diag[i] + 1;
1592:     nz   = ai[i+1] - diag[i] - 1;
1593:     idt  = 5*i;
1594:     s1 = t[idt];  s2 = t[1+idt];
1595:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1596:     while (nz--) {
1597:       idx   = 5*(*vi++);
1598:       x1    = t[idx];   x2 = t[1+idx];
1599:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1600:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1601:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1602:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1603:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1604:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1605:       v += 25;
1606:     }
1607:     idc = 5*(*c--);
1608:     v   = aa + 25*diag[i];
1609:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1610:                                  v[15]*s4+v[20]*s5;
1611:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1612:                                  v[16]*s4+v[21]*s5;
1613:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1614:                                  v[17]*s4+v[22]*s5;
1615:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1616:                                  v[18]*s4+v[23]*s5;
1617:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1618:                                  v[19]*s4+v[24]*s5;
1619:   }

1621:   ISRestoreIndices(isrow,&rout);
1622:   ISRestoreIndices(iscol,&cout);
1623:   VecRestoreArray(bb,(PetscScalar**)&b);
1624:   VecRestoreArray(xx,&x);
1625:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
1626:   return(0);
1627: }

1631: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1632: {
1633:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1634:   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1635:   PetscErrorCode    ierr;
1636:   PetscInt          *diag = a->diag,jdx;
1637:   const MatScalar   *aa=a->a,*v;
1638:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1639:   const PetscScalar *b;

1642:   VecGetArray(bb,(PetscScalar**)&b);
1643:   VecGetArray(xx,&x);
1644:   /* forward solve the lower triangular */
1645:   idx    = 0;
1646:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1647:   for (i=1; i<n; i++) {
1648:     v     =  aa + 25*ai[i];
1649:     vi    =  aj + ai[i];
1650:     nz    =  diag[i] - ai[i];
1651:     idx   =  5*i;
1652:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1653:     while (nz--) {
1654:       jdx   = 5*(*vi++);
1655:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1656:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1657:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1658:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1659:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1660:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1661:       v    += 25;
1662:     }
1663:     x[idx]   = s1;
1664:     x[1+idx] = s2;
1665:     x[2+idx] = s3;
1666:     x[3+idx] = s4;
1667:     x[4+idx] = s5;
1668:   }
1669:   /* backward solve the upper triangular */
1670:   for (i=n-1; i>=0; i--){
1671:     v    = aa + 25*diag[i] + 25;
1672:     vi   = aj + diag[i] + 1;
1673:     nz   = ai[i+1] - diag[i] - 1;
1674:     idt  = 5*i;
1675:     s1 = x[idt];  s2 = x[1+idt];
1676:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1677:     while (nz--) {
1678:       idx   = 5*(*vi++);
1679:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1680:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1681:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1682:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1683:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1684:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1685:       v    += 25;
1686:     }
1687:     v        = aa + 25*diag[i];
1688:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1689:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1690:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1691:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1692:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1693:   }

1695:   VecRestoreArray(bb,(PetscScalar**)&b);
1696:   VecRestoreArray(xx,&x);
1697:   PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
1698:   return(0);
1699: }

1703: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1704: {
1705:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1706:   IS                iscol=a->col,isrow=a->row;
1707:   PetscErrorCode    ierr;
1708:   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1709:   PetscInt          *diag = a->diag;
1710:   const MatScalar   *aa=a->a,*v;
1711:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1712:   const PetscScalar *b;

1715:   VecGetArray(bb,(PetscScalar**)&b);
1716:   VecGetArray(xx,&x);
1717:   t  = a->solve_work;

1719:   ISGetIndices(isrow,&rout); r = rout;
1720:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1722:   /* forward solve the lower triangular */
1723:   idx    = 4*(*r++);
1724:   t[0] = b[idx];   t[1] = b[1+idx];
1725:   t[2] = b[2+idx]; t[3] = b[3+idx];
1726:   for (i=1; i<n; i++) {
1727:     v     = aa + 16*ai[i];
1728:     vi    = aj + ai[i];
1729:     nz    = diag[i] - ai[i];
1730:     idx   = 4*(*r++);
1731:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1732:     while (nz--) {
1733:       idx   = 4*(*vi++);
1734:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1735:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1736:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1737:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1738:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1739:       v    += 16;
1740:     }
1741:     idx        = 4*i;
1742:     t[idx]   = s1;t[1+idx] = s2;
1743:     t[2+idx] = s3;t[3+idx] = s4;
1744:   }
1745:   /* backward solve the upper triangular */
1746:   for (i=n-1; i>=0; i--){
1747:     v    = aa + 16*diag[i] + 16;
1748:     vi   = aj + diag[i] + 1;
1749:     nz   = ai[i+1] - diag[i] - 1;
1750:     idt  = 4*i;
1751:     s1 = t[idt];  s2 = t[1+idt];
1752:     s3 = t[2+idt];s4 = t[3+idt];
1753:     while (nz--) {
1754:       idx   = 4*(*vi++);
1755:       x1    = t[idx];   x2 = t[1+idx];
1756:       x3    = t[2+idx]; x4 = t[3+idx];
1757:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1758:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1759:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1760:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1761:       v += 16;
1762:     }
1763:     idc      = 4*(*c--);
1764:     v        = aa + 16*diag[i];
1765:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1766:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1767:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1768:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1769:   }

1771:   ISRestoreIndices(isrow,&rout);
1772:   ISRestoreIndices(iscol,&cout);
1773:   VecRestoreArray(bb,(PetscScalar**)&b);
1774:   VecRestoreArray(xx,&x);
1775:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
1776:   return(0);
1777: }

1781: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1782: {
1783:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1784:   IS                iscol=a->col,isrow=a->row;
1785:   PetscErrorCode    ierr;
1786:   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1787:   PetscInt          *diag = a->diag;
1788:   const MatScalar   *aa=a->a,*v;
1789:   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1790:   PetscScalar       *x;
1791:   const PetscScalar *b;

1794:   VecGetArray(bb,(PetscScalar**)&b);
1795:   VecGetArray(xx,&x);
1796:   t  = (MatScalar *)a->solve_work;

1798:   ISGetIndices(isrow,&rout); r = rout;
1799:   ISGetIndices(iscol,&cout); c = cout + (n-1);

1801:   /* forward solve the lower triangular */
1802:   idx    = 4*(*r++);
1803:   t[0] = (MatScalar)b[idx];
1804:   t[1] = (MatScalar)b[1+idx];
1805:   t[2] = (MatScalar)b[2+idx];
1806:   t[3] = (MatScalar)b[3+idx];
1807:   for (i=1; i<n; i++) {
1808:     v     = aa + 16*ai[i];
1809:     vi    = aj + ai[i];
1810:     nz    = diag[i] - ai[i];
1811:     idx   = 4*(*r++);
1812:     s1 = (MatScalar)b[idx];
1813:     s2 = (MatScalar)b[1+idx];
1814:     s3 = (MatScalar)b[2+idx];
1815:     s4 = (MatScalar)b[3+idx];
1816:     while (nz--) {
1817:       idx   = 4*(*vi++);
1818:       x1  = t[idx];
1819:       x2  = t[1+idx];
1820:       x3  = t[2+idx];
1821:       x4  = t[3+idx];
1822:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1823:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1824:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1825:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1826:       v    += 16;
1827:     }
1828:     idx        = 4*i;
1829:     t[idx]   = s1;
1830:     t[1+idx] = s2;
1831:     t[2+idx] = s3;
1832:     t[3+idx] = s4;
1833:   }
1834:   /* backward solve the upper triangular */
1835:   for (i=n-1; i>=0; i--){
1836:     v    = aa + 16*diag[i] + 16;
1837:     vi   = aj + diag[i] + 1;
1838:     nz   = ai[i+1] - diag[i] - 1;
1839:     idt  = 4*i;
1840:     s1 = t[idt];
1841:     s2 = t[1+idt];
1842:     s3 = t[2+idt];
1843:     s4 = t[3+idt];
1844:     while (nz--) {
1845:       idx   = 4*(*vi++);
1846:       x1  = t[idx];
1847:       x2  = t[1+idx];
1848:       x3  = t[2+idx];
1849:       x4  = t[3+idx];
1850:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1851:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1852:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1853:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1854:       v += 16;
1855:     }
1856:     idc      = 4*(*c--);
1857:     v        = aa + 16*diag[i];
1858:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1859:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1860:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1861:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1862:     x[idc]   = (PetscScalar)t[idt];
1863:     x[1+idc] = (PetscScalar)t[1+idt];
1864:     x[2+idc] = (PetscScalar)t[2+idt];
1865:     x[3+idc] = (PetscScalar)t[3+idt];
1866:  }

1868:   ISRestoreIndices(isrow,&rout);
1869:   ISRestoreIndices(iscol,&cout);
1870:   VecRestoreArray(bb,(PetscScalar**)&b);
1871:   VecRestoreArray(xx,&x);
1872:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
1873:   return(0);
1874: }

1876: #if defined (PETSC_HAVE_SSE)

1878: #include PETSC_HAVE_SSE

1882: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1883: {
1884:   /* 
1885:      Note: This code uses demotion of double
1886:      to float when performing the mixed-mode computation.
1887:      This may not be numerically reasonable for all applications.
1888:   */
1889:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
1890:   IS             iscol=a->col,isrow=a->row;
1892:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1893:   PetscInt       *diag = a->diag,ai16;
1894:   MatScalar      *aa=a->a,*v;
1895:   PetscScalar    *x,*b,*t;

1897:   /* Make space in temp stack for 16 Byte Aligned arrays */
1898:   float           ssealignedspace[11],*tmps,*tmpx;
1899:   unsigned long   offset;
1900: 
1902:   SSE_SCOPE_BEGIN;

1904:     offset = (unsigned long)ssealignedspace % 16;
1905:     if (offset) offset = (16 - offset)/4;
1906:     tmps = &ssealignedspace[offset];
1907:     tmpx = &ssealignedspace[offset+4];
1908:     PREFETCH_NTA(aa+16*ai[1]);

1910:     VecGetArray(bb,&b);
1911:     VecGetArray(xx,&x);
1912:     t  = a->solve_work;

1914:     ISGetIndices(isrow,&rout); r = rout;
1915:     ISGetIndices(iscol,&cout); c = cout + (n-1);

1917:     /* forward solve the lower triangular */
1918:     idx  = 4*(*r++);
1919:     t[0] = b[idx];   t[1] = b[1+idx];
1920:     t[2] = b[2+idx]; t[3] = b[3+idx];
1921:     v    =  aa + 16*ai[1];

1923:     for (i=1; i<n;) {
1924:       PREFETCH_NTA(&v[8]);
1925:       vi   =  aj      + ai[i];
1926:       nz   =  diag[i] - ai[i];
1927:       idx  =  4*(*r++);

1929:       /* Demote sum from double to float */
1930:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1931:       LOAD_PS(tmps,XMM7);

1933:       while (nz--) {
1934:         PREFETCH_NTA(&v[16]);
1935:         idx = 4*(*vi++);
1936: 
1937:         /* Demote solution (so far) from double to float */
1938:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

1940:         /* 4x4 Matrix-Vector product with negative accumulation: */
1941:         SSE_INLINE_BEGIN_2(tmpx,v)
1942:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

1944:           /* First Column */
1945:           SSE_COPY_PS(XMM0,XMM6)
1946:           SSE_SHUFFLE(XMM0,XMM0,0x00)
1947:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1948:           SSE_SUB_PS(XMM7,XMM0)
1949: 
1950:           /* Second Column */
1951:           SSE_COPY_PS(XMM1,XMM6)
1952:           SSE_SHUFFLE(XMM1,XMM1,0x55)
1953:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1954:           SSE_SUB_PS(XMM7,XMM1)
1955: 
1956:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1957: 
1958:           /* Third Column */
1959:           SSE_COPY_PS(XMM2,XMM6)
1960:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
1961:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1962:           SSE_SUB_PS(XMM7,XMM2)

1964:           /* Fourth Column */
1965:           SSE_COPY_PS(XMM3,XMM6)
1966:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
1967:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1968:           SSE_SUB_PS(XMM7,XMM3)
1969:         SSE_INLINE_END_2
1970: 
1971:         v  += 16;
1972:       }
1973:       idx = 4*i;
1974:       v   = aa + 16*ai[++i];
1975:       PREFETCH_NTA(v);
1976:       STORE_PS(tmps,XMM7);

1978:       /* Promote result from float to double */
1979:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1980:     }
1981:     /* backward solve the upper triangular */
1982:     idt  = 4*(n-1);
1983:     ai16 = 16*diag[n-1];
1984:     v    = aa + ai16 + 16;
1985:     for (i=n-1; i>=0;){
1986:       PREFETCH_NTA(&v[8]);
1987:       vi = aj + diag[i] + 1;
1988:       nz = ai[i+1] - diag[i] - 1;
1989: 
1990:       /* Demote accumulator from double to float */
1991:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1992:       LOAD_PS(tmps,XMM7);

1994:       while (nz--) {
1995:         PREFETCH_NTA(&v[16]);
1996:         idx = 4*(*vi++);

1998:         /* Demote solution (so far) from double to float */
1999:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

2001:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2002:         SSE_INLINE_BEGIN_2(tmpx,v)
2003:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2005:           /* First Column */
2006:           SSE_COPY_PS(XMM0,XMM6)
2007:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2008:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2009:           SSE_SUB_PS(XMM7,XMM0)

2011:           /* Second Column */
2012:           SSE_COPY_PS(XMM1,XMM6)
2013:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2014:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2015:           SSE_SUB_PS(XMM7,XMM1)

2017:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2018: 
2019:           /* Third Column */
2020:           SSE_COPY_PS(XMM2,XMM6)
2021:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2022:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2023:           SSE_SUB_PS(XMM7,XMM2)

2025:           /* Fourth Column */
2026:           SSE_COPY_PS(XMM3,XMM6)
2027:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2028:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2029:           SSE_SUB_PS(XMM7,XMM3)
2030:         SSE_INLINE_END_2
2031:         v  += 16;
2032:       }
2033:       v    = aa + ai16;
2034:       ai16 = 16*diag[--i];
2035:       PREFETCH_NTA(aa+ai16+16);
2036:       /* 
2037:          Scale the result by the diagonal 4x4 block, 
2038:          which was inverted as part of the factorization
2039:       */
2040:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2041:         /* First Column */
2042:         SSE_COPY_PS(XMM0,XMM7)
2043:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2044:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2046:         /* Second Column */
2047:         SSE_COPY_PS(XMM1,XMM7)
2048:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2049:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2050:         SSE_ADD_PS(XMM0,XMM1)

2052:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2053: 
2054:         /* Third Column */
2055:         SSE_COPY_PS(XMM2,XMM7)
2056:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2057:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2058:         SSE_ADD_PS(XMM0,XMM2)

2060:         /* Fourth Column */
2061:         SSE_COPY_PS(XMM3,XMM7)
2062:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2063:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2064:         SSE_ADD_PS(XMM0,XMM3)
2065: 
2066:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2067:       SSE_INLINE_END_3

2069:       /* Promote solution from float to double */
2070:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

2072:       /* Apply reordering to t and stream into x.    */
2073:       /* This way, x doesn't pollute the cache.      */
2074:       /* Be careful with size: 2 doubles = 4 floats! */
2075:       idc  = 4*(*c--);
2076:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2077:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2078:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2079:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2080:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2081:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2082:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2083:       SSE_INLINE_END_2
2084:       v    = aa + ai16 + 16;
2085:       idt -= 4;
2086:     }

2088:     ISRestoreIndices(isrow,&rout);
2089:     ISRestoreIndices(iscol,&cout);
2090:     VecRestoreArray(bb,&b);
2091:     VecRestoreArray(xx,&x);
2092:     PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2093:   SSE_SCOPE_END;
2094:   return(0);
2095: }

2097: #endif


2100: /*
2101:       Special case where the matrix was ILU(0) factored in the natural
2102:    ordering. This eliminates the need for the column and row permutation.
2103: */
2106: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2107: {
2108:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2109:   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2110:   PetscErrorCode    ierr;
2111:   PetscInt          *diag = a->diag;
2112:   const MatScalar   *aa=a->a;
2113:   PetscScalar       *x;
2114:   const PetscScalar *b;

2117:   VecGetArray(bb,(PetscScalar**)&b);
2118:   VecGetArray(xx,&x);

2120: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2121:   {
2122:     static PetscScalar w[2000]; /* very BAD need to fix */
2123:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2124:   }
2125: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2126:   {
2127:     static PetscScalar w[2000]; /* very BAD need to fix */
2128:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2129:   }
2130: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2131:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2132: #else
2133:   {
2134:     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2135:     const MatScalar *v;
2136:     PetscInt        jdx,idt,idx,nz,*vi,i,ai16;

2138:   /* forward solve the lower triangular */
2139:   idx    = 0;
2140:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2141:   for (i=1; i<n; i++) {
2142:     v     =  aa      + 16*ai[i];
2143:     vi    =  aj      + ai[i];
2144:     nz    =  diag[i] - ai[i];
2145:     idx   +=  4;
2146:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2147:     while (nz--) {
2148:       jdx   = 4*(*vi++);
2149:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2150:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2151:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2152:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2153:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2154:       v    += 16;
2155:     }
2156:     x[idx]   = s1;
2157:     x[1+idx] = s2;
2158:     x[2+idx] = s3;
2159:     x[3+idx] = s4;
2160:   }
2161:   /* backward solve the upper triangular */
2162:   idt = 4*(n-1);
2163:   for (i=n-1; i>=0; i--){
2164:     ai16 = 16*diag[i];
2165:     v    = aa + ai16 + 16;
2166:     vi   = aj + diag[i] + 1;
2167:     nz   = ai[i+1] - diag[i] - 1;
2168:     s1 = x[idt];  s2 = x[1+idt];
2169:     s3 = x[2+idt];s4 = x[3+idt];
2170:     while (nz--) {
2171:       idx   = 4*(*vi++);
2172:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2173:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2174:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2175:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2176:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2177:       v    += 16;
2178:     }
2179:     v        = aa + ai16;
2180:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2181:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2182:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2183:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2184:     idt -= 4;
2185:   }
2186:   }
2187: #endif

2189:   VecRestoreArray(bb,(PetscScalar**)&b);
2190:   VecRestoreArray(xx,&x);
2191:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2192:   return(0);
2193: }

2197: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2198: {
2199:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2200:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2202:   PetscInt       *diag = a->diag;
2203:   MatScalar      *aa=a->a;
2204:   PetscScalar    *x,*b;

2207:   VecGetArray(bb,&b);
2208:   VecGetArray(xx,&x);

2210:   {
2211:     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2212:     MatScalar  *v,*t=(MatScalar *)x;
2213:     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;

2215:     /* forward solve the lower triangular */
2216:     idx  = 0;
2217:     t[0] = (MatScalar)b[0];
2218:     t[1] = (MatScalar)b[1];
2219:     t[2] = (MatScalar)b[2];
2220:     t[3] = (MatScalar)b[3];
2221:     for (i=1; i<n; i++) {
2222:       v     =  aa      + 16*ai[i];
2223:       vi    =  aj      + ai[i];
2224:       nz    =  diag[i] - ai[i];
2225:       idx   +=  4;
2226:       s1 = (MatScalar)b[idx];
2227:       s2 = (MatScalar)b[1+idx];
2228:       s3 = (MatScalar)b[2+idx];
2229:       s4 = (MatScalar)b[3+idx];
2230:       while (nz--) {
2231:         jdx = 4*(*vi++);
2232:         x1  = t[jdx];
2233:         x2  = t[1+jdx];
2234:         x3  = t[2+jdx];
2235:         x4  = t[3+jdx];
2236:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2237:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2238:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2239:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2240:         v    += 16;
2241:       }
2242:       t[idx]   = s1;
2243:       t[1+idx] = s2;
2244:       t[2+idx] = s3;
2245:       t[3+idx] = s4;
2246:     }
2247:     /* backward solve the upper triangular */
2248:     idt = 4*(n-1);
2249:     for (i=n-1; i>=0; i--){
2250:       ai16 = 16*diag[i];
2251:       v    = aa + ai16 + 16;
2252:       vi   = aj + diag[i] + 1;
2253:       nz   = ai[i+1] - diag[i] - 1;
2254:       s1   = t[idt];
2255:       s2   = t[1+idt];
2256:       s3   = t[2+idt];
2257:       s4   = t[3+idt];
2258:       while (nz--) {
2259:         idx = 4*(*vi++);
2260:         x1  = (MatScalar)x[idx];
2261:         x2  = (MatScalar)x[1+idx];
2262:         x3  = (MatScalar)x[2+idx];
2263:         x4  = (MatScalar)x[3+idx];
2264:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2265:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2266:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2267:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2268:         v    += 16;
2269:       }
2270:       v        = aa + ai16;
2271:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2272:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2273:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2274:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2275:       idt -= 4;
2276:     }
2277:   }

2279:   VecRestoreArray(bb,&b);
2280:   VecRestoreArray(xx,&x);
2281:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2282:   return(0);
2283: }

2285: #if defined (PETSC_HAVE_SSE)

2287: #include PETSC_HAVE_SSE
2290: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2291: {
2292:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2293:   unsigned short *aj=(unsigned short *)a->j;
2295:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2296:   MatScalar      *aa=a->a;
2297:   PetscScalar    *x,*b;

2300:   SSE_SCOPE_BEGIN;
2301:   /* 
2302:      Note: This code currently uses demotion of double
2303:      to float when performing the mixed-mode computation.
2304:      This may not be numerically reasonable for all applications.
2305:   */
2306:   PREFETCH_NTA(aa+16*ai[1]);

2308:   VecGetArray(bb,&b);
2309:   VecGetArray(xx,&x);
2310:   {
2311:     /* x will first be computed in single precision then promoted inplace to double */
2312:     MatScalar      *v,*t=(MatScalar *)x;
2313:     int            nz,i,idt,ai16;
2314:     unsigned int   jdx,idx;
2315:     unsigned short *vi;
2316:     /* Forward solve the lower triangular factor. */

2318:     /* First block is the identity. */
2319:     idx  = 0;
2320:     CONVERT_DOUBLE4_FLOAT4(t,b);
2321:     v    =  aa + 16*((unsigned int)ai[1]);

2323:     for (i=1; i<n;) {
2324:       PREFETCH_NTA(&v[8]);
2325:       vi   =  aj      + ai[i];
2326:       nz   =  diag[i] - ai[i];
2327:       idx +=  4;

2329:       /* Demote RHS from double to float. */
2330:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2331:       LOAD_PS(&t[idx],XMM7);

2333:       while (nz--) {
2334:         PREFETCH_NTA(&v[16]);
2335:         jdx = 4*((unsigned int)(*vi++));
2336: 
2337:         /* 4x4 Matrix-Vector product with negative accumulation: */
2338:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2339:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2341:           /* First Column */
2342:           SSE_COPY_PS(XMM0,XMM6)
2343:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2344:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2345:           SSE_SUB_PS(XMM7,XMM0)

2347:           /* Second Column */
2348:           SSE_COPY_PS(XMM1,XMM6)
2349:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2350:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2351:           SSE_SUB_PS(XMM7,XMM1)

2353:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2354: 
2355:           /* Third Column */
2356:           SSE_COPY_PS(XMM2,XMM6)
2357:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2358:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2359:           SSE_SUB_PS(XMM7,XMM2)

2361:           /* Fourth Column */
2362:           SSE_COPY_PS(XMM3,XMM6)
2363:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2364:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2365:           SSE_SUB_PS(XMM7,XMM3)
2366:         SSE_INLINE_END_2
2367: 
2368:         v  += 16;
2369:       }
2370:       v    =  aa + 16*ai[++i];
2371:       PREFETCH_NTA(v);
2372:       STORE_PS(&t[idx],XMM7);
2373:     }

2375:     /* Backward solve the upper triangular factor.*/

2377:     idt  = 4*(n-1);
2378:     ai16 = 16*diag[n-1];
2379:     v    = aa + ai16 + 16;
2380:     for (i=n-1; i>=0;){
2381:       PREFETCH_NTA(&v[8]);
2382:       vi = aj + diag[i] + 1;
2383:       nz = ai[i+1] - diag[i] - 1;
2384: 
2385:       LOAD_PS(&t[idt],XMM7);

2387:       while (nz--) {
2388:         PREFETCH_NTA(&v[16]);
2389:         idx = 4*((unsigned int)(*vi++));

2391:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2392:         SSE_INLINE_BEGIN_2(&t[idx],v)
2393:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2395:           /* First Column */
2396:           SSE_COPY_PS(XMM0,XMM6)
2397:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2398:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2399:           SSE_SUB_PS(XMM7,XMM0)

2401:           /* Second Column */
2402:           SSE_COPY_PS(XMM1,XMM6)
2403:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2404:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2405:           SSE_SUB_PS(XMM7,XMM1)

2407:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2408: 
2409:           /* Third Column */
2410:           SSE_COPY_PS(XMM2,XMM6)
2411:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2412:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2413:           SSE_SUB_PS(XMM7,XMM2)

2415:           /* Fourth Column */
2416:           SSE_COPY_PS(XMM3,XMM6)
2417:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2418:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2419:           SSE_SUB_PS(XMM7,XMM3)
2420:         SSE_INLINE_END_2
2421:         v  += 16;
2422:       }
2423:       v    = aa + ai16;
2424:       ai16 = 16*diag[--i];
2425:       PREFETCH_NTA(aa+ai16+16);
2426:       /* 
2427:          Scale the result by the diagonal 4x4 block, 
2428:          which was inverted as part of the factorization
2429:       */
2430:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2431:         /* First Column */
2432:         SSE_COPY_PS(XMM0,XMM7)
2433:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2434:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2436:         /* Second Column */
2437:         SSE_COPY_PS(XMM1,XMM7)
2438:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2439:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2440:         SSE_ADD_PS(XMM0,XMM1)

2442:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2443: 
2444:         /* Third Column */
2445:         SSE_COPY_PS(XMM2,XMM7)
2446:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2447:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2448:         SSE_ADD_PS(XMM0,XMM2)

2450:         /* Fourth Column */
2451:         SSE_COPY_PS(XMM3,XMM7)
2452:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2453:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2454:         SSE_ADD_PS(XMM0,XMM3)

2456:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2457:       SSE_INLINE_END_3

2459:       v    = aa + ai16 + 16;
2460:       idt -= 4;
2461:     }

2463:     /* Convert t from single precision back to double precision (inplace)*/
2464:     idt = 4*(n-1);
2465:     for (i=n-1;i>=0;i--) {
2466:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2467:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2468:       PetscScalar *xtemp=&x[idt];
2469:       MatScalar   *ttemp=&t[idt];
2470:       xtemp[3] = (PetscScalar)ttemp[3];
2471:       xtemp[2] = (PetscScalar)ttemp[2];
2472:       xtemp[1] = (PetscScalar)ttemp[1];
2473:       xtemp[0] = (PetscScalar)ttemp[0];
2474:       idt -= 4;
2475:     }

2477:   } /* End of artificial scope. */
2478:   VecRestoreArray(bb,&b);
2479:   VecRestoreArray(xx,&x);
2480:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2481:   SSE_SCOPE_END;
2482:   return(0);
2483: }

2487: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2488: {
2489:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2490:   int            *aj=a->j;
2492:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2493:   MatScalar      *aa=a->a;
2494:   PetscScalar    *x,*b;

2497:   SSE_SCOPE_BEGIN;
2498:   /* 
2499:      Note: This code currently uses demotion of double
2500:      to float when performing the mixed-mode computation.
2501:      This may not be numerically reasonable for all applications.
2502:   */
2503:   PREFETCH_NTA(aa+16*ai[1]);

2505:   VecGetArray(bb,&b);
2506:   VecGetArray(xx,&x);
2507:   {
2508:     /* x will first be computed in single precision then promoted inplace to double */
2509:     MatScalar *v,*t=(MatScalar *)x;
2510:     int       nz,i,idt,ai16;
2511:     int       jdx,idx;
2512:     int       *vi;
2513:     /* Forward solve the lower triangular factor. */

2515:     /* First block is the identity. */
2516:     idx  = 0;
2517:     CONVERT_DOUBLE4_FLOAT4(t,b);
2518:     v    =  aa + 16*ai[1];

2520:     for (i=1; i<n;) {
2521:       PREFETCH_NTA(&v[8]);
2522:       vi   =  aj      + ai[i];
2523:       nz   =  diag[i] - ai[i];
2524:       idx +=  4;

2526:       /* Demote RHS from double to float. */
2527:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2528:       LOAD_PS(&t[idx],XMM7);

2530:       while (nz--) {
2531:         PREFETCH_NTA(&v[16]);
2532:         jdx = 4*(*vi++);
2533: /*          jdx = *vi++; */
2534: 
2535:         /* 4x4 Matrix-Vector product with negative accumulation: */
2536:         SSE_INLINE_BEGIN_2(&t[jdx],v)
2537:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2539:           /* First Column */
2540:           SSE_COPY_PS(XMM0,XMM6)
2541:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2542:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2543:           SSE_SUB_PS(XMM7,XMM0)

2545:           /* Second Column */
2546:           SSE_COPY_PS(XMM1,XMM6)
2547:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2548:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2549:           SSE_SUB_PS(XMM7,XMM1)

2551:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2552: 
2553:           /* Third Column */
2554:           SSE_COPY_PS(XMM2,XMM6)
2555:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2556:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2557:           SSE_SUB_PS(XMM7,XMM2)

2559:           /* Fourth Column */
2560:           SSE_COPY_PS(XMM3,XMM6)
2561:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2562:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2563:           SSE_SUB_PS(XMM7,XMM3)
2564:         SSE_INLINE_END_2
2565: 
2566:         v  += 16;
2567:       }
2568:       v    =  aa + 16*ai[++i];
2569:       PREFETCH_NTA(v);
2570:       STORE_PS(&t[idx],XMM7);
2571:     }

2573:     /* Backward solve the upper triangular factor.*/

2575:     idt  = 4*(n-1);
2576:     ai16 = 16*diag[n-1];
2577:     v    = aa + ai16 + 16;
2578:     for (i=n-1; i>=0;){
2579:       PREFETCH_NTA(&v[8]);
2580:       vi = aj + diag[i] + 1;
2581:       nz = ai[i+1] - diag[i] - 1;
2582: 
2583:       LOAD_PS(&t[idt],XMM7);

2585:       while (nz--) {
2586:         PREFETCH_NTA(&v[16]);
2587:         idx = 4*(*vi++);
2588: /*          idx = *vi++; */

2590:         /* 4x4 Matrix-Vector Product with negative accumulation: */
2591:         SSE_INLINE_BEGIN_2(&t[idx],v)
2592:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

2594:           /* First Column */
2595:           SSE_COPY_PS(XMM0,XMM6)
2596:           SSE_SHUFFLE(XMM0,XMM0,0x00)
2597:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2598:           SSE_SUB_PS(XMM7,XMM0)

2600:           /* Second Column */
2601:           SSE_COPY_PS(XMM1,XMM6)
2602:           SSE_SHUFFLE(XMM1,XMM1,0x55)
2603:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2604:           SSE_SUB_PS(XMM7,XMM1)

2606:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2607: 
2608:           /* Third Column */
2609:           SSE_COPY_PS(XMM2,XMM6)
2610:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2611:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2612:           SSE_SUB_PS(XMM7,XMM2)

2614:           /* Fourth Column */
2615:           SSE_COPY_PS(XMM3,XMM6)
2616:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2617:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2618:           SSE_SUB_PS(XMM7,XMM3)
2619:         SSE_INLINE_END_2
2620:         v  += 16;
2621:       }
2622:       v    = aa + ai16;
2623:       ai16 = 16*diag[--i];
2624:       PREFETCH_NTA(aa+ai16+16);
2625:       /* 
2626:          Scale the result by the diagonal 4x4 block, 
2627:          which was inverted as part of the factorization
2628:       */
2629:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2630:         /* First Column */
2631:         SSE_COPY_PS(XMM0,XMM7)
2632:         SSE_SHUFFLE(XMM0,XMM0,0x00)
2633:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

2635:         /* Second Column */
2636:         SSE_COPY_PS(XMM1,XMM7)
2637:         SSE_SHUFFLE(XMM1,XMM1,0x55)
2638:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2639:         SSE_ADD_PS(XMM0,XMM1)

2641:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2642: 
2643:         /* Third Column */
2644:         SSE_COPY_PS(XMM2,XMM7)
2645:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2646:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2647:         SSE_ADD_PS(XMM0,XMM2)

2649:         /* Fourth Column */
2650:         SSE_COPY_PS(XMM3,XMM7)
2651:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2652:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2653:         SSE_ADD_PS(XMM0,XMM3)

2655:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2656:       SSE_INLINE_END_3

2658:       v    = aa + ai16 + 16;
2659:       idt -= 4;
2660:     }

2662:     /* Convert t from single precision back to double precision (inplace)*/
2663:     idt = 4*(n-1);
2664:     for (i=n-1;i>=0;i--) {
2665:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2666:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2667:       PetscScalar *xtemp=&x[idt];
2668:       MatScalar   *ttemp=&t[idt];
2669:       xtemp[3] = (PetscScalar)ttemp[3];
2670:       xtemp[2] = (PetscScalar)ttemp[2];
2671:       xtemp[1] = (PetscScalar)ttemp[1];
2672:       xtemp[0] = (PetscScalar)ttemp[0];
2673:       idt -= 4;
2674:     }

2676:   } /* End of artificial scope. */
2677:   VecRestoreArray(bb,&b);
2678:   VecRestoreArray(xx,&x);
2679:   PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2680:   SSE_SCOPE_END;
2681:   return(0);
2682: }

2684: #endif
2687: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2688: {
2689:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2690:   IS                iscol=a->col,isrow=a->row;
2691:   PetscErrorCode    ierr;
2692:   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2693:   PetscInt          *diag = a->diag;
2694:   const MatScalar   *aa=a->a,*v;
2695:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2696:   const PetscScalar *b;

2699:   VecGetArray(bb,(PetscScalar**)&b);
2700:   VecGetArray(xx,&x);
2701:   t  = a->solve_work;

2703:   ISGetIndices(isrow,&rout); r = rout;
2704:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2706:   /* forward solve the lower triangular */
2707:   idx    = 3*(*r++);
2708:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2709:   for (i=1; i<n; i++) {
2710:     v     = aa + 9*ai[i];
2711:     vi    = aj + ai[i];
2712:     nz    = diag[i] - ai[i];
2713:     idx   = 3*(*r++);
2714:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2715:     while (nz--) {
2716:       idx   = 3*(*vi++);
2717:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2718:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2719:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2720:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2721:       v += 9;
2722:     }
2723:     idx = 3*i;
2724:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2725:   }
2726:   /* backward solve the upper triangular */
2727:   for (i=n-1; i>=0; i--){
2728:     v    = aa + 9*diag[i] + 9;
2729:     vi   = aj + diag[i] + 1;
2730:     nz   = ai[i+1] - diag[i] - 1;
2731:     idt  = 3*i;
2732:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2733:     while (nz--) {
2734:       idx   = 3*(*vi++);
2735:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2736:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2737:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2738:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2739:       v += 9;
2740:     }
2741:     idc = 3*(*c--);
2742:     v   = aa + 9*diag[i];
2743:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2744:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2745:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2746:   }
2747:   ISRestoreIndices(isrow,&rout);
2748:   ISRestoreIndices(iscol,&cout);
2749:   VecRestoreArray(bb,(PetscScalar**)&b);
2750:   VecRestoreArray(xx,&x);
2751:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
2752:   return(0);
2753: }

2755: /*
2756:       Special case where the matrix was ILU(0) factored in the natural
2757:    ordering. This eliminates the need for the column and row permutation.
2758: */
2761: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2762: {
2763:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2764:   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2765:   PetscErrorCode    ierr;
2766:   PetscInt          *diag = a->diag;
2767:   const MatScalar   *aa=a->a,*v;
2768:   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2769:   const PetscScalar *b;
2770:   PetscInt          jdx,idt,idx,nz,*vi,i;

2773:   VecGetArray(bb,(PetscScalar**)&b);
2774:   VecGetArray(xx,&x);

2776:   /* forward solve the lower triangular */
2777:   idx    = 0;
2778:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2779:   for (i=1; i<n; i++) {
2780:     v     =  aa      + 9*ai[i];
2781:     vi    =  aj      + ai[i];
2782:     nz    =  diag[i] - ai[i];
2783:     idx   +=  3;
2784:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2785:     while (nz--) {
2786:       jdx   = 3*(*vi++);
2787:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2788:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2789:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2790:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2791:       v    += 9;
2792:     }
2793:     x[idx]   = s1;
2794:     x[1+idx] = s2;
2795:     x[2+idx] = s3;
2796:   }
2797:   /* backward solve the upper triangular */
2798:   for (i=n-1; i>=0; i--){
2799:     v    = aa + 9*diag[i] + 9;
2800:     vi   = aj + diag[i] + 1;
2801:     nz   = ai[i+1] - diag[i] - 1;
2802:     idt  = 3*i;
2803:     s1 = x[idt];  s2 = x[1+idt];
2804:     s3 = x[2+idt];
2805:     while (nz--) {
2806:       idx   = 3*(*vi++);
2807:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2808:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2809:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2810:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2811:       v    += 9;
2812:     }
2813:     v        = aa +  9*diag[i];
2814:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2815:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2816:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2817:   }

2819:   VecRestoreArray(bb,(PetscScalar**)&b);
2820:   VecRestoreArray(xx,&x);
2821:   PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
2822:   return(0);
2823: }

2827: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2828: {
2829:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2830:   IS                iscol=a->col,isrow=a->row;
2831:   PetscErrorCode    ierr;
2832:   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2833:   PetscInt          *diag = a->diag;
2834:   const MatScalar   *aa=a->a,*v;
2835:   PetscScalar       *x,s1,s2,x1,x2,*t;
2836:   const PetscScalar *b;

2839:   VecGetArray(bb,(PetscScalar**)&b);
2840:   VecGetArray(xx,&x);
2841:   t  = a->solve_work;

2843:   ISGetIndices(isrow,&rout); r = rout;
2844:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2846:   /* forward solve the lower triangular */
2847:   idx    = 2*(*r++);
2848:   t[0] = b[idx]; t[1] = b[1+idx];
2849:   for (i=1; i<n; i++) {
2850:     v     = aa + 4*ai[i];
2851:     vi    = aj + ai[i];
2852:     nz    = diag[i] - ai[i];
2853:     idx   = 2*(*r++);
2854:     s1  = b[idx]; s2 = b[1+idx];
2855:     while (nz--) {
2856:       idx   = 2*(*vi++);
2857:       x1    = t[idx]; x2 = t[1+idx];
2858:       s1 -= v[0]*x1 + v[2]*x2;
2859:       s2 -= v[1]*x1 + v[3]*x2;
2860:       v += 4;
2861:     }
2862:     idx = 2*i;
2863:     t[idx] = s1; t[1+idx] = s2;
2864:   }
2865:   /* backward solve the upper triangular */
2866:   for (i=n-1; i>=0; i--){
2867:     v    = aa + 4*diag[i] + 4;
2868:     vi   = aj + diag[i] + 1;
2869:     nz   = ai[i+1] - diag[i] - 1;
2870:     idt  = 2*i;
2871:     s1 = t[idt]; s2 = t[1+idt];
2872:     while (nz--) {
2873:       idx   = 2*(*vi++);
2874:       x1    = t[idx]; x2 = t[1+idx];
2875:       s1 -= v[0]*x1 + v[2]*x2;
2876:       s2 -= v[1]*x1 + v[3]*x2;
2877:       v += 4;
2878:     }
2879:     idc = 2*(*c--);
2880:     v   = aa + 4*diag[i];
2881:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2882:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2883:   }
2884:   ISRestoreIndices(isrow,&rout);
2885:   ISRestoreIndices(iscol,&cout);
2886:   VecRestoreArray(bb,(PetscScalar**)&b);
2887:   VecRestoreArray(xx,&x);
2888:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
2889:   return(0);
2890: }

2892: /*
2893:       Special case where the matrix was ILU(0) factored in the natural
2894:    ordering. This eliminates the need for the column and row permutation.
2895: */
2898: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2899: {
2900:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2901:   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2902:   PetscErrorCode    ierr;
2903:   PetscInt          *diag = a->diag;
2904:   const MatScalar   *aa=a->a,*v;
2905:   PetscScalar       *x,s1,s2,x1,x2;
2906:   const PetscScalar *b;
2907:   PetscInt          jdx,idt,idx,nz,*vi,i;

2910:   VecGetArray(bb,(PetscScalar**)&b);
2911:   VecGetArray(xx,&x);

2913:   /* forward solve the lower triangular */
2914:   idx    = 0;
2915:   x[0]   = b[0]; x[1] = b[1];
2916:   for (i=1; i<n; i++) {
2917:     v     =  aa      + 4*ai[i];
2918:     vi    =  aj      + ai[i];
2919:     nz    =  diag[i] - ai[i];
2920:     idx   +=  2;
2921:     s1  =  b[idx];s2 = b[1+idx];
2922:     while (nz--) {
2923:       jdx   = 2*(*vi++);
2924:       x1    = x[jdx];x2 = x[1+jdx];
2925:       s1 -= v[0]*x1 + v[2]*x2;
2926:       s2 -= v[1]*x1 + v[3]*x2;
2927:       v    += 4;
2928:     }
2929:     x[idx]   = s1;
2930:     x[1+idx] = s2;
2931:   }
2932:   /* backward solve the upper triangular */
2933:   for (i=n-1; i>=0; i--){
2934:     v    = aa + 4*diag[i] + 4;
2935:     vi   = aj + diag[i] + 1;
2936:     nz   = ai[i+1] - diag[i] - 1;
2937:     idt  = 2*i;
2938:     s1 = x[idt];  s2 = x[1+idt];
2939:     while (nz--) {
2940:       idx   = 2*(*vi++);
2941:       x1    = x[idx];   x2 = x[1+idx];
2942:       s1 -= v[0]*x1 + v[2]*x2;
2943:       s2 -= v[1]*x1 + v[3]*x2;
2944:       v    += 4;
2945:     }
2946:     v        = aa +  4*diag[i];
2947:     x[idt]   = v[0]*s1 + v[2]*s2;
2948:     x[1+idt] = v[1]*s1 + v[3]*s2;
2949:   }

2951:   VecRestoreArray(bb,(PetscScalar**)&b);
2952:   VecRestoreArray(xx,&x);
2953:   PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
2954:   return(0);
2955: }

2959: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2960: {
2961:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2962:   IS             iscol=a->col,isrow=a->row;
2964:   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2965:   PetscInt       *diag = a->diag;
2966:   MatScalar      *aa=a->a,*v;
2967:   PetscScalar    *x,*b,s1,*t;

2970:   if (!n) return(0);

2972:   VecGetArray(bb,&b);
2973:   VecGetArray(xx,&x);
2974:   t  = a->solve_work;

2976:   ISGetIndices(isrow,&rout); r = rout;
2977:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2979:   /* forward solve the lower triangular */
2980:   t[0] = b[*r++];
2981:   for (i=1; i<n; i++) {
2982:     v     = aa + ai[i];
2983:     vi    = aj + ai[i];
2984:     nz    = diag[i] - ai[i];
2985:     s1  = b[*r++];
2986:     while (nz--) {
2987:       s1 -= (*v++)*t[*vi++];
2988:     }
2989:     t[i] = s1;
2990:   }
2991:   /* backward solve the upper triangular */
2992:   for (i=n-1; i>=0; i--){
2993:     v    = aa + diag[i] + 1;
2994:     vi   = aj + diag[i] + 1;
2995:     nz   = ai[i+1] - diag[i] - 1;
2996:     s1 = t[i];
2997:     while (nz--) {
2998:       s1 -= (*v++)*t[*vi++];
2999:     }
3000:     x[*c--] = t[i] = aa[diag[i]]*s1;
3001:   }

3003:   ISRestoreIndices(isrow,&rout);
3004:   ISRestoreIndices(iscol,&cout);
3005:   VecRestoreArray(bb,&b);
3006:   VecRestoreArray(xx,&x);
3007:   PetscLogFlops(2*1*(a->nz) - A->cmap.n);
3008:   return(0);
3009: }
3010: /*
3011:       Special case where the matrix was ILU(0) factored in the natural
3012:    ordering. This eliminates the need for the column and row permutation.
3013: */
3016: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3017: {
3018:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3019:   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3021:   PetscInt       *diag = a->diag;
3022:   MatScalar      *aa=a->a;
3023:   PetscScalar    *x,*b;
3024:   PetscScalar    s1,x1;
3025:   MatScalar      *v;
3026:   PetscInt       jdx,idt,idx,nz,*vi,i;

3029:   VecGetArray(bb,&b);
3030:   VecGetArray(xx,&x);

3032:   /* forward solve the lower triangular */
3033:   idx    = 0;
3034:   x[0]   = b[0];
3035:   for (i=1; i<n; i++) {
3036:     v     =  aa      + ai[i];
3037:     vi    =  aj      + ai[i];
3038:     nz    =  diag[i] - ai[i];
3039:     idx   +=  1;
3040:     s1  =  b[idx];
3041:     while (nz--) {
3042:       jdx   = *vi++;
3043:       x1    = x[jdx];
3044:       s1 -= v[0]*x1;
3045:       v    += 1;
3046:     }
3047:     x[idx]   = s1;
3048:   }
3049:   /* backward solve the upper triangular */
3050:   for (i=n-1; i>=0; i--){
3051:     v    = aa + diag[i] + 1;
3052:     vi   = aj + diag[i] + 1;
3053:     nz   = ai[i+1] - diag[i] - 1;
3054:     idt  = i;
3055:     s1 = x[idt];
3056:     while (nz--) {
3057:       idx   = *vi++;
3058:       x1    = x[idx];
3059:       s1 -= v[0]*x1;
3060:       v    += 1;
3061:     }
3062:     v        = aa +  diag[i];
3063:     x[idt]   = v[0]*s1;
3064:   }
3065:   VecRestoreArray(bb,&b);
3066:   VecRestoreArray(xx,&x);
3067:   PetscLogFlops(2*(a->nz) - A->cmap.n);
3068:   return(0);
3069: }

3071: /* ----------------------------------------------------------------*/
3072: /*
3073:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3074:    except that the data structure of Mat_SeqAIJ is slightly different.
3075:    Not a good example of code reuse.
3076: */

3080: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3081: {
3082:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3083:   IS             isicol;
3085:   PetscInt       *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3086:   PetscInt       *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3087:   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3088:   PetscInt       incrlev,nnz,i,bs = A->rmap.bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3089:   PetscTruth     col_identity,row_identity,flg;
3090:   PetscReal      f;

3093:   f             = info->fill;
3094:   levels        = (PetscInt)info->levels;
3095:   diagonal_fill = (PetscInt)info->diagonal_fill;
3096:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3097:   ISIdentity(isrow,&row_identity);
3098:   ISIdentity(iscol,&col_identity);

3100:   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3101:     MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3102:     (*fact)->factor = FACTOR_LU;
3103:     b               = (Mat_SeqBAIJ*)(*fact)->data;
3104:     MatMissingDiagonal_SeqBAIJ(*fact,&flg,&dd);
3105:     if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3106:     b->row        = isrow;
3107:     b->col        = iscol;
3108:     PetscObjectReference((PetscObject)isrow);
3109:     PetscObjectReference((PetscObject)iscol);
3110:     b->icol       = isicol;
3111:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3112:     PetscMalloc(((*fact)->rmap.N+1+(*fact)->rmap.bs)*sizeof(PetscScalar),&b->solve_work);
3113:   } else { /* general case perform the symbolic factorization */
3114:     ISGetIndices(isrow,&r);
3115:     ISGetIndices(isicol,&ic);

3117:     /* get new row pointers */
3118:     PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
3119:     ainew[0] = 0;
3120:     /* don't know how many column pointers are needed so estimate */
3121:     jmax = (PetscInt)(f*ai[n] + 1);
3122:     PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
3123:     /* ajfill is level of fill for each fill entry */
3124:     PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
3125:     /* fill is a linked list of nonzeros in active row */
3126:     PetscMalloc((n+1)*sizeof(PetscInt),&fill);
3127:     /* im is level for each filled value */
3128:     PetscMalloc((n+1)*sizeof(PetscInt),&im);
3129:     /* dloc is location of diagonal in factor */
3130:     PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
3131:     dloc[0]  = 0;
3132:     for (prow=0; prow<n; prow++) {

3134:       /* copy prow into linked list */
3135:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3136:       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3137:       xi         = aj + ai[r[prow]];
3138:       fill[n]    = n;
3139:       fill[prow] = -1; /* marker for diagonal entry */
3140:       while (nz--) {
3141:         fm  = n;
3142:         idx = ic[*xi++];
3143:         do {
3144:           m  = fm;
3145:           fm = fill[m];
3146:         } while (fm < idx);
3147:         fill[m]   = idx;
3148:         fill[idx] = fm;
3149:         im[idx]   = 0;
3150:       }

3152:       /* make sure diagonal entry is included */
3153:       if (diagonal_fill && fill[prow] == -1) {
3154:         fm = n;
3155:         while (fill[fm] < prow) fm = fill[fm];
3156:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
3157:         fill[fm]   = prow;
3158:         im[prow]   = 0;
3159:         nzf++;
3160:         dcount++;
3161:       }

3163:       nzi = 0;
3164:       row = fill[n];
3165:       while (row < prow) {
3166:         incrlev = im[row] + 1;
3167:         nz      = dloc[row];
3168:         xi      = ajnew  + ainew[row] + nz + 1;
3169:         flev    = ajfill + ainew[row] + nz + 1;
3170:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
3171:         fm      = row;
3172:         while (nnz-- > 0) {
3173:           idx = *xi++;
3174:           if (*flev + incrlev > levels) {
3175:             flev++;
3176:             continue;
3177:           }
3178:           do {
3179:             m  = fm;
3180:             fm = fill[m];
3181:           } while (fm < idx);
3182:           if (fm != idx) {
3183:             im[idx]   = *flev + incrlev;
3184:             fill[m]   = idx;
3185:             fill[idx] = fm;
3186:             fm        = idx;
3187:             nzf++;
3188:           } else {
3189:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3190:           }
3191:           flev++;
3192:         }
3193:         row = fill[row];
3194:         nzi++;
3195:       }
3196:       /* copy new filled row into permanent storage */
3197:       ainew[prow+1] = ainew[prow] + nzf;
3198:       if (ainew[prow+1] > jmax) {

3200:         /* estimate how much additional space we will need */
3201:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3202:         /* just double the memory each time */
3203:         PetscInt maxadd = jmax;
3204:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3205:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3206:         jmax += maxadd;

3208:         /* allocate a longer ajnew and ajfill */
3209:         PetscMalloc(jmax*sizeof(PetscInt),&xi);
3210:         PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));
3211:         PetscFree(ajnew);
3212:         ajnew = xi;
3213:         PetscMalloc(jmax*sizeof(PetscInt),&xi);
3214:         PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));
3215:         PetscFree(ajfill);
3216:         ajfill = xi;
3217:         reallocate++; /* count how many reallocations are needed */
3218:       }
3219:       xi          = ajnew + ainew[prow];
3220:       flev        = ajfill + ainew[prow];
3221:       dloc[prow]  = nzi;
3222:       fm          = fill[n];
3223:       while (nzf--) {
3224:         *xi++   = fm;
3225:         *flev++ = im[fm];
3226:         fm      = fill[fm];
3227:       }
3228:       /* make sure row has diagonal entry */
3229:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3230:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3231:     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3232:       }
3233:     }
3234:     PetscFree(ajfill);
3235:     ISRestoreIndices(isrow,&r);
3236:     ISRestoreIndices(isicol,&ic);
3237:     PetscFree(fill);
3238:     PetscFree(im);

3240: #if defined(PETSC_USE_INFO)
3241:     {
3242:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3243:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
3244:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
3245:       PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
3246:       PetscInfo(A,"for best performance.\n");
3247:       if (diagonal_fill) {
3248:         PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
3249:       }
3250:     }
3251: #endif

3253:     /* put together the new matrix */
3254:     MatCreate(((PetscObject)A)->comm,fact);
3255:     MatSetSizes(*fact,bs*n,bs*n,bs*n,bs*n);
3256:     MatSetType(*fact,((PetscObject)A)->type_name);
3257:     MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
3258:     PetscLogObjectParent(*fact,isicol);
3259:     b    = (Mat_SeqBAIJ*)(*fact)->data;
3260:     b->free_a       = PETSC_TRUE;
3261:     b->free_ij      = PETSC_TRUE;
3262:     b->singlemalloc = PETSC_FALSE;
3263:     PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
3264:     b->j          = ajnew;
3265:     b->i          = ainew;
3266:     for (i=0; i<n; i++) dloc[i] += ainew[i];
3267:     b->diag       = dloc;
3268:     b->ilen       = 0;
3269:     b->imax       = 0;
3270:     b->row        = isrow;
3271:     b->col        = iscol;
3272:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3273:     PetscObjectReference((PetscObject)isrow);
3274:     PetscObjectReference((PetscObject)iscol);
3275:     b->icol       = isicol;
3276:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3277:     /* In b structure:  Free imax, ilen, old a, old j.  
3278:        Allocate dloc, solve_work, new a, new j */
3279:     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
3280:     b->maxnz          = b->nz = ainew[n];
3281:     (*fact)->factor   = FACTOR_LU;

3283:     (*fact)->info.factor_mallocs    = reallocate;
3284:     (*fact)->info.fill_ratio_given  = f;
3285:     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3286:   }

3288:   if (row_identity && col_identity) {
3289:     MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3290:   }
3291:   return(0);
3292: }

3296: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3297: {
3298:   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3299:   /* int i,*AJ=a->j,nz=a->nz; */
3301:   /* Undo Column scaling */
3302: /*    while (nz--) { */
3303: /*      AJ[i] = AJ[i]/4; */
3304: /*    } */
3305:   /* This should really invoke a push/pop logic, but we don't have that yet. */
3306:   A->ops->setunfactored = PETSC_NULL;
3307:   return(0);
3308: }

3312: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3313: {
3314:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3315:   PetscInt       *AJ=a->j,nz=a->nz;
3316:   unsigned short *aj=(unsigned short *)AJ;
3318:   /* Is this really necessary? */
3319:   while (nz--) {
3320:     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3321:   }
3322:   A->ops->setunfactored = PETSC_NULL;
3323:   return(0);
3324: }

3328: PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3329: {
3331:   /*
3332:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3333:       with natural ordering
3334:   */
3336:   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3337:   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
3338:   switch (inA->rmap.bs) {
3339:   case 1:
3340:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3341:     PetscInfo(inA,"Using special in-place natural ordering factor BS=1\n");
3342:     break;
3343:   case 2:
3344:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3345:     PetscInfo(inA,"Using special in-place natural ordering factor BS=2\n");
3346:     break;
3347:   case 3:
3348:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3349:     PetscInfo(inA,"Using special in-place natural ordering factor BS=3\n");
3350:     break;
3351:   case 4:
3352: #if defined(PETSC_USE_MAT_SINGLE)
3353:     {
3354:       PetscTruth  sse_enabled_local;
3356:       PetscSSEIsEnabled(((PetscObject)inA)->comm,&sse_enabled_local,PETSC_NULL);
3357:       if (sse_enabled_local) {
3358: #  if defined(PETSC_HAVE_SSE)
3359:         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3360:         if (n==(unsigned short)n) {
3361:           unsigned short *aj=(unsigned short *)AJ;
3362:           for (i=0;i<nz;i++) {
3363:             aj[i] = (unsigned short)AJ[i];
3364:           }
3365:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3366:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3367:           PetscInfo(inA,"Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3368:         } else {
3369:         /* Scale the column indices for easier indexing in MatSolve. */
3370: /*            for (i=0;i<nz;i++) { */
3371: /*              AJ[i] = AJ[i]*4; */
3372: /*            } */
3373:           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3374:           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3375:           PetscInfo(inA,"Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3376:         }
3377: #  else
3378:       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3379:         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3380: #  endif
3381:       } else {
3382:         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3383:         PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");
3384:       }
3385:     }
3386: #else
3387:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3388:     PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");
3389: #endif
3390:     break;
3391:   case 5:
3392:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3393:     PetscInfo(inA,"Using special in-place natural ordering factor BS=5\n");
3394:     break;
3395:   case 6:
3396:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3397:     PetscInfo(inA,"Using special in-place natural ordering factor BS=6\n");
3398:     break;
3399:   case 7:
3400:     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3401:     PetscInfo(inA,"Using special in-place natural ordering factor BS=7\n");
3402:     break;
3403:   }
3404:   return(0);
3405: }

3409: PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3410: {
3411:   /*
3412:       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 
3413:       with natural ordering
3414:   */
3415:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
3416:   IS             row = a->row, col = a->col;
3417:   PetscTruth     row_identity, col_identity;
3418:   PetscTruth     use_natural;


3423:   use_natural = PETSC_FALSE;
3424:   if (row && col) {
3425:     ISIdentity(row,&row_identity);
3426:     ISIdentity(col,&col_identity);

3428:     if (row_identity && col_identity) {
3429:       use_natural = PETSC_TRUE;
3430:     }
3431:   } else {
3432:     use_natural = PETSC_TRUE;
3433:   }

3435:   switch (A->rmap.bs) {
3436:   case 1:
3437:     if (use_natural) {
3438:       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3439:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3440:       PetscInfo(A,"Using special in-place natural ordering solve BS=1\n");
3441:       PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3442:     } else {
3443:       A->ops->solve           = MatSolve_SeqBAIJ_1;
3444:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3445:     }
3446:     break;
3447:   case 2:
3448:     if (use_natural) {
3449:       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3450:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3451:       PetscInfo(A,"Using special in-place natural ordering solve BS=2\n");
3452:       PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3453:     } else {
3454:       A->ops->solve           = MatSolve_SeqBAIJ_2;
3455:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3456:     }
3457:     break;
3458:   case 3:
3459:     if (use_natural) {
3460:       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3461:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3462:       PetscInfo(A,"Using special in-place natural ordering solve BS=3\n");
3463:       PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3464:     } else {
3465:       A->ops->solve           = MatSolve_SeqBAIJ_3;
3466:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3467:     }
3468:     break;
3469:   case 4:
3470:     {
3471:       PetscTruth sse_enabled_local;
3472:       PetscSSEIsEnabled(((PetscObject)A)->comm,&sse_enabled_local,PETSC_NULL);
3473:       if (use_natural) {
3474: #if defined(PETSC_USE_MAT_SINGLE)
3475:         if (sse_enabled_local) { /* Natural + Single + SSE */
3476: #  if defined(PETSC_HAVE_SSE)
3477:           PetscInt n=a->mbs;
3478:           if (n==(unsigned short)n) {
3479:             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3480:             PetscInfo(A,"Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3481:           } else {
3482:             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3483:             PetscInfo(A,"Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3484:           }
3485: #  else
3486:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3487:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3488: #  endif
3489:         } else { /* Natural + Single */
3490:           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3491:           PetscInfo(A,"Using single precision, in-place, natural ordering solve BS=4\n");
3492:         }
3493: #else
3494:         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3495:         PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");
3496: #endif
3497:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3498:         PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");
3499:       } else { /* Arbitrary ordering */
3500: #if defined(PETSC_USE_MAT_SINGLE)
3501:         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3502: #  if defined(PETSC_HAVE_SSE)
3503:           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3504:           PetscInfo(A,"Using single precision, SSE solve BS=4\n");
3505: #  else
3506:           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3507:           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3508: #  endif
3509:         } else { /* Arbitrary + Single */
3510:           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3511:           PetscInfo(A,"Using single precision solve BS=4\n");
3512:         }
3513: #else
3514:         A->ops->solve           = MatSolve_SeqBAIJ_4;
3515: #endif
3516:         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3517:       }
3518:     }
3519:     break;
3520:   case 5:
3521:     if (use_natural) {
3522:       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3523:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3524:       PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");
3525:       PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");
3526:     } else {
3527:       A->ops->solve           = MatSolve_SeqBAIJ_5;
3528:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3529:     }
3530:     break;
3531:   case 6:
3532:     if (use_natural) {
3533:       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3534:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3535:       PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");
3536:       PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");
3537:     } else {
3538:       A->ops->solve           = MatSolve_SeqBAIJ_6;
3539:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3540:     }
3541:     break;
3542:   case 7:
3543:     if (use_natural) {
3544:       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3545:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3546:       PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");
3547:       PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");
3548:     } else {
3549:       A->ops->solve           = MatSolve_SeqBAIJ_7;
3550:       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3551:     }
3552:     break;
3553:   default:
3554:     A->ops->solve             = MatSolve_SeqBAIJ_N;
3555:     break;
3556:   }
3557:   return(0);
3558: }

3562: PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3563: {

3567:   MatSeqBAIJ_UpdateSolvers(A);
3568:   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3569:     (*A->ops->solve)(A,x,y);
3570:   } else {
3571:     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3572:   }
3573:   return(0);
3574: }

3578: PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3579: {

3583:   MatSeqBAIJ_UpdateSolvers(A);
3584:   (*A->ops->solvetranspose)(A,x,y);
3585:   return(0);
3586: }