Actual source code: dgefa4.c

  1: /*
  2:        Inverts a 4 by 4 matrix in place using partial pivoting.

  4:        Used by the sparse factorization routines in
  5:        src/mat/impls/baij/seq and src/mat/impls/bdiag/seq

  7:        See also src/inline/ilu.h

  9:        This is a combination of the Linpack routines
 10:        dgefa() and dgedi() specialized for a size of 4.

 12: */
 13:  #include "petsc.h"

 17: PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *a)
 18: {
 19:     PetscInt   i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
 20:     PetscInt   k4,j3;
 21:     MatScalar  *aa,*ax,*ay,work[16],stmp;
 22:     MatReal    tmp,max;

 24: /*     Gaussian elimination with partial pivoting */
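    /*
       Added outline (not in the original source): the loop below is the dgefa stage,
       i.e. three steps of LU factorization with partial pivoting on the 4x4 block.
       For each column k it selects the largest remaining entry as pivot, records the
       row interchange in ipvt[], stores the negated multipliers in place below the
       diagonal, and updates the trailing columns.
    */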

 27:     /* Parameter adjustments: shift so a[i + 4*j] addresses the (i,j) entry, 1-based, column-major */
 28:     a       -= 5;

 30:     for (k = 1; k <= 3; ++k) {
 31:         kp1 = k + 1;
 32:         k3  = 4*k;
 33:         k4  = k3 + k;
 34: /*        find l = pivot index */

 36:         i__2 = 4 - k;
 37:         aa = &a[k4];
 38:         max = PetscAbsScalar(aa[0]);
 39:         l = 1;
 40:         for (ll=1; ll<i__2; ll++) {
 41:           tmp = PetscAbsScalar(aa[ll]);
 42:           if (tmp > max) { max = tmp; l = ll+1;}
 43:         }
 44:         l       += k - 1;
 45:         ipvt[k-1] = l;

 47:         if (a[l + k3] == 0.0) {
 48:           SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
 49:         }

 51: /*           interchange if necessary */

 53:         if (l != k) {
 54:           stmp      = a[l + k3];
 55:           a[l + k3] = a[k4];
 56:           a[k4]     = stmp;
 57:         }

 59: /*           compute multipliers */

 61:         stmp = -1. / a[k4];
 62:         i__2 = 4 - k;
 63:         aa = &a[1 + k4];
 64:         for (ll=0; ll<i__2; ll++) {
 65:           aa[ll] *= stmp;
 66:         }

 68: /*           row elimination with column indexing */

 70:         ax = &a[k4+1];
 71:         for (j = kp1; j <= 4; ++j) {
 72:             j3   = 4*j;
 73:             stmp = a[l + j3];
 74:             if (l != k) {
 75:               a[l + j3] = a[k + j3];
 76:               a[k + j3] = stmp;
 77:             }

 79:             i__3 = 4 - k;
 80:             ay = &a[1+k+j3];
 81:             for (ll=0; ll<i__3; ll++) {
 82:               ay[ll] += stmp*ax[ll];
 83:             }
 84:         }
 85:     }
 86:     ipvt[3] = 4;
 87:     if (a[20] == 0.0) {
 88:       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
 89:     }

 91:     /*
 92:          Now form the inverse 
 93:     */

 95:    /*     compute inverse(u) */
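    /*
       Added note: this begins the dgedi stage.  The loop below overwrites the upper
       triangle with inverse(U), one column at a time: invert the diagonal entry,
       scale the entries above it, then fold column k into each column to its right.
    */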

 97:     for (k = 1; k <= 4; ++k) {
 98:         k3    = 4*k;
 99:         k4    = k3 + k;
100:         a[k4] = 1.0 / a[k4];
101:         stmp  = -a[k4];
102:         i__2  = k - 1;
103:         aa    = &a[k3 + 1];
104:         for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
105:         kp1 = k + 1;
106:         if (4 < kp1) continue;
107:         ax = aa;
108:         for (j = kp1; j <= 4; ++j) {
109:             j3        = 4*j;
110:             stmp      = a[k + j3];
111:             a[k + j3] = 0.0;
112:             ay        = &a[j3 + 1];
113:             for (ll=0; ll<k; ll++) {
114:               ay[ll] += stmp*ax[ll];
115:             }
116:         }
117:     }

119:    /*    form inverse(u)*inverse(l) */
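    /*
       Added note: the backward sweep below applies inverse(L) using the multipliers
       saved below the diagonal during elimination (copied into work[] and accumulated
       column by column), and undoes each row interchange recorded in ipvt[] as a
       column swap.  On exit the block holds the inverse of the original matrix.
    */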

121:     for (kb = 1; kb <= 3; ++kb) {
122:         k   = 4 - kb;
123:         k3  = 4*k;
124:         kp1 = k + 1;
125:         aa  = a + k3;
126:         for (i = kp1; i <= 4; ++i) {
127:             work[i-1] = aa[i];
128:             aa[i]   = 0.0;
129:         }
130:         for (j = kp1; j <= 4; ++j) {
131:             stmp  = work[j-1];
132:             ax    = &a[4*j + 1];
133:             ay    = &a[k3 + 1];
134:             ay[0] += stmp*ax[0];
135:             ay[1] += stmp*ax[1];
136:             ay[2] += stmp*ax[2];
137:             ay[3] += stmp*ax[3];
138:         }
139:         l = ipvt[k-1];
140:         if (l != k) {
141:             ax = &a[k3 + 1];
142:             ay = &a[4*l + 1];
143:             stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
144:             stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
145:             stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
146:             stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
147:         }
148:     }
149:     return(0);
150: }
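
/*
   Illustrative usage (a sketch, not part of dgefa4.c): invert one 4x4 block in
   place and apply the result.  The layout follows the a[row + 4*col] indexing
   used above, i.e. column-major; the function and variable names here are
   hypothetical.
*/
static PetscErrorCode ExampleInvert4x4Block(MatScalar blk[16],const MatScalar x[4],MatScalar y[4])
{
  PetscErrorCode ierr;
  PetscInt       i,j;

  ierr = Kernel_A_gets_inverse_A_4(blk);CHKERRQ(ierr); /* blk now holds inverse(A) */
  for (i=0; i<4; i++) {                                /* y = inverse(A)*x         */
    y[i] = 0.0;
    for (j=0; j<4; j++) y[i] += blk[i + 4*j]*x[j];
  }
  return(0);
}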

152: #if defined(PETSC_HAVE_SSE)
153: #include PETSC_HAVE_SSE

157: PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(float *a)
158: {
159:   /* 
160:      This routine is converted from Intel's Small Matrix Library.
161:      See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
162:      Order Number: 245043-001
163:      March 1999
164:      http://www.intel.com

 166:      Inverse of a 4x4 matrix via Cramer's Rule:
167:      bool Invert4x4(SMLXMatrix &);
168:   */
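  /*
     Added note: Cramer's Rule gives inverse(A) = adjugate(A)/det(A), the adjugate
     being the transposed matrix of cofactors.  The code below computes the sixteen
     cofactors four at a time with packed multiplies and shuffles, forms det(A) as a
     dot product of matrix entries with their cofactors, and finally scales the
     cofactors by an approximate reciprocal of the determinant.
  */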

171:   SSE_SCOPE_BEGIN;
172:     SSE_INLINE_BEGIN_1(a)

174: /* ----------------------------------------------- */

176:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
177:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

179:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
180:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

182:       SSE_COPY_PS(XMM3,XMM0)
183:       SSE_SHUFFLE(XMM3,XMM5,0x88)

185:       SSE_SHUFFLE(XMM5,XMM0,0xDD)

187:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
188:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

190:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
191:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

193:       SSE_COPY_PS(XMM4,XMM0)
194:       SSE_SHUFFLE(XMM4,XMM6,0x88)

196:       SSE_SHUFFLE(XMM6,XMM0,0xDD)

198: /* ----------------------------------------------- */

200:       SSE_COPY_PS(XMM7,XMM4)
201:       SSE_MULT_PS(XMM7,XMM6)

203:       SSE_SHUFFLE(XMM7,XMM7,0xB1)

205:       SSE_COPY_PS(XMM0,XMM5)
206:       SSE_MULT_PS(XMM0,XMM7)

208:       SSE_COPY_PS(XMM2,XMM3)
209:       SSE_MULT_PS(XMM2,XMM7)

211:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

213:       SSE_COPY_PS(XMM1,XMM5)
214:       SSE_MULT_PS(XMM1,XMM7)
215:       SSE_SUB_PS(XMM1,XMM0)

217:       SSE_MULT_PS(XMM7,XMM3)
218:       SSE_SUB_PS(XMM7,XMM2)

220:       SSE_SHUFFLE(XMM7,XMM7,0x4E)
221:       SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

223: /* ----------------------------------------------- */

225:       SSE_COPY_PS(XMM0,XMM5)
226:       SSE_MULT_PS(XMM0,XMM4)

228:       SSE_SHUFFLE(XMM0,XMM0,0xB1)

230:       SSE_COPY_PS(XMM2,XMM6)
231:       SSE_MULT_PS(XMM2,XMM0)
232:       SSE_ADD_PS(XMM2,XMM1)
233: 
234:       SSE_COPY_PS(XMM7,XMM3)
235:       SSE_MULT_PS(XMM7,XMM0)

237:       SSE_SHUFFLE(XMM0,XMM0,0x4E)

239:       SSE_COPY_PS(XMM1,XMM6)
240:       SSE_MULT_PS(XMM1,XMM0)
241:       SSE_SUB_PS(XMM2,XMM1)

243:       SSE_MULT_PS(XMM0,XMM3)
244:       SSE_SUB_PS(XMM0,XMM7)

246:       SSE_SHUFFLE(XMM0,XMM0,0x4E)
247:       SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

249:       /* ----------------------------------------------- */

251:       SSE_COPY_PS(XMM7,XMM5)
252:       SSE_SHUFFLE(XMM7,XMM5,0x4E)
253:       SSE_MULT_PS(XMM7,XMM6)

255:       SSE_SHUFFLE(XMM7,XMM7,0xB1)

257:       SSE_SHUFFLE(XMM4,XMM4,0x4E)

259:       SSE_COPY_PS(XMM0,XMM4)
260:       SSE_MULT_PS(XMM0,XMM7)
261:       SSE_ADD_PS(XMM0,XMM2)

263:       SSE_COPY_PS(XMM2,XMM3)
264:       SSE_MULT_PS(XMM2,XMM7)

266:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

268:       SSE_COPY_PS(XMM1,XMM4)
269:       SSE_MULT_PS(XMM1,XMM7)
270:       SSE_SUB_PS(XMM0,XMM1)
271:       SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

273:       SSE_MULT_PS(XMM7,XMM3)
274:       SSE_SUB_PS(XMM7,XMM2)

276:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

278:       /* ----------------------------------------------- */

280:       SSE_COPY_PS(XMM1,XMM3)
281:       SSE_MULT_PS(XMM1,XMM5)

283:       SSE_SHUFFLE(XMM1,XMM1,0xB1)

285:       SSE_COPY_PS(XMM0,XMM6)
286:       SSE_MULT_PS(XMM0,XMM1)
287:       SSE_ADD_PS(XMM0,XMM7)
288: 
289:       SSE_COPY_PS(XMM2,XMM4)
290:       SSE_MULT_PS(XMM2,XMM1)
291:       SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

293:       SSE_SHUFFLE(XMM1,XMM1,0x4E)

295:       SSE_COPY_PS(XMM7,XMM6)
296:       SSE_MULT_PS(XMM7,XMM1)
297:       SSE_SUB_PS(XMM7,XMM0)

299:       SSE_MULT_PS(XMM1,XMM4)
300:       SSE_SUB_PS(XMM2,XMM1)
301:       SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

303:       /* ----------------------------------------------- */

305:       SSE_COPY_PS(XMM1,XMM3)
306:       SSE_MULT_PS(XMM1,XMM6)

308:       SSE_SHUFFLE(XMM1,XMM1,0xB1)

310:       SSE_COPY_PS(XMM2,XMM4)
311:       SSE_MULT_PS(XMM2,XMM1)
312:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
313:       SSE_SUB_PS(XMM0,XMM2)

315:       SSE_COPY_PS(XMM2,XMM5)
316:       SSE_MULT_PS(XMM2,XMM1)
317:       SSE_ADD_PS(XMM2,XMM7)

319:       SSE_SHUFFLE(XMM1,XMM1,0x4E)

321:       SSE_COPY_PS(XMM7,XMM4)
322:       SSE_MULT_PS(XMM7,XMM1)
323:       SSE_ADD_PS(XMM7,XMM0)

325:       SSE_MULT_PS(XMM1,XMM5)
326:       SSE_SUB_PS(XMM2,XMM1)

328:       /* ----------------------------------------------- */

330:       SSE_MULT_PS(XMM4,XMM3)

332:       SSE_SHUFFLE(XMM4,XMM4,0xB1)

334:       SSE_COPY_PS(XMM1,XMM6)
335:       SSE_MULT_PS(XMM1,XMM4)
336:       SSE_ADD_PS(XMM1,XMM7)

338:       SSE_COPY_PS(XMM0,XMM5)
339:       SSE_MULT_PS(XMM0,XMM4)
340:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
341:       SSE_SUB_PS(XMM7,XMM0)

343:       SSE_SHUFFLE(XMM4,XMM4,0x4E)

345:       SSE_MULT_PS(XMM6,XMM4)
346:       SSE_SUB_PS(XMM1,XMM6)

348:       SSE_MULT_PS(XMM5,XMM4)
349:       SSE_ADD_PS(XMM5,XMM7)

351:       /* ----------------------------------------------- */

353:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
354:       SSE_MULT_PS(XMM3,XMM0)

356:       SSE_COPY_PS(XMM4,XMM3)
357:       SSE_SHUFFLE(XMM4,XMM3,0x4E)
358:       SSE_ADD_PS(XMM4,XMM3)

360:       SSE_COPY_PS(XMM6,XMM4)
361:       SSE_SHUFFLE(XMM6,XMM4,0xB1)
362:       SSE_ADD_SS(XMM6,XMM4)
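      /* Added note: the shuffles/adds above reduce the four products to det(A) in the
         low element of XMM6; below, 1/det is approximated (SSE_RECIP_SS) and refined
         with one Newton-Raphson step, x1 = x0*(2 - det*x0), then broadcast and used
         to scale the cofactors. */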

364:       SSE_COPY_PS(XMM3,XMM6)
365:       SSE_RECIP_SS(XMM3,XMM6)
366:       SSE_COPY_SS(XMM4,XMM3)
367:       SSE_ADD_SS(XMM4,XMM3)
368:       SSE_MULT_SS(XMM3,XMM3)
369:       SSE_MULT_SS(XMM6,XMM3)
370:       SSE_SUB_SS(XMM4,XMM6)

372:       SSE_SHUFFLE(XMM4,XMM4,0x00)

374:       SSE_MULT_PS(XMM0,XMM4)
375:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
376:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

378:       SSE_MULT_PS(XMM1,XMM4)
379:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
380:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

382:       SSE_MULT_PS(XMM2,XMM4)
383:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
384:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

386:       SSE_MULT_PS(XMM4,XMM5)
387:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
388:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

390:       /* ----------------------------------------------- */

392:       SSE_INLINE_END_1;
393:   SSE_SCOPE_END;

395:   return(0);
396: }

398: #endif