Bug Summary

File: mat/impls/baij/seq/baijfact11.c
Warning: line 507, column 9
Value stored to 'ierr' is never read
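
This is a dead-store diagnostic: PetscKernel_A_gets_A_times_B_4() is a statement-style macro whose expansion begins with a literal 0 followed by a block that reassigns ierr, so the outer store "ierr = 0" at line 507 is overwritten before it is ever read. Below is a minimal standalone sketch of the pattern (hypothetical code, not the PETSc macro itself; do_work() and KERNEL_MACRO are illustrative names):

#include <stdio.h>

static int do_work(void) { return 0; }

/* Expands the way the PETSc kernel macro does: a leading 0, then a block
   that reassigns ierr, which makes the outer store "ierr = 0" dead. */
#define KERNEL_MACRO() 0; { ierr = do_work(); }

int main(void)
{
  int ierr;
  ierr = KERNEL_MACRO(); /* analyzer: value stored to 'ierr' is never read */
  if (ierr) { fprintf(stderr, "kernel failed\n"); return ierr; }
  return 0;
}

The dead store is harmless at run time but keeps analyzer output noisy; a common cleanup is to make the macro expand to a single expression, or to drop the unused leading value from the expansion.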

Annotated Source Code


1
2/*
3 Factorization code for BAIJ format.
4*/
5#include <../src/mat/impls/baij/seq/baij.h>
6#include <petsc/private/kernels/blockinvert.h>
7
8/* ------------------------------------------------------------*/
9/*
10 Version for when blocks are 4 by 4
11*/
12PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
13{
14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
15 IS isrow = b->row,isicol = b->icol;
16 PetscErrorCode ierr;
17 const PetscInt *r,*ic;
18 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
19 PetscInt *ajtmpold,*ajtmp,nz,row;
20 PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
21 MatScalar *pv,*v,*rtmp,*pc,*w,*x;
22 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
23 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
24 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
25 MatScalar m13,m14,m15,m16;
26 MatScalar *ba = b->a,*aa = a->a;
27 PetscBool pivotinblocks = b->pivotinblocks;
28 PetscReal shift = info->shiftamount;
29 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
30
31 PetscFunctionBegin;
32 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
33 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
34 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
35 allowzeropivot = PetscNot(A->erroriffailure);
36
37 for (i=0; i<n; i++) {
38 nz = bi[i+1] - bi[i];
39 ajtmp = bj + bi[i];
40 for (j=0; j<nz; j++) {
41 x = rtmp+16*ajtmp[j];
42 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
43 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44 }
45 /* load in initial (unfactored row) */
46 idx = r[i];
47 nz = ai[idx+1] - ai[idx];
48 ajtmpold = aj + ai[idx];
49 v = aa + 16*ai[idx];
50 for (j=0; j<nz; j++) {
51 x = rtmp+16*ic[ajtmpold[j]];
52 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
53 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
54 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55 x[14] = v[14]; x[15] = v[15];
56 v += 16;
57 }
58 row = *ajtmp++;
59 while (row < i) {
60 pc = rtmp + 16*row;
61 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
62 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
63 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64 p15 = pc[14]; p16 = pc[15];
65 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68 || p16 != 0.0) {
69 pv = ba + 16*diag_offset[row];
70 pj = bj + diag_offset[row] + 1;
71 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
72 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
73 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74 x15 = pv[14]; x16 = pv[15];
75 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
76 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
77 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
78 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
79
80 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
81 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
82 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
83 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
84
85 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
86 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
87 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
88 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
89
90 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
91 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
92 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
93 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
94
95 nz = bi[row+1] - diag_offset[row] - 1;
96 pv += 16;
97 for (j=0; j<nz; j++) {
98 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
99 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
100 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101 x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
102 x = rtmp + 16*pj[j];
103 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
104 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
105 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
106 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
107
108 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
109 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
110 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
111 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
112
113 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
114 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
117
118 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
119 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
120 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
121 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
122
123 pv += 16;
124 }
125 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
126 }
127 row = *ajtmp++;
128 }
129 /* finished row so stick it into b->a */
130 pv = ba + 16*bi[i];
131 pj = bj + bi[i];
132 nz = bi[i+1] - bi[i];
133 for (j=0; j<nz; j++) {
134 x = rtmp+16*pj[j];
135 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
136 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
137 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139 pv += 16;
140 }
141 /* invert diagonal block */
142 w = ba + 16*diag_offset[i];
143 if (pivotinblocks) {
144 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
145 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
146 } else {
147 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148 }
149 }
150
151 ierr = PetscFree(rtmp);CHKERRQ(ierr);
152 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154
155 C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
156 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
157 C->assembled = PETSC_TRUE;
158
159 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
160 PetscFunctionReturn(0);
161}
162
163/* MatLUFactorNumeric_SeqBAIJ_4 -
164 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
165 PetscKernel_A_gets_A_times_B()
166 PetscKernel_A_gets_A_minus_B_times_C()
167 PetscKernel_A_gets_inverse_A()
168*/
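
For reference, the operation the first of these inlined kernels performs on a column-major 4x4 block is A <- A*B, staged through a 16-entry work array. A short illustrative sketch follows (a hypothetical helper assuming double for MatScalar; this is not the PETSc macro, which fully unrolls the three loops):

#include <string.h>

/* A <- A*B for column-major 4x4 blocks, using work[] as a scratch copy of A.
   Entry (i,j) of a block lives at index i + 4*j. */
static void a_gets_a_times_b_4(double a[16], const double b[16], double work[16])
{
  int i, j, k;
  memcpy(work, a, 16*sizeof(double)); /* work = A */
  for (j = 0; j < 4; j++) {
    for (i = 0; i < 4; i++) {
      double s = 0.0;
      for (k = 0; k < 4; k++) s += work[i + 4*k]*b[k + 4*j]; /* A(i,k)*B(k,j) */
      a[i + 4*j] = s;
    }
  }
}

The unrolled macro bodies visible in this listing (e.g. pc[0] = mwork[0]*pv[0] + mwork[4]*pv[1] + mwork[8]*pv[2] + mwork[12]*pv[3]) are exactly these loops written out term by term.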
169
170PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
171{
172 Mat C = B;
173 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
174 IS isrow = b->row,isicol = b->icol;
175 PetscErrorCode ierr;
176 const PetscInt *r,*ic;
177 PetscInt i,j,k,nz,nzL,row;
178 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
179 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
180 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
181 PetscInt flg;
182 PetscReal shift;
183 PetscBool allowzeropivot,zeropivotdetected;
184
185 PetscFunctionBegin;
186 allowzeropivot = PetscNot(A->erroriffailure);
187 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
188 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
189
190 if (info->shifttype == MAT_SHIFT_NONE) {
191 shift = 0;
192 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
193 shift = info->shiftamount;
194 }
195
196 /* generate work space needed by the factorization */
197 ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
198 ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr);
199
200 for (i=0; i<n; i++) {
201 /* zero rtmp */
202 /* L part */
203 nz = bi[i+1] - bi[i];
204 bjtmp = bj + bi[i];
205 for (j=0; j<nz; j++) {
206 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr);
207 }
208
209 /* U part */
210 nz = bdiag[i]-bdiag[i+1];
211 bjtmp = bj + bdiag[i+1]+1;
212 for (j=0; j<nz; j++) {
213 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr);
214 }
215
216 /* load in initial (unfactored row) */
217 nz = ai[r[i]+1] - ai[r[i]];
218 ajtmp = aj + ai[r[i]];
219 v = aa + bs2*ai[r[i]];
220 for (j=0; j<nz; j++) {
221 ierr = PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);CHKERRQ(ierr);
222 }
223
224 /* elimination */
225 bjtmp = bj + bi[i];
226 nzL = bi[i+1] - bi[i];
227 for (k=0; k < nzL; k++) {
228 row = bjtmp[k];
229 pc = rtmp + bs2*row;
230 for (flg=0,j=0; j<bs2; j++) {
231 if (pc[j]!=0.0) {
232 flg = 1;
233 break;
234 }
235 }
236 if (flg) {
237 pv = b->a + bs2*bdiag[row];
238 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
239 ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
240
241 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
242 pv = b->a + bs2*(bdiag[row+1]+1);
243 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
244 for (j=0; j<nz; j++) {
245 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
246 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
247 v = rtmp + bs2*pj[j];
248 ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
249 pv += bs2;
250 }
251 ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
252 }
253 }
254
255 /* finished row so stick it into b->a */
256 /* L part */
257 pv = b->a + bs2*bi[i];
258 pj = b->j + bi[i];
259 nz = bi[i+1] - bi[i];
260 for (j=0; j<nz; j++) {
261 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
262 }
263
264 /* Mark diagonal and invert diagonal for simpler triangular solves */
265 pv = b->a + bs2*bdiag[i];
266 pj = b->j + bdiag[i];
267 ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr);
268 ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
269 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
270
271 /* U part */
272 pv = b->a + bs2*(bdiag[i+1]+1);
273 pj = b->j + bdiag[i+1]+1;
274 nz = bdiag[i] - bdiag[i+1] - 1;
275 for (j=0; j<nz; j++) {
276 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
277 }
278 }
279
280 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
281 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
282 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
283
284 C->ops->solve = MatSolve_SeqBAIJ_4;
285 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
286 C->assembled = PETSC_TRUE;
287
288 ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
289 PetscFunctionReturn(0);
290}
291
292PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
293{
294/*
295 Default Version for when blocks are 4 by 4 Using natural ordering
296*/
297 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
298 PetscErrorCode ierr;
299 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
300 PetscInt *ajtmpold,*ajtmp,nz,row;
301 PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
302 MatScalar *pv,*v,*rtmp,*pc,*w,*x;
303 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
304 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
305 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
306 MatScalar m13,m14,m15,m16;
307 MatScalar *ba = b->a,*aa = a->a;
308 PetscBool pivotinblocks = b->pivotinblocks;
309 PetscReal shift = info->shiftamount;
310 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
311
312 PetscFunctionBegin;
313 allowzeropivot = PetscNot(A->erroriffailure);
314 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
315
316 for (i=0; i<n; i++) {
317 nz = bi[i+1] - bi[i];
318 ajtmp = bj + bi[i];
319 for (j=0; j<nz; j++) {
320 x = rtmp+16*ajtmp[j];
321 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
322 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
323 }
324 /* load in initial (unfactored row) */
325 nz = ai[i+1] - ai[i];
326 ajtmpold = aj + ai[i];
327 v = aa + 16*ai[i];
328 for (j=0; j<nz; j++) {
329 x = rtmp+16*ajtmpold[j];
330 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
331 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
332 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
333 x[14] = v[14]; x[15] = v[15];
334 v += 16;
335 }
336 row = *ajtmp++;
337 while (row < i) {
338 pc = rtmp + 16*row;
339 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
340 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
341 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
342 p15 = pc[14]; p16 = pc[15];
343 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
344 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
345 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
346 || p16 != 0.0) {
347 pv = ba + 16*diag_offset[row];
348 pj = bj + diag_offset[row] + 1;
349 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
350 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
351 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
352 x15 = pv[14]; x16 = pv[15];
353 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
354 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
355 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
356 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
357
358 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
359 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
360 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
361 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
362
363 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
364 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
365 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
366 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
367
368 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
369 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
370 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
371 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
372 nz = bi[row+1] - diag_offset[row] - 1;
373 pv += 16;
374 for (j=0; j<nz; j++) {
375 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
376 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
377 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
378 x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
379 x = rtmp + 16*pj[j];
380 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
381 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
382 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
383 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
384
385 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
386 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
387 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
388 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
389
390 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
391 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
392 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
393 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
394
395 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
396 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
397 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
398 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
399
400 pv += 16;
401 }
402 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
403 }
404 row = *ajtmp++;
405 }
406 /* finished row so stick it into b->a */
407 pv = ba + 16*bi[i];
408 pj = bj + bi[i];
409 nz = bi[i+1] - bi[i];
410 for (j=0; j<nz; j++) {
411 x = rtmp+16*pj[j];
412 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
413 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
414 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
415 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
416 pv += 16;
417 }
418 /* invert diagonal block */
419 w = ba + 16*diag_offset[i];
420 if (pivotinblocks) {
421 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
422 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
423 } else {
424 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
425 }
426 }
427
428 ierr = PetscFree(rtmp);CHKERRQ(ierr);
429
430 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
431 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
432 C->assembled = PETSC_TRUE;
433
434 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
435 PetscFunctionReturn(0);
436}
437
438/*
439 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
440 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
441*/
442PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
443{
444 Mat C =B;
445 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
446 PetscErrorCode ierr;
447 PetscInt i,j,k,nz,nzL,row;
448 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
449 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
450 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
451 PetscInt flg;
452 PetscReal shift;
453 PetscBool allowzeropivot,zeropivotdetected;
454
455 PetscFunctionBegin;
456 allowzeropivot = PetscNot(A->erroriffailure);
457
458 /* generate work space needed by the factorization */
459 ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
460 ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr);
461
462 if (info->shifttype == MAT_SHIFT_NONE) {
463 shift = 0;
464 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
465 shift = info->shiftamount;
466 }
467
468 for (i=0; i<n; i++) {
469 /* zero rtmp */
470 /* L part */
471 nz = bi[i+1] - bi[i];
472 bjtmp = bj + bi[i];
473 for (j=0; j<nz; j++) {
474 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr);
475 }
476
477 /* U part */
478 nz = bdiag[i] - bdiag[i+1];
479 bjtmp = bj + bdiag[i+1]+1;
480 for (j=0; j<nz; j++) {
481 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr);
482 }
483
484 /* load in initial (unfactored row) */
485 nz = ai[i+1] - ai[i];
486 ajtmp = aj + ai[i];
487 v = aa + bs2*ai[i];
488 for (j=0; j<nz; j++) {
489 ierr = PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);CHKERRQ(ierr);
490 }
491
492 /* elimination */
493 bjtmp = bj + bi[i];
494 nzL = bi[i+1] - bi[i];
495 for (k=0; k < nzL; k++) {
496 row = bjtmp[k];
497 pc = rtmp + bs2*row;
498 for (flg=0,j=0; j<bs2; j++) {
499 if (pc[j]!=0.0) {
500 flg = 1;
501 break;
502 }
503 }
504 if (flg) {
505 pv = b->a + bs2*bdiag[row];
506 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
507 ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
Value stored to 'ierr' is never read
508
509 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
510 pv = b->a + bs2*(bdiag[row+1]+1);
511 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
512 for (j=0; j<nz; j++) {
513 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
514 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
515 v = rtmp + bs2*pj[j];
516 ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
517 pv += bs2;
518 }
519 ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
520 }
521 }
522
523 /* finished row so stick it into b->a */
524 /* L part */
525 pv = b->a + bs2*bi[i];
526 pj = b->j + bi[i];
527 nz = bi[i+1] - bi[i];
528 for (j=0; j<nz; j++) {
529 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
530 }
531
532 /* Mark diagonal and invert diagonal for simpler triangular solves */
533 pv = b->a + bs2*bdiag[i];
534 pj = b->j + bdiag[i];
535 ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr);
536 ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
537 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
538
539 /* U part */
540 pv = b->a + bs2*(bdiag[i+1]+1);
541 pj = b->j + bdiag[i+1]+1;
542 nz = bdiag[i] - bdiag[i+1] - 1;
543 for (j=0; j<nz; j++) {
544 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
545 }
546 }
547 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
548
549 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
550 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
551 C->assembled = PETSC_TRUE;
552
553 ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
554 PetscFunctionReturn(0);
555}
556
557#if defined(PETSC_HAVE_SSE)
558
559#include PETSC_HAVE_SSE
560
561/* SSE Version for when blocks are 4 by 4 Using natural ordering */
562PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
563{
564 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
565 PetscErrorCode ierr;
566 int i,j,n = a->mbs;
567 int *bj = b->j,*bjtmp,*pj;
568 int row;
569 int *ajtmpold,nz,*bi=b->i;
570 int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
571 MatScalar *pv,*v,*rtmp,*pc,*w,*x;
572 MatScalar *ba = b->a,*aa = a->a;
573 int nonzero=0;
574 PetscBool pivotinblocks = b->pivotinblocks;
575 PetscReal shift = info->shiftamount;
576 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
577
578 PetscFunctionBegin;
579 allowzeropivot = PetscNot(A->erroriffailure);
580 SSE_SCOPE_BEGIN;
581
582 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
583 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
584 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
585 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
586/* if ((unsigned long)bj==(unsigned long)aj) { */
587/* colscale = 4; */
588/* } */
589 for (i=0; i<n; i++) {
590 nz = bi[i+1] - bi[i];
591 bjtmp = bj + bi[i];
592 /* zero out the 4x4 block accumulators */
593 /* zero out one register */
594 XOR_PS(XMM7,XMM7);
595 for (j=0; j<nz; j++) {
596 x = rtmp+16*bjtmp[j];
597/* x = rtmp+4*bjtmp[j]; */
598 SSE_INLINE_BEGIN_1(x)
599 /* Copy zero register to memory locations */
600 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
601 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
602 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
603 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
604 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
605 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
606 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
607 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
608 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
609 SSE_INLINE_END_1;
610 }
611 /* load in initial (unfactored row) */
612 nz = ai[i+1] - ai[i];
613 ajtmpold = aj + ai[i];
614 v = aa + 16*ai[i];
615 for (j=0; j<nz; j++) {
616 x = rtmp+16*ajtmpold[j];
617/* x = rtmp+colscale*ajtmpold[j]; */
618 /* Copy v block into x block */
619 SSE_INLINE_BEGIN_2(v,x)
620 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
621 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
622 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
623
624 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
625 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
626
627 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
628 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
629
630 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
631 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
632
633 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
634 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
635
636 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
637 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
638
639 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
640 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
641
642 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
643 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
644 SSE_INLINE_END_2;
645
646 v += 16;
647 }
648/* row = (*bjtmp++)/4; */
649 row = *bjtmp++;
650 while (row < i) {
651 pc = rtmp + 16*row;
652 SSE_INLINE_BEGIN_1(pc)
653 /* Load block from lower triangle */
654 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
655 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
656 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
657
658 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
659 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
660
661 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
662 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
663
664 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
665 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
666
667 /* Compare block to zero block */
668
669 SSE_COPY_PS(XMM4,XMM7)
670 SSE_CMPNEQ_PS(XMM4,XMM0)
671
672 SSE_COPY_PS(XMM5,XMM7)
673 SSE_CMPNEQ_PS(XMM5,XMM1)
674
675 SSE_COPY_PS(XMM6,XMM7)
676 SSE_CMPNEQ_PS(XMM6,XMM2)
677
678 SSE_CMPNEQ_PS(XMM7,XMM3)
679
680 /* Reduce the comparisons to one SSE register */
681 SSE_OR_PS(XMM6,XMM7)
682 SSE_OR_PS(XMM5,XMM4)
683 SSE_OR_PS(XMM5,XMM6)
684 SSE_INLINE_END_1;
685
686 /* Reduce the one SSE register to an integer register for branching */
687 /* Note: Since nonzero is an int, there is no INLINE block version of this call */
688 MOVEMASK(nonzero,XMM5);
689
690 /* If block is nonzero ... */
691 if (nonzero) {
692 pv = ba + 16*diag_offset[row];
693 PREFETCH_L1(&pv[16]);
694 pj = bj + diag_offset[row] + 1;
695
696 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
697 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
698 /* but the diagonal was inverted already */
699 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
700
701 SSE_INLINE_BEGIN_2(pv,pc)
702 /* Column 0, product is accumulated in XMM4 */
703 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
704 SSE_SHUFFLE(XMM4,XMM4,0x00)
705 SSE_MULT_PS(XMM4,XMM0)
706
707 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
708 SSE_SHUFFLE(XMM5,XMM5,0x00)
709 SSE_MULT_PS(XMM5,XMM1)
710 SSE_ADD_PS(XMM4,XMM5)
711
712 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
713 SSE_SHUFFLE(XMM6,XMM6,0x00)
714 SSE_MULT_PS(XMM6,XMM2)
715 SSE_ADD_PS(XMM4,XMM6)
716
717 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
718 SSE_SHUFFLE(XMM7,XMM7,0x00)
719 SSE_MULT_PS(XMM7,XMM3)
720 SSE_ADD_PS(XMM4,XMM7)
721
722 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
723 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
724
725 /* Column 1, product is accumulated in XMM5 */
726 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
727 SSE_SHUFFLE(XMM5,XMM5,0x00)
728 SSE_MULT_PS(XMM5,XMM0)
729
730 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
731 SSE_SHUFFLE(XMM6,XMM6,0x00)
732 SSE_MULT_PS(XMM6,XMM1)
733 SSE_ADD_PS(XMM5,XMM6)
734
735 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
736 SSE_SHUFFLE(XMM7,XMM7,0x00)
737 SSE_MULT_PS(XMM7,XMM2)
738 SSE_ADD_PS(XMM5,XMM7)
739
740 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
741 SSE_SHUFFLE(XMM6,XMM6,0x00)
742 SSE_MULT_PS(XMM6,XMM3)
743 SSE_ADD_PS(XMM5,XMM6)
744
745 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
746 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
747
748 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
749
750 /* Column 2, product is accumulated in XMM6 */
751 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
752 SSE_SHUFFLE(XMM6,XMM6,0x00)
753 SSE_MULT_PS(XMM6,XMM0)
754
755 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
756 SSE_SHUFFLE(XMM7,XMM7,0x00)
757 SSE_MULT_PS(XMM7,XMM1)
758 SSE_ADD_PS(XMM6,XMM7)
759
760 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
761 SSE_SHUFFLE(XMM7,XMM7,0x00)
762 SSE_MULT_PS(XMM7,XMM2)
763 SSE_ADD_PS(XMM6,XMM7)
764
765 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
766 SSE_SHUFFLE(XMM7,XMM7,0x00)
767 SSE_MULT_PS(XMM7,XMM3)
768 SSE_ADD_PS(XMM6,XMM7)
769
770 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
771 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
772
773 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
774 /* Column 3, product is accumulated in XMM0 */
775 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
776 SSE_SHUFFLE(XMM7,XMM7,0x00)
777 SSE_MULT_PS(XMM0,XMM7)
778
779 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
780 SSE_SHUFFLE(XMM7,XMM7,0x00)
781 SSE_MULT_PS(XMM1,XMM7)
782 SSE_ADD_PS(XMM0,XMM1)
783
784 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
785 SSE_SHUFFLE(XMM1,XMM1,0x00)
786 SSE_MULT_PS(XMM1,XMM2)
787 SSE_ADD_PS(XMM0,XMM1)
788
789 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
790 SSE_SHUFFLE(XMM7,XMM7,0x00)
791 SSE_MULT_PS(XMM3,XMM7)
792 SSE_ADD_PS(XMM0,XMM3)
793
794 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
795 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
796
797 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
798 /* This is code to be maintained and read by humans after all. */
799 /* Copy Multiplier Col 3 into XMM3 */
800 SSE_COPY_PS(XMM3,XMM0)
801 /* Copy Multiplier Col 2 into XMM2 */
802 SSE_COPY_PS(XMM2,XMM6)
803 /* Copy Multiplier Col 1 into XMM1 */
804 SSE_COPY_PS(XMM1,XMM5)
805 /* Copy Multiplier Col 0 into XMM0 */
806 SSE_COPY_PS(XMM0,XMM4)
807 SSE_INLINE_END_2;
808
809 /* Update the row: */
810 nz = bi[row+1] - diag_offset[row] - 1;
811 pv += 16;
812 for (j=0; j<nz; j++) {
813 PREFETCH_L1(&pv[16]);
814 x = rtmp + 16*pj[j];
815/* x = rtmp + 4*pj[j]; */
816
817 /* X:=X-M*PV, One column at a time */
818 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
819 SSE_INLINE_BEGIN_2(x,pv)
820 /* Load First Column of X*/
821 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
822 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
823
824 /* Matrix-Vector Product: */
825 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
826 SSE_SHUFFLE(XMM5,XMM5,0x00)
827 SSE_MULT_PS(XMM5,XMM0)
828 SSE_SUB_PS(XMM4,XMM5)
829
830 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
831 SSE_SHUFFLE(XMM6,XMM6,0x00)
832 SSE_MULT_PS(XMM6,XMM1)
833 SSE_SUB_PS(XMM4,XMM6)
834
835 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
836 SSE_SHUFFLE(XMM7,XMM7,0x00)
837 SSE_MULT_PS(XMM7,XMM2)
838 SSE_SUB_PS(XMM4,XMM7)
839
840 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
841 SSE_SHUFFLE(XMM5,XMM5,0x00)
842 SSE_MULT_PS(XMM5,XMM3)
843 SSE_SUB_PS(XMM4,XMM5)
844
845 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
846 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
847
848 /* Second Column */
849 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
850 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
851
852 /* Matrix-Vector Product: */
853 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
854 SSE_SHUFFLE(XMM6,XMM6,0x00)
855 SSE_MULT_PS(XMM6,XMM0)
856 SSE_SUB_PS(XMM5,XMM6)
857
858 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
859 SSE_SHUFFLE(XMM7,XMM7,0x00)
860 SSE_MULT_PS(XMM7,XMM1)
861 SSE_SUB_PS(XMM5,XMM7)
862
863 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
864 SSE_SHUFFLE(XMM4,XMM4,0x00)
865 SSE_MULT_PS(XMM4,XMM2)
866 SSE_SUB_PS(XMM5,XMM4)
867
868 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
869 SSE_SHUFFLE(XMM6,XMM6,0x00)
870 SSE_MULT_PS(XMM6,XMM3)
871 SSE_SUB_PS(XMM5,XMM6)
872
873 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
874 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
875
876 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
877
878 /* Third Column */
879 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
880 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
881
882 /* Matrix-Vector Product: */
883 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
884 SSE_SHUFFLE(XMM7,XMM7,0x00)
885 SSE_MULT_PS(XMM7,XMM0)
886 SSE_SUB_PS(XMM6,XMM7)
887
888 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
889 SSE_SHUFFLE(XMM4,XMM4,0x00)
890 SSE_MULT_PS(XMM4,XMM1)
891 SSE_SUB_PS(XMM6,XMM4)
892
893 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
894 SSE_SHUFFLE(XMM5,XMM5,0x00)
895 SSE_MULT_PS(XMM5,XMM2)
896 SSE_SUB_PS(XMM6,XMM5)
897
898 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
899 SSE_SHUFFLE(XMM7,XMM7,0x00)
900 SSE_MULT_PS(XMM7,XMM3)
901 SSE_SUB_PS(XMM6,XMM7)
902
903 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
904 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
905
906 /* Fourth Column */
907 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
908 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
909
910 /* Matrix-Vector Product: */
911 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
912 SSE_SHUFFLE(XMM5,XMM5,0x00)
913 SSE_MULT_PS(XMM5,XMM0)
914 SSE_SUB_PS(XMM4,XMM5)
915
916 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
917 SSE_SHUFFLE(XMM6,XMM6,0x00)
918 SSE_MULT_PS(XMM6,XMM1)
919 SSE_SUB_PS(XMM4,XMM6)
920
921 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
922 SSE_SHUFFLE(XMM7,XMM7,0x00)
923 SSE_MULT_PS(XMM7,XMM2)
924 SSE_SUB_PS(XMM4,XMM7)
925
926 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
927 SSE_SHUFFLE(XMM5,XMM5,0x00)
928 SSE_MULT_PS(XMM5,XMM3)
929 SSE_SUB_PS(XMM4,XMM5)
930
931 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
932 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
933 SSE_INLINE_END_2;
934 pv += 16;
935 }
936      ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
937 }
938 row = *bjtmp++;
939/* row = (*bjtmp++)/4; */
940 }
941 /* finished row so stick it into b->a */
942 pv = ba + 16*bi[i];
943 pj = bj + bi[i];
944 nz = bi[i+1] - bi[i];
945
946 /* Copy x block back into pv block */
947 for (j=0; j<nz; j++) {
948 x = rtmp+16*pj[j];
949/* x = rtmp+4*pj[j]; */
950
951 SSE_INLINE_BEGIN_2(x,pv)
952 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
953 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
954 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
955
956 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
957 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
958
959 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
960 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
961
962 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
963 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
964
965 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
966 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
967
968 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
969 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
970
971 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
972 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
973
974 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
975 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
976 SSE_INLINE_END_2;
977 pv += 16;
978 }
979 /* invert diagonal block */
980 w = ba + 16*diag_offset[i];
981 if (pivotinblocks) {
982      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
983 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
984 } else {
985      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
986 }
987/* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
988    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
989 }
990
991  ierr = PetscFree(rtmp);CHKERRQ(ierr);
992
993 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
994 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
995 C->assembled = PETSC_TRUE;
996
997  ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
998 /* Flop Count from inverting diagonal blocks */
999 SSE_SCOPE_END;
1000  PetscFunctionReturn(0);
1001}
1002
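For reference, the long SSE sequences in the factorization loop above implement two scalar 4x4 block operations: forming the multiplier against the pre-inverted diagonal block, and the X:=X-M*PV rank update. A minimal scalar sketch of both follows (blocks stored column-major as 16 contiguous scalars; the MatScalar typedef is a stand-in so the sketch is self-contained; this is a reading aid, not part of the original build):

typedef double MatScalar; /* stand-in for PETSc's MatScalar in this sketch */

/* pc := pc*pv, where pv holds the already-inverted diagonal block inv(D);
   this is what the "Form Multiplier" SSE sequence computes. */
static void BlockMultiplier_4(MatScalar *pc,const MatScalar *pv)
{
  MatScalar m[16];
  int       r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      m[r+4*c] = 0.0;
      for (k=0; k<4; k++) m[r+4*c] += pc[r+4*k]*pv[k+4*c];
    }
  }
  for (k=0; k<16; k++) pc[k] = m[k];
}

/* x := x - m*pv, the "X:=X-M*PV" update applied to each block to the right
   of the diagonal; pv advances 16 scalars per block, matching the pv += 16
   strides above. */
static void BlockUpdate_4(MatScalar *x,const MatScalar *m,const MatScalar *pv)
{
  int r,c,k;
  for (c=0; c<4; c++)
    for (r=0; r<4; r++)
      for (k=0; k<4; k++) x[r+4*c] -= m[r+4*k]*pv[k+4*c];
}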
1003PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1004{
1005  Mat            A = C;
1006 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1007 PetscErrorCode ierr;
1008 int i,j,n = a->mbs;
1009 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1010 unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1011 unsigned int row;
1012  int            nz,*bi=b->i,bs=4,bs2=16; /* block size and its square for 4x4 blocks */
1013 int *diag_offset = b->diag,*ai=a->i;
1014 MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1015 MatScalar *ba = b->a,*aa = a->a;
1016 int nonzero=0;
1017 PetscBool pivotinblocks = b->pivotinblocks;
1018 PetscReal shift = info->shiftamount;
1019 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
1020
1021  PetscFunctionBegin;
1022  allowzeropivot = PetscNot(A->erroriffailure);
1023 SSE_SCOPE_BEGIN;
1024
1025  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1026  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1027  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1028  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1029/* if ((unsigned long)bj==(unsigned long)aj) { */
1030/* colscale = 4; */
1031/* } */
1032
1033 for (i=0; i<n; i++) {
1034 nz = bi[i+1] - bi[i];
1035 bjtmp = bj + bi[i];
1036 /* zero out the 4x4 block accumulators */
1037 /* zero out one register */
1038 XOR_PS(XMM7,XMM7);
1039 for (j=0; j<nz; j++) {
1040 x = rtmp+16*((unsigned int)bjtmp[j]);
1041/* x = rtmp+4*bjtmp[j]; */
1042 SSE_INLINE_BEGIN_1(x)
1043 /* Copy zero register to memory locations */
1044 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1045 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1046 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1047 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1048 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1049 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1050 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1051 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1052 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1053 SSE_INLINE_END_1;
1054 }
1055 /* load in initial (unfactored row) */
1056 nz = ai[i+1] - ai[i];
1057 ajtmp = aj + ai[i];
1058 v = aa + 16*ai[i];
1059 for (j=0; j<nz; j++) {
1060 x = rtmp+16*((unsigned int)ajtmp[j]);
1061/* x = rtmp+colscale*ajtmp[j]; */
1062 /* Copy v block into x block */
1063 SSE_INLINE_BEGIN_2(v,x)
1064 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1065 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1066 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1067
1068 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1069 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1070
1071 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1072 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1073
1074 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1075 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1076
1077 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1078 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1079
1080 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1081 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1082
1083 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1084 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1085
1086 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1087 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1088 SSE_INLINE_END_2;
1089
1090 v += 16;
1091 }
1092/* row = (*bjtmp++)/4; */
1093 row = (unsigned int)(*bjtmp++);
1094 while (row < i) {
1095 pc = rtmp + 16*row;
1096 SSE_INLINE_BEGIN_1(pc)
1097 /* Load block from lower triangle */
1098 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1099 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1100 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1101
1102 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1103 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1104
1105 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1106 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1107
1108 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1109 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1110
1111 /* Compare block to zero block */
1112
1113 SSE_COPY_PS(XMM4,XMM7)
1114 SSE_CMPNEQ_PS(XMM4,XMM0)
1115
1116 SSE_COPY_PS(XMM5,XMM7)
1117 SSE_CMPNEQ_PS(XMM5,XMM1)
1118
1119 SSE_COPY_PS(XMM6,XMM7)
1120 SSE_CMPNEQ_PS(XMM6,XMM2)
1121
1122 SSE_CMPNEQ_PS(XMM7,XMM3)
1123
1124 /* Reduce the comparisons to one SSE register */
1125 SSE_OR_PS(XMM6,XMM7)
1126 SSE_OR_PS(XMM5,XMM4)
1127 SSE_OR_PS(XMM5,XMM6)
1128 SSE_INLINE_END_1;
1129
1130 /* Reduce the one SSE register to an integer register for branching */
1131 /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1132 MOVEMASK(nonzero,XMM5);
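          /* MOVEMASK packs the sign bit of each comparison lane into the low
             bits of 'nonzero', so nonzero == 0 exactly when the whole 4x4
             block equals zero and the multiplier/update work below can be
             skipped. */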
1133
1134 /* If block is nonzero ... */
1135 if (nonzero) {
1136 pv = ba + 16*diag_offset[row];
1137 PREFETCH_L1(&pv[16]);
1138 pj = bj + diag_offset[row] + 1;
1139
1140 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1141 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1142 /* but the diagonal was inverted already */
1143 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1144
1145 SSE_INLINE_BEGIN_2(pv,pc)
1146 /* Column 0, product is accumulated in XMM4 */
1147 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1148 SSE_SHUFFLE(XMM4,XMM4,0x00)
1149 SSE_MULT_PS(XMM4,XMM0)
1150
1151 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1152 SSE_SHUFFLE(XMM5,XMM5,0x00)
1153 SSE_MULT_PS(XMM5,XMM1)
1154 SSE_ADD_PS(XMM4,XMM5)
1155
1156 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1157 SSE_SHUFFLE(XMM6,XMM6,0x00)
1158 SSE_MULT_PS(XMM6,XMM2)
1159 SSE_ADD_PS(XMM4,XMM6)
1160
1161 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1162 SSE_SHUFFLE(XMM7,XMM7,0x00)
1163 SSE_MULT_PS(XMM7,XMM3)
1164 SSE_ADD_PS(XMM4,XMM7)
1165
1166 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1167 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1168
1169 /* Column 1, product is accumulated in XMM5 */
1170 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1171 SSE_SHUFFLE(XMM5,XMM5,0x00)
1172 SSE_MULT_PS(XMM5,XMM0)
1173
1174 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1175 SSE_SHUFFLE(XMM6,XMM6,0x00)
1176 SSE_MULT_PS(XMM6,XMM1)
1177 SSE_ADD_PS(XMM5,XMM6)
1178
1179 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1180 SSE_SHUFFLE(XMM7,XMM7,0x00)
1181 SSE_MULT_PS(XMM7,XMM2)
1182 SSE_ADD_PS(XMM5,XMM7)
1183
1184 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1185 SSE_SHUFFLE(XMM6,XMM6,0x00)
1186 SSE_MULT_PS(XMM6,XMM3)
1187 SSE_ADD_PS(XMM5,XMM6)
1188
1189 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1190 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1191
1192 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1193
1194 /* Column 2, product is accumulated in XMM6 */
1195 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1196 SSE_SHUFFLE(XMM6,XMM6,0x00)
1197 SSE_MULT_PS(XMM6,XMM0)
1198
1199 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1200 SSE_SHUFFLE(XMM7,XMM7,0x00)
1201 SSE_MULT_PS(XMM7,XMM1)
1202 SSE_ADD_PS(XMM6,XMM7)
1203
1204 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1205 SSE_SHUFFLE(XMM7,XMM7,0x00)
1206 SSE_MULT_PS(XMM7,XMM2)
1207 SSE_ADD_PS(XMM6,XMM7)
1208
1209 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1210 SSE_SHUFFLE(XMM7,XMM7,0x00)
1211 SSE_MULT_PS(XMM7,XMM3)
1212 SSE_ADD_PS(XMM6,XMM7)
1213
1214 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1215 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1216
1217 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1218 /* Column 3, product is accumulated in XMM0 */
1219 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1220 SSE_SHUFFLE(XMM7,XMM7,0x00)
1221 SSE_MULT_PS(XMM0,XMM7)
1222
1223 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1224 SSE_SHUFFLE(XMM7,XMM7,0x00)
1225 SSE_MULT_PS(XMM1,XMM7)
1226 SSE_ADD_PS(XMM0,XMM1)
1227
1228 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1229 SSE_SHUFFLE(XMM1,XMM1,0x00)
1230 SSE_MULT_PS(XMM1,XMM2)
1231 SSE_ADD_PS(XMM0,XMM1)
1232
1233 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1234 SSE_SHUFFLE(XMM7,XMM7,0x00)
1235 SSE_MULT_PS(XMM3,XMM7)
1236 SSE_ADD_PS(XMM0,XMM3)
1237
1238 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1239 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1240
1241 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1242        /* This is code to be maintained and read by humans after all. */
1243 /* Copy Multiplier Col 3 into XMM3 */
1244 SSE_COPY_PS(XMM3,XMM0)
1245 /* Copy Multiplier Col 2 into XMM2 */
1246 SSE_COPY_PS(XMM2,XMM6)
1247 /* Copy Multiplier Col 1 into XMM1 */
1248 SSE_COPY_PS(XMM1,XMM5)
1249 /* Copy Multiplier Col 0 into XMM0 */
1250 SSE_COPY_PS(XMM0,XMM4)
1251 SSE_INLINE_END_2;
1252
1253 /* Update the row: */
1254 nz = bi[row+1] - diag_offset[row] - 1;
1255 pv += 16;
1256 for (j=0; j<nz; j++) {
1257 PREFETCH_L1(&pv[16]);
1258 x = rtmp + 16*((unsigned int)pj[j]);
1259/* x = rtmp + 4*pj[j]; */
1260
1261 /* X:=X-M*PV, One column at a time */
1262 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1263 SSE_INLINE_BEGIN_2(x,pv)
1264          /* Load First Column of X */
1265 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1266 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1267
1268 /* Matrix-Vector Product: */
1269 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1270 SSE_SHUFFLE(XMM5,XMM5,0x00)
1271 SSE_MULT_PS(XMM5,XMM0)
1272 SSE_SUB_PS(XMM4,XMM5)
1273
1274 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1275 SSE_SHUFFLE(XMM6,XMM6,0x00)
1276 SSE_MULT_PS(XMM6,XMM1)
1277 SSE_SUB_PS(XMM4,XMM6)
1278
1279 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1280 SSE_SHUFFLE(XMM7,XMM7,0x00)
1281 SSE_MULT_PS(XMM7,XMM2)
1282 SSE_SUB_PS(XMM4,XMM7)
1283
1284 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1285 SSE_SHUFFLE(XMM5,XMM5,0x00)
1286 SSE_MULT_PS(XMM5,XMM3)
1287 SSE_SUB_PS(XMM4,XMM5)
1288
1289 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1290 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1291
1292 /* Second Column */
1293 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1294 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1295
1296 /* Matrix-Vector Product: */
1297 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1298 SSE_SHUFFLE(XMM6,XMM6,0x00)
1299 SSE_MULT_PS(XMM6,XMM0)
1300 SSE_SUB_PS(XMM5,XMM6)
1301
1302 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1303 SSE_SHUFFLE(XMM7,XMM7,0x00)
1304 SSE_MULT_PS(XMM7,XMM1)
1305 SSE_SUB_PS(XMM5,XMM7)
1306
1307 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1308 SSE_SHUFFLE(XMM4,XMM4,0x00)
1309 SSE_MULT_PS(XMM4,XMM2)
1310 SSE_SUB_PS(XMM5,XMM4)
1311
1312 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1313 SSE_SHUFFLE(XMM6,XMM6,0x00)
1314 SSE_MULT_PS(XMM6,XMM3)
1315 SSE_SUB_PS(XMM5,XMM6)
1316
1317 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1318 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1319
1320 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1321
1322 /* Third Column */
1323 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1324 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1325
1326 /* Matrix-Vector Product: */
1327 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1328 SSE_SHUFFLE(XMM7,XMM7,0x00)
1329 SSE_MULT_PS(XMM7,XMM0)
1330 SSE_SUB_PS(XMM6,XMM7)
1331
1332 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1333 SSE_SHUFFLE(XMM4,XMM4,0x00)
1334 SSE_MULT_PS(XMM4,XMM1)
1335 SSE_SUB_PS(XMM6,XMM4)
1336
1337 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1338 SSE_SHUFFLE(XMM5,XMM5,0x00)
1339 SSE_MULT_PS(XMM5,XMM2)
1340 SSE_SUB_PS(XMM6,XMM5)
1341
1342 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1343 SSE_SHUFFLE(XMM7,XMM7,0x00)
1344 SSE_MULT_PS(XMM7,XMM3)
1345 SSE_SUB_PS(XMM6,XMM7)
1346
1347 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1348 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1349
1350 /* Fourth Column */
1351 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1352 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1353
1354 /* Matrix-Vector Product: */
1355 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1356 SSE_SHUFFLE(XMM5,XMM5,0x00)
1357 SSE_MULT_PS(XMM5,XMM0)
1358 SSE_SUB_PS(XMM4,XMM5)
1359
1360 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1361 SSE_SHUFFLE(XMM6,XMM6,0x00)
1362 SSE_MULT_PS(XMM6,XMM1)
1363 SSE_SUB_PS(XMM4,XMM6)
1364
1365 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1366 SSE_SHUFFLE(XMM7,XMM7,0x00)
1367 SSE_MULT_PS(XMM7,XMM2)
1368 SSE_SUB_PS(XMM4,XMM7)
1369
1370 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1371 SSE_SHUFFLE(XMM5,XMM5,0x00)
1372 SSE_MULT_PS(XMM5,XMM3)
1373 SSE_SUB_PS(XMM4,XMM5)
1374
1375 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1376 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1377 SSE_INLINE_END_2;
1378 pv += 16;
1379 }
1380      ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1381 }
1382 row = (unsigned int)(*bjtmp++);
1383/* row = (*bjtmp++)/4; */
1384/* bjtmp++; */
1385 }
1386 /* finished row so stick it into b->a */
1387 pv = ba + 16*bi[i];
1388 pj = bj + bi[i];
1389 nz = bi[i+1] - bi[i];
1390
1391 /* Copy x block back into pv block */
1392 for (j=0; j<nz; j++) {
1393 x = rtmp+16*((unsigned int)pj[j]);
1394/* x = rtmp+4*pj[j]; */
1395
1396 SSE_INLINE_BEGIN_2(x,pv)
1397 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1398 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1399 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1400
1401 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1402 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1403
1404 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1405 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1406
1407 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1408 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1409
1410 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1411 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1412
1413 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1414 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1415
1416 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1417 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1418
1419 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1420 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1421 SSE_INLINE_END_2;
1422 pv += 16;
1423 }
1424 /* invert diagonal block */
1425 w = ba + 16*diag_offset[i];
1426 if (pivotinblocks) {
1427      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1428 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1429 } else {
1430      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1431 }
1432    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
1433 }
1434
1435  ierr = PetscFree(rtmp);CHKERRQ(ierr);
1436
1437 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1438 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1439 C->assembled = PETSC_TRUE;
1440
1441  ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
1442 /* Flop Count from inverting diagonal blocks */
1443 SSE_SCOPE_END;
1444  PetscFunctionReturn(0);
1445}
1446
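A note on the usj ("unsigned short j") variants: b->j is reinterpreted as an array of 16-bit column indices, halving the index bandwidth, which is only valid when every block-column index fits in 16 bits (fewer than 65536 block rows). A minimal sketch of the packing step these kernels assume follows; the helper name is hypothetical and does not appear in this file:

/* Hypothetical helper: compress 32-bit block-column indices to 16 bits for
   the usj kernels.  Valid only while all indices stay below 65536. */
static int CompressBlockColumnIndices(int nz,const int *j32,unsigned short *j16)
{
  int k;
  for (k=0; k<nz; k++) {
    if (j32[k] < 0 || j32[k] > 65535) return -1; /* index does not fit in 16 bits */
    j16[k] = (unsigned short)j32[k];
  }
  return 0;
}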
1447PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1448{
1449 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1450 PetscErrorCode ierr;
1451 int i,j,n = a->mbs;
1452 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1453 unsigned int row;
1454  int            *ajtmpold,nz,*bi=b->i,bs=4,bs2=16; /* block size and its square for 4x4 blocks */
1455 int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1456 MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1457 MatScalar *ba = b->a,*aa = a->a;
1458 int nonzero=0;
1459 PetscBool pivotinblocks = b->pivotinblocks;
1460 PetscReal shift = info->shiftamount;
1461 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
1462
1463  PetscFunctionBegin;
1464  allowzeropivot = PetscNot(A->erroriffailure);
1465 SSE_SCOPE_BEGIN;
1466
1467  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1468  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1469  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1470  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1471/* if ((unsigned long)bj==(unsigned long)aj) { */
1472/* colscale = 4; */
1473/* } */
1474 if ((unsigned long)bj==(unsigned long)aj) {
1475    return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1476 }
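  /* When the factor shares its column indices with A (bj == aj), the
     factorization proceeds in place, so the work is delegated to the
     _Inplace variant above. */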
1477
1478 for (i=0; i<n; i++) {
1479 nz = bi[i+1] - bi[i];
1480 bjtmp = bj + bi[i];
1481 /* zero out the 4x4 block accumulators */
1482 /* zero out one register */
1483 XOR_PS(XMM7,XMM7);
1484 for (j=0; j<nz; j++) {
1485 x = rtmp+16*((unsigned int)bjtmp[j]);
1486/* x = rtmp+4*bjtmp[j]; */
1487 SSE_INLINE_BEGIN_1(x)
1488 /* Copy zero register to memory locations */
1489 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1490 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1491 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1492 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1493 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1494 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1495 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1496 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1497 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1498 SSE_INLINE_END_1;
1499 }
1500 /* load in initial (unfactored row) */
1501 nz = ai[i+1] - ai[i];
1502 ajtmpold = aj + ai[i];
1503 v = aa + 16*ai[i];
1504 for (j=0; j<nz; j++) {
1505 x = rtmp+16*ajtmpold[j];
1506/* x = rtmp+colscale*ajtmpold[j]; */
1507 /* Copy v block into x block */
1508 SSE_INLINE_BEGIN_2(v,x)
1509 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1510 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1511 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1512
1513 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1514 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1515
1516 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1517 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1518
1519 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1520 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1521
1522 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1523 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1524
1525 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1526 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1527
1528 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1529 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1530
1531 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1532 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1533 SSE_INLINE_END_2;
1534
1535 v += 16;
1536 }
1537/* row = (*bjtmp++)/4; */
1538 row = (unsigned int)(*bjtmp++);
1539 while (row < i) {
1540 pc = rtmp + 16*row;
1541 SSE_INLINE_BEGIN_1(pc)
1542 /* Load block from lower triangle */
1543 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1544 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1545 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1546
1547 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1548 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1549
1550 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1551 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1552
1553 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1554 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1555
1556 /* Compare block to zero block */
1557
1558 SSE_COPY_PS(XMM4,XMM7)
1559 SSE_CMPNEQ_PS(XMM4,XMM0)
1560
1561 SSE_COPY_PS(XMM5,XMM7)
1562 SSE_CMPNEQ_PS(XMM5,XMM1)
1563
1564 SSE_COPY_PS(XMM6,XMM7)
1565 SSE_CMPNEQ_PS(XMM6,XMM2)
1566
1567 SSE_CMPNEQ_PS(XMM7,XMM3)
1568
1569 /* Reduce the comparisons to one SSE register */
1570 SSE_OR_PS(XMM6,XMM7)
1571 SSE_OR_PS(XMM5,XMM4)
1572 SSE_OR_PS(XMM5,XMM6)
1573 SSE_INLINE_END_1;
1574
1575 /* Reduce the one SSE register to an integer register for branching */
1576 /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1577 MOVEMASK(nonzero,XMM5);
1578
1579 /* If block is nonzero ... */
1580 if (nonzero) {
1581 pv = ba + 16*diag_offset[row];
1582 PREFETCH_L1(&pv[16]);
1583 pj = bj + diag_offset[row] + 1;
1584
1585 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1586 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1587 /* but the diagonal was inverted already */
1588 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1589
1590 SSE_INLINE_BEGIN_2(pv,pc)
1591 /* Column 0, product is accumulated in XMM4 */
1592 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1593 SSE_SHUFFLE(XMM4,XMM4,0x00)
1594 SSE_MULT_PS(XMM4,XMM0)
1595
1596 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1597 SSE_SHUFFLE(XMM5,XMM5,0x00)
1598 SSE_MULT_PS(XMM5,XMM1)
1599 SSE_ADD_PS(XMM4,XMM5)
1600
1601 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1602 SSE_SHUFFLE(XMM6,XMM6,0x00)
1603 SSE_MULT_PS(XMM6,XMM2)
1604 SSE_ADD_PS(XMM4,XMM6)
1605
1606 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1607 SSE_SHUFFLE(XMM7,XMM7,0x00)
1608 SSE_MULT_PS(XMM7,XMM3)
1609 SSE_ADD_PS(XMM4,XMM7)
1610
1611 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1612 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1613
1614 /* Column 1, product is accumulated in XMM5 */
1615 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1616 SSE_SHUFFLE(XMM5,XMM5,0x00)
1617 SSE_MULT_PS(XMM5,XMM0)
1618
1619 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1620 SSE_SHUFFLE(XMM6,XMM6,0x00)
1621 SSE_MULT_PS(XMM6,XMM1)
1622 SSE_ADD_PS(XMM5,XMM6)
1623
1624 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1625 SSE_SHUFFLE(XMM7,XMM7,0x00)
1626 SSE_MULT_PS(XMM7,XMM2)
1627 SSE_ADD_PS(XMM5,XMM7)
1628
1629 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1630 SSE_SHUFFLE(XMM6,XMM6,0x00)
1631 SSE_MULT_PS(XMM6,XMM3)
1632 SSE_ADD_PS(XMM5,XMM6)
1633
1634 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1635 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1636
1637 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1638
1639 /* Column 2, product is accumulated in XMM6 */
1640 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1641 SSE_SHUFFLE(XMM6,XMM6,0x00)
1642 SSE_MULT_PS(XMM6,XMM0)
1643
1644 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1645 SSE_SHUFFLE(XMM7,XMM7,0x00)
1646 SSE_MULT_PS(XMM7,XMM1)
1647 SSE_ADD_PS(XMM6,XMM7)
1648
1649 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1650 SSE_SHUFFLE(XMM7,XMM7,0x00)
1651 SSE_MULT_PS(XMM7,XMM2)
1652 SSE_ADD_PS(XMM6,XMM7)
1653
1654 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1655 SSE_SHUFFLE(XMM7,XMM7,0x00)
1656 SSE_MULT_PS(XMM7,XMM3)
1657 SSE_ADD_PS(XMM6,XMM7)
1658
1659 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1660 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1661
1662 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1663 /* Column 3, product is accumulated in XMM0 */
1664 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1665 SSE_SHUFFLE(XMM7,XMM7,0x00)
1666 SSE_MULT_PS(XMM0,XMM7)
1667
1668 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1669 SSE_SHUFFLE(XMM7,XMM7,0x00)
1670 SSE_MULT_PS(XMM1,XMM7)
1671 SSE_ADD_PS(XMM0,XMM1)
1672
1673 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1674 SSE_SHUFFLE(XMM1,XMM1,0x00)
1675 SSE_MULT_PS(XMM1,XMM2)
1676 SSE_ADD_PS(XMM0,XMM1)
1677
1678 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1679 SSE_SHUFFLE(XMM7,XMM7,0x00)
1680 SSE_MULT_PS(XMM3,XMM7)
1681 SSE_ADD_PS(XMM0,XMM3)
1682
1683 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1684 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1685
1686 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1687        /* This is code to be maintained and read by humans after all. */
1688 /* Copy Multiplier Col 3 into XMM3 */
1689 SSE_COPY_PS(XMM3,XMM0)
1690 /* Copy Multiplier Col 2 into XMM2 */
1691 SSE_COPY_PS(XMM2,XMM6)
1692 /* Copy Multiplier Col 1 into XMM1 */
1693 SSE_COPY_PS(XMM1,XMM5)
1694 /* Copy Multiplier Col 0 into XMM0 */
1695 SSE_COPY_PS(XMM0,XMM4)
1696 SSE_INLINE_END_2;
1697
1698 /* Update the row: */
1699 nz = bi[row+1] - diag_offset[row] - 1;
1700 pv += 16;
1701 for (j=0; j<nz; j++) {
1702 PREFETCH_L1(&pv[16]);
1703 x = rtmp + 16*((unsigned int)pj[j]);
1704/* x = rtmp + 4*pj[j]; */
1705
1706 /* X:=X-M*PV, One column at a time */
1707 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1708 SSE_INLINE_BEGIN_2(x,pv)
1709          /* Load First Column of X */
1710 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1711 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1712
1713 /* Matrix-Vector Product: */
1714 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1715 SSE_SHUFFLE(XMM5,XMM5,0x00)
1716 SSE_MULT_PS(XMM5,XMM0)
1717 SSE_SUB_PS(XMM4,XMM5)
1718
1719 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1720 SSE_SHUFFLE(XMM6,XMM6,0x00)
1721 SSE_MULT_PS(XMM6,XMM1)
1722 SSE_SUB_PS(XMM4,XMM6)
1723
1724 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1725 SSE_SHUFFLE(XMM7,XMM7,0x00)
1726 SSE_MULT_PS(XMM7,XMM2)
1727 SSE_SUB_PS(XMM4,XMM7)
1728
1729 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1730 SSE_SHUFFLE(XMM5,XMM5,0x00)
1731 SSE_MULT_PS(XMM5,XMM3)
1732 SSE_SUB_PS(XMM4,XMM5)
1733
1734 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1735 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1736
1737 /* Second Column */
1738 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1739 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1740
1741 /* Matrix-Vector Product: */
1742 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1743 SSE_SHUFFLE(XMM6,XMM6,0x00)
1744 SSE_MULT_PS(XMM6,XMM0)
1745 SSE_SUB_PS(XMM5,XMM6)
1746
1747 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1748 SSE_SHUFFLE(XMM7,XMM7,0x00)
1749 SSE_MULT_PS(XMM7,XMM1)
1750 SSE_SUB_PS(XMM5,XMM7)
1751
1752 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1753 SSE_SHUFFLE(XMM4,XMM4,0x00)
1754 SSE_MULT_PS(XMM4,XMM2)
1755 SSE_SUB_PS(XMM5,XMM4)
1756
1757 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1758 SSE_SHUFFLE(XMM6,XMM6,0x00)
1759 SSE_MULT_PS(XMM6,XMM3)
1760 SSE_SUB_PS(XMM5,XMM6)
1761
1762 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1763 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1764
1765 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1766
1767 /* Third Column */
1768 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1769 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1770
1771 /* Matrix-Vector Product: */
1772 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1773 SSE_SHUFFLE(XMM7,XMM7,0x00)
1774 SSE_MULT_PS(XMM7,XMM0)
1775 SSE_SUB_PS(XMM6,XMM7)
1776
1777 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1778 SSE_SHUFFLE(XMM4,XMM4,0x00)
1779 SSE_MULT_PS(XMM4,XMM1)
1780 SSE_SUB_PS(XMM6,XMM4)
1781
1782 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1783 SSE_SHUFFLE(XMM5,XMM5,0x00)
1784 SSE_MULT_PS(XMM5,XMM2)
1785 SSE_SUB_PS(XMM6,XMM5)
1786
1787 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1788 SSE_SHUFFLE(XMM7,XMM7,0x00)
1789 SSE_MULT_PS(XMM7,XMM3)
1790 SSE_SUB_PS(XMM6,XMM7)
1791
1792 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1793 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1794
1795 /* Fourth Column */
1796 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1797 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1798
1799 /* Matrix-Vector Product: */
1800 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1801 SSE_SHUFFLE(XMM5,XMM5,0x00)
1802 SSE_MULT_PS(XMM5,XMM0)
1803 SSE_SUB_PS(XMM4,XMM5)
1804
1805 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1806 SSE_SHUFFLE(XMM6,XMM6,0x00)
1807 SSE_MULT_PS(XMM6,XMM1)
1808 SSE_SUB_PS(XMM4,XMM6)
1809
1810 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1811 SSE_SHUFFLE(XMM7,XMM7,0x00)
1812 SSE_MULT_PS(XMM7,XMM2)
1813 SSE_SUB_PS(XMM4,XMM7)
1814
1815 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1816 SSE_SHUFFLE(XMM5,XMM5,0x00)
1817 SSE_MULT_PS(XMM5,XMM3)
1818 SSE_SUB_PS(XMM4,XMM5)
1819
1820 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1821 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1822 SSE_INLINE_END_2;
1823 pv += 16;
1824 }
1825      ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1826 }
1827 row = (unsigned int)(*bjtmp++);
1828/* row = (*bjtmp++)/4; */
1829/* bjtmp++; */
1830 }
1831 /* finished row so stick it into b->a */
1832 pv = ba + 16*bi[i];
1833 pj = bj + bi[i];
1834 nz = bi[i+1] - bi[i];
1835
1836 /* Copy x block back into pv block */
1837 for (j=0; j<nz; j++) {
1838 x = rtmp+16*((unsigned int)pj[j]);
1839/* x = rtmp+4*pj[j]; */
1840
1841 SSE_INLINE_BEGIN_2(x,pv)
1842 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1843 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1844 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1845
1846 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1847 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1848
1849 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1850 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1851
1852 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1853 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1854
1855 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1856 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1857
1858 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1859 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1860
1861 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1862 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1863
1864 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1865 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1866 SSE_INLINE_END_2;
1867 pv += 16;
1868 }
1869 /* invert diagonal block */
1870 w = ba + 16*diag_offset[i];
1871 if (pivotinblocks) {
1872      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1873 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1874 } else {
1875      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1876 }
1877/* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1878    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
1879 }
1880
1881  ierr = PetscFree(rtmp);CHKERRQ(ierr);
1882
1883 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1884 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1885 C->assembled = PETSC_TRUE;
1886
1887  ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr);
1888 /* Flop Count from inverting diagonal blocks */
1889 SSE_SCOPE_END;
1890  PetscFunctionReturn(0);
1891}
1892
1893#endif
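
For readability, the unrolled PetscKernel_A_gets_inverse_A_4_nopivot() kernel invoked above amounts to the following in-place Gauss-Jordan inversion of a column-major 4x4 block with no pivot search. This loop-form restatement is offered as a reference sketch (the MatScalar typedef is a stand-in so it compiles on its own), not as the kernel's actual definition:

typedef double MatScalar; /* stand-in for PETSc's MatScalar in this sketch */

/* In-place Gauss-Jordan inversion of a column-major 4x4 block, no pivoting;
   each pass scales the pivot row by -1/pivot, the pivot column by 1/pivot,
   then applies a rank-1 update to the remaining 3x3 entries. */
static void Inverse_4_NoPivot(MatScalar w[16])
{
  int p,r,c;
  for (p=0; p<4; p++) {
    MatScalar di = w[p+4*p];   /* current pivot, assumed nonzero */
    MatScalar d  = 1.0/di;
    w[p+4*p] = d;
    for (c=0; c<4; c++) if (c != p) w[p+4*c] *= -d;
    for (r=0; r<4; r++) if (r != p) w[r+4*p] *= d;
    for (r=0; r<4; r++)
      for (c=0; c<4; c++)
        if (r != p && c != p) w[r+4*c] += w[p+4*c]*w[r+4*p]*di;
  }
}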