File: mat/impls/baij/seq/baijfact11.c
Warning: line 507, column 9: Value stored to 'ierr' is never read
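Note on the warning: PetscKernel_A_gets_A_times_B_4() is a macro whose expansion
begins with "0; { ierr = ...; ... }", so the statement at line 507 first stores
the constant 0 into 'ierr' and then overwrites it inside the block before any
read -- a dead store. A minimal sketch of the flagged pattern (foo() is a
hypothetical stand-in for the memcpy done inside the macro, not part of this file):

    ierr = 0;                         /* dead store: this value is never read ... */
    { ierr = foo(); CHKERRQ(ierr); }  /* ... because it is overwritten here first */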
1 | |
2 | /* |
3 | Factorization code for BAIJ format. |
4 | */ |
5 | #include <../src/mat/impls/baij/seq/baij.h> |
6 | #include <petsc/private/kernels/blockinvert.h> |
7 | |
8 | /* ------------------------------------------------------------*/ |
9 | /* |
10 | Version for when blocks are 4 by 4 |
11 | */ |
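/* Note: the 4x4 blocks are stored column-major -- entry (r,c) of a block lives at |
   index 4*c+r, so pc[0..3] is column 0, pc[4..7] column 1, and so on (this layout |
   can be read off the kernels below, e.g. pc[0] = p1*x1 + p5*x2 + p9*x3 + p13*x4 |
   is entry (0,0) of the block product). */ |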
12 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) |
13 | { |
14 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; |
15 | IS isrow = b->row,isicol = b->icol; |
16 | PetscErrorCode ierr; |
17 | const PetscInt *r,*ic; |
18 | PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; |
19 | PetscInt *ajtmpold,*ajtmp,nz,row; |
20 | PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; |
21 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
22 | MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; |
23 | MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; |
24 | MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; |
25 | MatScalar m13,m14,m15,m16; |
26 | MatScalar *ba = b->a,*aa = a->a; |
27 | PetscBool pivotinblocks = b->pivotinblocks; |
28 | PetscReal shift = info->shiftamount; |
29 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
30 | |
31 | PetscFunctionBegin; |
32 | ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); |
33 | ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); |
34 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); |
35 | allowzeropivot = PetscNot(A->erroriffailure); |
36 | |
37 | for (i=0; i<n; i++) { |
38 | nz = bi[i+1] - bi[i]; |
39 | ajtmp = bj + bi[i]; |
40 | for (j=0; j<nz; j++) { |
41 | x = rtmp+16*ajtmp[j]; |
42 | x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; |
43 | x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; |
44 | } |
45 | /* load in initial (unfactored) row */ |
46 | idx = r[i]; |
47 | nz = ai[idx+1] - ai[idx]; |
48 | ajtmpold = aj + ai[idx]; |
49 | v = aa + 16*ai[idx]; |
50 | for (j=0; j<nz; j++) { |
51 | x = rtmp+16*ic[ajtmpold[j]]; |
52 | x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; |
53 | x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; |
54 | x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; |
55 | x[14] = v[14]; x[15] = v[15]; |
56 | v += 16; |
57 | } |
58 | row = *ajtmp++; |
59 | while (row < i) { |
60 | pc = rtmp + 16*row; |
61 | p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; |
62 | p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; |
63 | p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; |
64 | p15 = pc[14]; p16 = pc[15]; |
65 | if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || |
66 | p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || |
67 | p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 |
68 | || p16 != 0.0) { |
69 | pv = ba + 16*diag_offset[row]; |
70 | pj = bj + diag_offset[row] + 1; |
71 | x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; |
72 | x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; |
73 | x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; |
74 | x15 = pv[14]; x16 = pv[15]; |
75 | pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; |
76 | pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; |
77 | pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; |
78 | pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; |
79 | |
80 | pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; |
81 | pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; |
82 | pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; |
83 | pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; |
84 | |
85 | pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; |
86 | pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; |
87 | pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; |
88 | pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; |
89 | |
90 | pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; |
91 | pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; |
92 | pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; |
93 | pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; |
94 | |
95 | nz = bi[row+1] - diag_offset[row] - 1; |
96 | pv += 16; |
97 | for (j=0; j<nz; j++) { |
98 | x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; |
99 | x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; |
100 | x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; |
101 | x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; |
102 | x = rtmp + 16*pj[j]; |
103 | x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; |
104 | x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; |
105 | x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; |
106 | x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; |
107 | |
108 | x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; |
109 | x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; |
110 | x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; |
111 | x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; |
112 | |
113 | x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; |
114 | x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; |
115 | x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; |
116 | x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; |
117 | |
118 | x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; |
119 | x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; |
120 | x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; |
121 | x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; |
122 | |
123 | pv += 16; |
124 | } |
125 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); |
126 | } |
127 | row = *ajtmp++; |
128 | } |
129 | /* finished row so stick it into b->a */ |
130 | pv = ba + 16*bi[i]; |
131 | pj = bj + bi[i]; |
132 | nz = bi[i+1] - bi[i]; |
133 | for (j=0; j<nz; j++) { |
134 | x = rtmp+16*pj[j]; |
135 | pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; |
136 | pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; |
137 | pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; |
138 | pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; |
139 | pv += 16; |
140 | } |
141 | /* invert diagonal block */ |
142 | w = ba + 16*diag_offset[i]; |
143 | if (pivotinblocks) { |
144 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); |
145 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
146 | } else { |
147 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); |
148 | } |
149 | } |
150 | |
151 | ierr = PetscFree(rtmp);CHKERRQ(ierr); |
152 | ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); |
153 | ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); |
154 | |
155 | C->ops->solve = MatSolve_SeqBAIJ_4_inplace; |
156 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; |
157 | C->assembled = PETSC_TRUE; |
158 | |
159 | ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ |
160 | PetscFunctionReturn(0); |
161 | } |
162 | |
163 | /* MatLUFactorNumeric_SeqBAIJ_4 - |
164 | copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented |
165 | PetscKernel_A_gets_A_times_B() |
166 | PetscKernel_A_gets_A_minus_B_times_C() |
167 | PetscKernel_A_gets_inverse_A() |
168 | */ |
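/* In block terms (per the inline comments in the elimination loop below), the |
   three kernels act on column-major 4x4 blocks as: |
     A_gets_A_times_B:         A <- A*B        (multiplier formation) |
     A_gets_A_minus_B_times_C: A <- A - B*C    (trailing-row update) |
     A_gets_inverse_A:         A <- inv(A)     (diagonal block, with shift) */ |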
169 | |
170 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) |
171 | { |
172 | Mat C = B; |
173 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; |
174 | IS isrow = b->row,isicol = b->icol; |
175 | PetscErrorCode ierr; |
176 | const PetscInt *r,*ic; |
177 | PetscInt i,j,k,nz,nzL,row; |
178 | const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; |
179 | const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; |
180 | MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; |
181 | PetscInt flg; |
182 | PetscReal shift; |
183 | PetscBool allowzeropivot,zeropivotdetected; |
184 | |
185 | PetscFunctionBegin; |
186 | allowzeropivot = PetscNot(A->erroriffailure); |
187 | ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); |
188 | ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); |
189 | |
190 | if (info->shifttype == MAT_SHIFT_NONE) { |
191 | shift = 0; |
192 | } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ |
193 | shift = info->shiftamount; |
194 | } |
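/* Sketch of the shift semantics assumed here: with MAT_SHIFT_INBLOCKS the |
   'shift' amount is handed to PetscKernel_A_gets_inverse_A_4() below, which |
   applies it when a pivot inside a diagonal block would otherwise be zero. */ |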
195 | |
196 | /* generate work space needed by the factorization */ |
197 | ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); |
198 | ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); |
199 | |
200 | for (i=0; i<n; i++) { |
201 | /* zero rtmp */ |
202 | /* L part */ |
203 | nz = bi[i+1] - bi[i]; |
204 | bjtmp = bj + bi[i]; |
205 | for (j=0; j<nz; j++) { |
206 | ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); |
207 | } |
208 | |
209 | /* U part */ |
210 | nz = bdiag[i]-bdiag[i+1]; |
211 | bjtmp = bj + bdiag[i+1]+1; |
212 | for (j=0; j<nz; j++) { |
213 | ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); |
214 | } |
215 | |
216 | /* load in initial (unfactored) row */ |
217 | nz = ai[r[i]+1] - ai[r[i]]; |
218 | ajtmp = aj + ai[r[i]]; |
219 | v = aa + bs2*ai[r[i]]; |
220 | for (j=0; j<nz; j++) { |
221 | ierr = PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);CHKERRQ(ierr); |
222 | } |
223 | |
224 | /* elimination */ |
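/* The loop below is a standard blocked LU sweep: for each nonzero block in |
   row i of L, scale it by the (already inverted) diagonal block of the pivot |
   row to form the multiplier, then subtract multiplier*U(row,:) from the |
   work row rtmp. */ |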
225 | bjtmp = bj + bi[i]; |
226 | nzL = bi[i+1] - bi[i]; |
227 | for (k=0; k < nzL; k++) { |
228 | row = bjtmp[k]; |
229 | pc = rtmp + bs2*row; |
230 | for (flg=0,j=0; j<bs2; j++) { |
231 | if (pc[j]!=0.0) { |
232 | flg = 1; |
233 | break; |
234 | } |
235 | } |
236 | if (flg) { |
237 | pv = b->a + bs2*bdiag[row]; |
238 | /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ |
239 | ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); |
240 | |
241 | pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ |
242 | pv = b->a + bs2*(bdiag[row+1]+1); |
243 | nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */ |
244 | for (j=0; j<nz; j++) { |
245 | /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ |
246 | /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ |
247 | v = rtmp + bs2*pj[j]; |
248 | ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); |
249 | pv += bs2; |
250 | } |
251 | ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */ |
252 | } |
253 | } |
254 | |
255 | /* finished row so stick it into b->a */ |
256 | /* L part */ |
257 | pv = b->a + bs2*bi[i]; |
258 | pj = b->j + bi[i]; |
259 | nz = bi[i+1] - bi[i]; |
260 | for (j=0; j<nz; j++) { |
261 | ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); |
262 | } |
263 | |
264 | /* Mark diagonal and invert diagonal for simpler triangular solves */ |
265 | pv = b->a + bs2*bdiag[i]; |
266 | pj = b->j + bdiag[i]; |
267 | ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); |
268 | ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); |
269 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
270 | |
271 | /* U part */ |
272 | pv = b->a + bs2*(bdiag[i+1]+1); |
273 | pj = b->j + bdiag[i+1]+1; |
274 | nz = bdiag[i] - bdiag[i+1] - 1; |
275 | for (j=0; j<nz; j++) { |
276 | ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); |
277 | } |
278 | } |
279 | |
280 | ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); |
281 | ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); |
282 | ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); |
283 | |
284 | C->ops->solve = MatSolve_SeqBAIJ_4; |
285 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; |
286 | C->assembled = PETSC_TRUE; |
287 | |
288 | ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ |
289 | PetscFunctionReturn(0); |
290 | } |
291 | |
292 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info) |
293 | { |
294 | /* |
295 | Default version for when blocks are 4 by 4, using natural ordering |
296 | */ |
297 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; |
298 | PetscErrorCode ierr; |
299 | PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; |
300 | PetscInt *ajtmpold,*ajtmp,nz,row; |
301 | PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; |
302 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
303 | MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; |
304 | MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; |
305 | MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; |
306 | MatScalar m13,m14,m15,m16; |
307 | MatScalar *ba = b->a,*aa = a->a; |
308 | PetscBool pivotinblocks = b->pivotinblocks; |
309 | PetscReal shift = info->shiftamount; |
310 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
311 | |
312 | PetscFunctionBegin; |
313 | allowzeropivot = PetscNot(A->erroriffailure); |
314 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); |
315 | |
316 | for (i=0; i<n; i++) { |
317 | nz = bi[i+1] - bi[i]; |
318 | ajtmp = bj + bi[i]; |
319 | for (j=0; j<nz; j++) { |
320 | x = rtmp+16*ajtmp[j]; |
321 | x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; |
322 | x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; |
323 | } |
324 | /* load in initial (unfactored) row */ |
325 | nz = ai[i+1] - ai[i]; |
326 | ajtmpold = aj + ai[i]; |
327 | v = aa + 16*ai[i]; |
328 | for (j=0; j<nz; j++) { |
329 | x = rtmp+16*ajtmpold[j]; |
330 | x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; |
331 | x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; |
332 | x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; |
333 | x[14] = v[14]; x[15] = v[15]; |
334 | v += 16; |
335 | } |
336 | row = *ajtmp++; |
337 | while (row < i) { |
338 | pc = rtmp + 16*row; |
339 | p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; |
340 | p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; |
341 | p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; |
342 | p15 = pc[14]; p16 = pc[15]; |
343 | if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || |
344 | p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || |
345 | p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 |
346 | || p16 != 0.0) { |
347 | pv = ba + 16*diag_offset[row]; |
348 | pj = bj + diag_offset[row] + 1; |
349 | x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; |
350 | x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; |
351 | x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; |
352 | x15 = pv[14]; x16 = pv[15]; |
353 | pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; |
354 | pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; |
355 | pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; |
356 | pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; |
357 | |
358 | pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; |
359 | pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; |
360 | pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; |
361 | pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; |
362 | |
363 | pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; |
364 | pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; |
365 | pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; |
366 | pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; |
367 | |
368 | pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; |
369 | pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; |
370 | pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; |
371 | pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; |
372 | nz = bi[row+1] - diag_offset[row] - 1; |
373 | pv += 16; |
374 | for (j=0; j<nz; j++) { |
375 | x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; |
376 | x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; |
377 | x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; |
378 | x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; |
379 | x = rtmp + 16*pj[j]; |
380 | x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; |
381 | x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; |
382 | x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; |
383 | x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; |
384 | |
385 | x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; |
386 | x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; |
387 | x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; |
388 | x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; |
389 | |
390 | x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; |
391 | x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; |
392 | x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; |
393 | x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; |
394 | |
395 | x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; |
396 | x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; |
397 | x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; |
398 | x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; |
399 | |
400 | pv += 16; |
401 | } |
402 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); |
403 | } |
404 | row = *ajtmp++; |
405 | } |
406 | /* finished row so stick it into b->a */ |
407 | pv = ba + 16*bi[i]; |
408 | pj = bj + bi[i]; |
409 | nz = bi[i+1] - bi[i]; |
410 | for (j=0; j<nz; j++) { |
411 | x = rtmp+16*pj[j]; |
412 | pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; |
413 | pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; |
414 | pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; |
415 | pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; |
416 | pv += 16; |
417 | } |
418 | /* invert diagonal block */ |
419 | w = ba + 16*diag_offset[i]; |
420 | if (pivotinblocks) { |
421 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); |
422 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
423 | } else { |
424 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); |
425 | } |
426 | } |
427 | |
428 | ierr = PetscFree(rtmp);CHKERRQ(ierr); |
429 | |
430 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; |
431 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; |
432 | C->assembled = PETSC_TRUE; |
433 | |
434 | ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ |
435 | PetscFunctionReturn(0); |
436 | } |
437 | |
438 | /* |
439 | MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - |
440 | copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() |
441 | */ |
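/* As in MatLUFactorNumeric_SeqBAIJ_4() above, L and U are stored separately: |
   bi/bj walk the L part forward, while bdiag decreases with i, so |
   bdiag[i]-bdiag[i+1]-1 counts the off-diagonal blocks of U(i,:) (see the |
   index arithmetic below). */ |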
442 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) |
443 | { |
444 | Mat C =B; |
445 | Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; |
446 | PetscErrorCode ierr; |
447 | PetscInt i,j,k,nz,nzL,row; |
448 | const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; |
449 | const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; |
450 | MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; |
451 | PetscInt flg; |
452 | PetscReal shift; |
453 | PetscBool allowzeropivot,zeropivotdetected; |
454 | |
455 | PetscFunctionBegin; |
456 | allowzeropivot = PetscNot(A->erroriffailure); |
457 | |
458 | /* generate work space needed by the factorization */ |
459 | ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); |
460 | ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); |
461 | |
462 | if (info->shifttype == MAT_SHIFT_NONE) { |
463 | shift = 0; |
464 | } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ |
465 | shift = info->shiftamount; |
466 | } |
467 | |
468 | for (i=0; i<n; i++) { |
469 | /* zero rtmp */ |
470 | /* L part */ |
471 | nz = bi[i+1] - bi[i]; |
472 | bjtmp = bj + bi[i]; |
473 | for (j=0; j<nz; j++) { |
474 | ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); |
475 | } |
476 | |
477 | /* U part */ |
478 | nz = bdiag[i] - bdiag[i+1]; |
479 | bjtmp = bj + bdiag[i+1]+1; |
480 | for (j=0; j<nz; j++) { |
481 | ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); |
482 | } |
483 | |
484 | /* load in initial (unfactored) row */ |
485 | nz = ai[i+1] - ai[i]; |
486 | ajtmp = aj + ai[i]; |
487 | v = aa + bs2*ai[i]; |
488 | for (j=0; j<nz; j++) { |
489 | ierr = PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);CHKERRQ(ierr); |
490 | } |
491 | |
492 | /* elimination */ |
493 | bjtmp = bj + bi[i]; |
494 | nzL = bi[i+1] - bi[i]; |
495 | for (k=0; k < nzL; k++) { |
496 | row = bjtmp[k]; |
497 | pc = rtmp + bs2*row; |
498 | for (flg=0,j=0; j<bs2; j++) { |
499 | if (pc[j]!=0.0) { |
500 | flg = 1; |
501 | break; |
502 | } |
503 | } |
504 | if (flg) { |
505 | pv = b->a + bs2*bdiag[row]; |
506 | /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ |
507 | ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); |
Value stored to 'ierr' is never read | |
508 | |
509 | pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ |
510 | pv = b->a + bs2*(bdiag[row+1]+1); |
511 | nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */ |
512 | for (j=0; j<nz; j++) { |
513 | /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ |
514 | /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ |
515 | v = rtmp + bs2*pj[j]; |
516 | ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); |
517 | pv += bs2; |
518 | } |
519 | ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */ |
520 | } |
521 | } |
522 | |
523 | /* finished row so stick it into b->a */ |
524 | /* L part */ |
525 | pv = b->a + bs2*bi[i]; |
526 | pj = b->j + bi[i]; |
527 | nz = bi[i+1] - bi[i]; |
528 | for (j=0; j<nz; j++) { |
529 | ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); |
530 | } |
531 | |
532 | /* Mark diagonal and invert diagonal for simpler triangular solves */ |
533 | pv = b->a + bs2*bdiag[i]; |
534 | pj = b->j + bdiag[i]; |
535 | ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); |
536 | ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); |
537 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
538 | |
539 | /* U part */ |
540 | pv = b->a + bs2*(bdiag[i+1]+1); |
541 | pj = b->j + bdiag[i+1]+1; |
542 | nz = bdiag[i] - bdiag[i+1] - 1; |
543 | for (j=0; j<nz; j++) { |
544 | ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); |
545 | } |
546 | } |
547 | ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); |
548 | |
549 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; |
550 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; |
551 | C->assembled = PETSC_TRUE; |
552 | |
553 | ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ |
554 | PetscFunctionReturn(0); |
555 | } |
556 | |
557 | #if defined(PETSC_HAVE_SSE) |
558 | |
559 | #include PETSC_HAVE_SSE |
560 | |
561 | /* SSE version for when blocks are 4 by 4, using natural ordering */ |
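/* Sketch of the SSE scheme below (single precision): a 4x4 block occupies four |
   XMM registers, one column each; LOADL/LOADH and STOREL/STOREH move the low and |
   high float pairs of a column, and XMM7 is zeroed once per row so the |
   block-vs-zero test reduces to packed compares plus a MOVEMASK into an int. */ |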
562 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) |
563 | { |
564 | Mat C = B; Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; /* C aliases the factor matrix B, as in the variants above */ |
565 | PetscErrorCode ierr; |
566 | int i,j,n = a->mbs; |
567 | int *bj = b->j,*bjtmp,*pj; |
568 | int row; |
569 | int *ajtmpold,nz,*bi=b->i; |
570 | int *diag_offset = b->diag,*ai=a->i,*aj=a->j; |
571 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
572 | MatScalar *ba = b->a,*aa = a->a; |
573 | int nonzero=0; |
574 | PetscBool pivotinblocks = b->pivotinblocks; |
575 | PetscReal shift = info->shiftamount; |
576 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
577 | |
578 | PetscFunctionBegin; |
579 | allowzeropivot = PetscNot(A->erroriffailure); |
580 | SSE_SCOPE_BEGIN; |
581 | |
582 | if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); |
583 | if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); |
584 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); |
585 | if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); |
586 | /* if ((unsigned long)bj==(unsigned long)aj) { */ |
587 | /* colscale = 4; */ |
588 | /* } */ |
589 | for (i=0; i<n; i++) { |
590 | nz = bi[i+1] - bi[i]; |
591 | bjtmp = bj + bi[i]; |
592 | /* zero out the 4x4 block accumulators */ |
593 | /* zero out one register */ |
594 | XOR_PS(XMM7,XMM7); |
595 | for (j=0; j<nz; j++) { |
596 | x = rtmp+16*bjtmp[j]; |
597 | /* x = rtmp+4*bjtmp[j]; */ |
598 | SSE_INLINE_BEGIN_1(x) |
599 | /* Copy zero register to memory locations */ |
600 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
601 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) |
602 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) |
603 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) |
604 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) |
605 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) |
606 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) |
607 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
608 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) |
609 | SSE_INLINE_END_1; |
610 | } |
611 | /* load in initial (unfactored) row */ |
612 | nz = ai[i+1] - ai[i]; |
613 | ajtmpold = aj + ai[i]; |
614 | v = aa + 16*ai[i]; |
615 | for (j=0; j<nz; j++) { |
616 | x = rtmp+16*ajtmpold[j]; |
617 | /* x = rtmp+colscale*ajtmpold[j]; */ |
618 | /* Copy v block into x block */ |
619 | SSE_INLINE_BEGIN_2(v,x) |
620 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
621 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
622 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) |
623 | |
624 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) |
625 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) |
626 | |
627 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) |
628 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) |
629 | |
630 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) |
631 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) |
632 | |
633 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) |
634 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) |
635 | |
636 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) |
637 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) |
638 | |
639 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) |
640 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) |
641 | |
642 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
643 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
644 | SSE_INLINE_END_2; |
645 | |
646 | v += 16; |
647 | } |
648 | /* row = (*bjtmp++)/4; */ |
649 | row = *bjtmp++; |
650 | while (row < i) { |
651 | pc = rtmp + 16*row; |
652 | SSE_INLINE_BEGIN_1(pc) |
653 | /* Load block from lower triangle */ |
654 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
655 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
656 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) |
657 | |
658 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) |
659 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) |
660 | |
661 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) |
662 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) |
663 | |
664 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) |
665 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) |
666 | |
667 | /* Compare block to zero block */ |
668 | |
669 | SSE_COPY_PS(XMM4,XMM7) |
670 | SSE_CMPNEQ_PS(XMM4,XMM0) |
671 | |
672 | SSE_COPY_PS(XMM5,XMM7) |
673 | SSE_CMPNEQ_PS(XMM5,XMM1) |
674 | |
675 | SSE_COPY_PS(XMM6,XMM7) |
676 | SSE_CMPNEQ_PS(XMM6,XMM2) |
677 | |
678 | SSE_CMPNEQ_PS(XMM7,XMM3) |
679 | |
680 | /* Reduce the comparisons to one SSE register */ |
681 | SSE_OR_PS(XMM6,XMM7) |
682 | SSE_OR_PS(XMM5,XMM4) |
683 | SSE_OR_PS(XMM5,XMM6) |
684 | SSE_INLINE_END_1; |
685 | |
686 | /* Reduce the one SSE register to an integer register for branching */ |
687 | /* Note: Since nonzero is an int, there is no INLINE block version of this call */ |
688 | MOVEMASK(nonzero,XMM5); |
689 | |
690 | /* If block is nonzero ... */ |
691 | if (nonzero) { |
692 | pv = ba + 16*diag_offset[row]; |
693 | PREFETCH_L1(&pv[16]); |
694 | pj = bj + diag_offset[row] + 1; |
695 | |
696 | /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ |
697 | /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ |
698 | /* but the diagonal was inverted already */ |
699 | /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ |
700 | |
701 | SSE_INLINE_BEGIN_2(pv,pc) |
702 | /* Column 0, product is accumulated in XMM4 */ |
703 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) |
704 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
705 | SSE_MULT_PS(XMM4,XMM0) |
706 | |
707 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) |
708 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
709 | SSE_MULT_PS(XMM5,XMM1) |
710 | SSE_ADD_PS(XMM4,XMM5) |
711 | |
712 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) |
713 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
714 | SSE_MULT_PS(XMM6,XMM2) |
715 | SSE_ADD_PS(XMM4,XMM6) |
716 | |
717 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) |
718 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
719 | SSE_MULT_PS(XMM7,XMM3) |
720 | SSE_ADD_PS(XMM4,XMM7) |
721 | |
722 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) |
723 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) |
724 | |
725 | /* Column 1, product is accumulated in XMM5 */ |
726 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) |
727 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
728 | SSE_MULT_PS(XMM5,XMM0) |
729 | |
730 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) |
731 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
732 | SSE_MULT_PS(XMM6,XMM1) |
733 | SSE_ADD_PS(XMM5,XMM6) |
734 | |
735 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) |
736 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
737 | SSE_MULT_PS(XMM7,XMM2) |
738 | SSE_ADD_PS(XMM5,XMM7) |
739 | |
740 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) |
741 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
742 | SSE_MULT_PS(XMM6,XMM3) |
743 | SSE_ADD_PS(XMM5,XMM6) |
744 | |
745 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) |
746 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) |
747 | |
748 | SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) |
749 | |
750 | /* Column 2, product is accumulated in XMM6 */ |
751 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) |
752 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
753 | SSE_MULT_PS(XMM6,XMM0) |
754 | |
755 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) |
756 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
757 | SSE_MULT_PS(XMM7,XMM1) |
758 | SSE_ADD_PS(XMM6,XMM7) |
759 | |
760 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) |
761 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
762 | SSE_MULT_PS(XMM7,XMM2) |
763 | SSE_ADD_PS(XMM6,XMM7) |
764 | |
765 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) |
766 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
767 | SSE_MULT_PS(XMM7,XMM3) |
768 | SSE_ADD_PS(XMM6,XMM7) |
769 | |
770 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) |
771 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
772 | |
773 | /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ |
774 | /* Column 3, product is accumulated in XMM0 */ |
775 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) |
776 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
777 | SSE_MULT_PS(XMM0,XMM7) |
778 | |
779 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) |
780 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
781 | SSE_MULT_PS(XMM1,XMM7) |
782 | SSE_ADD_PS(XMM0,XMM1) |
783 | |
784 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) |
785 | SSE_SHUFFLE(XMM1,XMM1,0x00) |
786 | SSE_MULT_PS(XMM1,XMM2) |
787 | SSE_ADD_PS(XMM0,XMM1) |
788 | |
789 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) |
790 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
791 | SSE_MULT_PS(XMM3,XMM7) |
792 | SSE_ADD_PS(XMM0,XMM3) |
793 | |
794 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) |
795 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
796 | |
797 | /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ |
798 | /* This is code to be maintained and read by humans after all. */ |
799 | /* Copy Multiplier Col 3 into XMM3 */ |
800 | SSE_COPY_PS(XMM3,XMM0) |
801 | /* Copy Multiplier Col 2 into XMM2 */ |
802 | SSE_COPY_PS(XMM2,XMM6) |
803 | /* Copy Multiplier Col 1 into XMM1 */ |
804 | SSE_COPY_PS(XMM1,XMM5) |
805 | /* Copy Multiplier Col 0 into XMM0 */ |
806 | SSE_COPY_PS(XMM0,XMM4) |
807 | SSE_INLINE_END_2; |
808 | |
809 | /* Update the row: */ |
810 | nz = bi[row+1] - diag_offset[row] - 1; |
811 | pv += 16; |
812 | for (j=0; j<nz; j++) { |
813 | PREFETCH_L1(&pv[16]); |
814 | x = rtmp + 16*pj[j]; |
815 | /* x = rtmp + 4*pj[j]; */ |
816 | |
817 | /* X:=X-M*PV, One column at a time */ |
818 | /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ |
819 | SSE_INLINE_BEGIN_2(x,pv) |
820 | /* Load First Column of X */
821 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
822 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
823 | |
824 | /* Matrix-Vector Product: */ |
825 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) |
826 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
827 | SSE_MULT_PS(XMM5,XMM0) |
828 | SSE_SUB_PS(XMM4,XMM5) |
829 | |
830 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) |
831 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
832 | SSE_MULT_PS(XMM6,XMM1) |
833 | SSE_SUB_PS(XMM4,XMM6) |
834 | |
835 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) |
836 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
837 | SSE_MULT_PS(XMM7,XMM2) |
838 | SSE_SUB_PS(XMM4,XMM7) |
839 | |
840 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) |
841 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
842 | SSE_MULT_PS(XMM5,XMM3) |
843 | SSE_SUB_PS(XMM4,XMM5) |
844 | |
845 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
846 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
847 | |
848 | /* Second Column */ |
849 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
850 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
851 | |
852 | /* Matrix-Vector Product: */ |
853 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) |
854 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
855 | SSE_MULT_PS(XMM6,XMM0) |
856 | SSE_SUB_PS(XMM5,XMM6) |
857 | |
858 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) |
859 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
860 | SSE_MULT_PS(XMM7,XMM1) |
861 | SSE_SUB_PS(XMM5,XMM7) |
862 | |
863 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) |
864 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
865 | SSE_MULT_PS(XMM4,XMM2) |
866 | SSE_SUB_PS(XMM5,XMM4) |
867 | |
868 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) |
869 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
870 | SSE_MULT_PS(XMM6,XMM3) |
871 | SSE_SUB_PS(XMM5,XMM6) |
872 | |
873 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
874 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
875 | |
876 | SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) |
877 | |
878 | /* Third Column */ |
879 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
880 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
881 | |
882 | /* Matrix-Vector Product: */ |
883 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) |
884 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
885 | SSE_MULT_PS(XMM7,XMM0) |
886 | SSE_SUB_PS(XMM6,XMM7) |
887 | |
888 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) |
889 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
890 | SSE_MULT_PS(XMM4,XMM1) |
891 | SSE_SUB_PS(XMM6,XMM4) |
892 | |
893 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) |
894 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
895 | SSE_MULT_PS(XMM5,XMM2) |
896 | SSE_SUB_PS(XMM6,XMM5) |
897 | |
898 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) |
899 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
900 | SSE_MULT_PS(XMM7,XMM3) |
901 | SSE_SUB_PS(XMM6,XMM7) |
902 | |
903 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
904 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
905 | |
906 | /* Fourth Column */ |
907 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
908 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
909 | |
910 | /* Matrix-Vector Product: */ |
911 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) |
912 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
913 | SSE_MULT_PS(XMM5,XMM0) |
914 | SSE_SUB_PS(XMM4,XMM5) |
915 | |
916 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) |
917 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
918 | SSE_MULT_PS(XMM6,XMM1) |
919 | SSE_SUB_PS(XMM4,XMM6) |
920 | |
921 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) |
922 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
923 | SSE_MULT_PS(XMM7,XMM2) |
924 | SSE_SUB_PS(XMM4,XMM7) |
925 | |
926 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) |
927 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
928 | SSE_MULT_PS(XMM5,XMM3) |
929 | SSE_SUB_PS(XMM4,XMM5) |
930 | |
931 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
932 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
933 | SSE_INLINE_END_2; |
934 | pv += 16; |
935 | } |
936 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
937 | } |
938 | row = *bjtmp++; |
939 | /* row = (*bjtmp++)/4; */ |
940 | } |
941 | /* finished row so stick it into b->a */ |
942 | pv = ba + 16*bi[i]; |
943 | pj = bj + bi[i]; |
944 | nz = bi[i+1] - bi[i]; |
945 | |
946 | /* Copy x block back into pv block */ |
947 | for (j=0; j<nz; j++) { |
948 | x = rtmp+16*pj[j]; |
949 | /* x = rtmp+4*pj[j]; */ |
950 | |
951 | SSE_INLINE_BEGIN_2(x,pv) |
952 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
953 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) |
954 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) |
955 | |
956 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) |
957 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) |
958 | |
959 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) |
960 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) |
961 | |
962 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) |
963 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) |
964 | |
965 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) |
966 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) |
967 | |
968 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
969 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
970 | |
971 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
972 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) |
973 | |
974 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
975 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
976 | SSE_INLINE_END_2; |
977 | pv += 16; |
978 | } |
979 | /* invert diagonal block */ |
980 | w = ba + 16*diag_offset[i]; |
981 | if (pivotinblocks) { |
982 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
983 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
984 | } else { |
985 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
986 | } |
987 | /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ |
988 | /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
989 | } |
990 | |
991 | ierr = PetscFree(rtmp);CHKERRQ(ierr);
992 | |
993 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; |
994 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; |
995 | C->assembled = PETSC_TRUE; |
996 | |
997 | ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for 4x4 blocks */
998 | /* Flop Count from inverting diagonal blocks */ |
999 | SSE_SCOPE_END; |
1000 | PetscFunctionReturn(0);
1001 | } |
1002 | |
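/*
   A minimal scalar sketch (not part of the PETSc source) of the multiplier
   formation the SSE macro blocks above implement: with 4x4 blocks stored
   column-major (entry (r,c) at [4*c+r]), pc holds L_ij and pv holds the
   already-inverted diagonal block inv(L_jj), and the update is
   pc := pc * inv(L_jj), built one column of the product at a time.  The
   helper name Sketch_MultiplierFormation4 and the use of plain float
   (matching the single-precision SSE kernels) are illustrative assumptions.
*/
static void Sketch_MultiplierFormation4(float pc[16],const float pv[16])
{
  float m[16];
  int   r,c,k;
  for (c=0; c<4; c++) {                          /* column c of the product */
    for (r=0; r<4; r++) {
      m[4*c+r] = 0.0f;
      for (k=0; k<4; k++) m[4*c+r] += pv[4*c+k]*pc[4*k+r];
    }
  }
  /* the SSE version can overwrite pc in place because the old columns stay
     in XMM0-XMM3; a scalar version needs the temporary m */
  for (k=0; k<16; k++) pc[k] = m[k];
}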
1003 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1004 | {
1005 | Mat A = C;
1006 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; |
1007 | PetscErrorCode ierr; |
1008 | int i,j,n = a->mbs; |
1009 | unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; |
1010 | unsigned short *aj = (unsigned short*)(a->j),*ajtmp; |
1011 | unsigned int row; |
1012 | int nz,*bi=b->i; |
1013 | int *diag_offset = b->diag,*ai=a->i; |
1014 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
1015 | MatScalar *ba = b->a,*aa = a->a; |
1016 | int nonzero=0; |
1017 | PetscBool pivotinblocks = b->pivotinblocks; |
1018 | PetscReal shift = info->shiftamount; |
1019 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
1020 | |
1021 | PetscFunctionBegin;
1022 | allowzeropivot = PetscNot(A->erroriffailure);
1023 | SSE_SCOPE_BEGIN; |
1024 | |
1025 | if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1026 | if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1027 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1028 | if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1029 | /* if ((unsigned long)bj==(unsigned long)aj) { */ |
1030 | /* colscale = 4; */ |
1031 | /* } */ |
1032 | |
1033 | for (i=0; i<n; i++) { |
1034 | nz = bi[i+1] - bi[i]; |
1035 | bjtmp = bj + bi[i]; |
1036 | /* zero out the 4x4 block accumulators */ |
1037 | /* zero out one register */ |
1038 | XOR_PS(XMM7,XMM7); |
1039 | for (j=0; j<nz; j++) { |
1040 | x = rtmp+16*((unsigned int)bjtmp[j]); |
1041 | /* x = rtmp+4*bjtmp[j]; */ |
1042 | SSE_INLINE_BEGIN_1(x) |
1043 | /* Copy zero register to memory locations */ |
1044 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1045 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) |
1046 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) |
1047 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) |
1048 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) |
1049 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) |
1050 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) |
1051 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
1052 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) |
1053 | SSE_INLINE_END_1; |
1054 | } |
1055 | /* load in initial (unfactored row) */ |
1056 | nz = ai[i+1] - ai[i]; |
1057 | ajtmp = aj + ai[i]; |
1058 | v = aa + 16*ai[i]; |
1059 | for (j=0; j<nz; j++) { |
1060 | x = rtmp+16*((unsigned int)ajtmp[j]); |
1061 | /* x = rtmp+colscale*ajtmp[j]; */ |
1062 | /* Copy v block into x block */ |
1063 | SSE_INLINE_BEGIN_2(v,x) |
1064 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1065 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
1066 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) |
1067 | |
1068 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) |
1069 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) |
1070 | |
1071 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) |
1072 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) |
1073 | |
1074 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) |
1075 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) |
1076 | |
1077 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) |
1078 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) |
1079 | |
1080 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) |
1081 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) |
1082 | |
1083 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) |
1084 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) |
1085 | |
1086 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
1087 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1088 | SSE_INLINE_END_2; |
1089 | |
1090 | v += 16; |
1091 | } |
1092 | /* row = (*bjtmp++)/4; */ |
1093 | row = (unsigned int)(*bjtmp++); |
1094 | while (row < i) { |
1095 | pc = rtmp + 16*row; |
1096 | SSE_INLINE_BEGIN_1(pc) |
1097 | /* Load block from lower triangle */ |
1098 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1099 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
1100 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) |
1101 | |
1102 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) |
1103 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) |
1104 | |
1105 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) |
1106 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) |
1107 | |
1108 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) |
1109 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) |
1110 | |
1111 | /* Compare block to zero block */ |
1112 | |
1113 | SSE_COPY_PS(XMM4,XMM7) |
1114 | SSE_CMPNEQ_PS(XMM4,XMM0) |
1115 | |
1116 | SSE_COPY_PS(XMM5,XMM7) |
1117 | SSE_CMPNEQ_PS(XMM5,XMM1) |
1118 | |
1119 | SSE_COPY_PS(XMM6,XMM7) |
1120 | SSE_CMPNEQ_PS(XMM6,XMM2) |
1121 | |
1122 | SSE_CMPNEQ_PS(XMM7,XMM3) |
1123 | |
1124 | /* Reduce the comparisons to one SSE register */ |
1125 | SSE_OR_PS(XMM6,XMM7) |
1126 | SSE_OR_PS(XMM5,XMM4) |
1127 | SSE_OR_PS(XMM5,XMM6) |
1128 | SSE_INLINE_END_1; |
1129 | |
1130 | /* Reduce the one SSE register to an integer register for branching */ |
1131 | /* Note: Since nonzero is an int, there is no INLINE block version of this call */ |
1132 | MOVEMASK(nonzero,XMM5); |
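/* MOVEMASK (presumably wrapping movmskps/_mm_movemask_ps) gathers the sign
   bit of each of the four packed floats in XMM5 into the low four bits of
   nonzero; after the CMPNEQ/OR reduction above, a lane is all-ones exactly
   when some entry in that row of the block differed from zero, so
   nonzero == 0 means the whole 4x4 block is zero and the update is skipped. */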
1133 | |
1134 | /* If block is nonzero ... */ |
1135 | if (nonzero) { |
1136 | pv = ba + 16*diag_offset[row]; |
1137 | PREFETCH_L1(&pv[16]); |
1138 | pj = bj + diag_offset[row] + 1; |
1139 | |
1140 | /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ |
1141 | /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ |
1142 | /* but the diagonal was inverted already */ |
1143 | /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ |
1144 | |
1145 | SSE_INLINE_BEGIN_2(pv,pc) |
1146 | /* Column 0, product is accumulated in XMM4 */ |
1147 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) |
1148 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1149 | SSE_MULT_PS(XMM4,XMM0) |
1150 | |
1151 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) |
1152 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1153 | SSE_MULT_PS(XMM5,XMM1) |
1154 | SSE_ADD_PS(XMM4,XMM5) |
1155 | |
1156 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) |
1157 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1158 | SSE_MULT_PS(XMM6,XMM2) |
1159 | SSE_ADD_PS(XMM4,XMM6) |
1160 | |
1161 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) |
1162 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1163 | SSE_MULT_PS(XMM7,XMM3) |
1164 | SSE_ADD_PS(XMM4,XMM7) |
1165 | |
1166 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) |
1167 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) |
1168 | |
1169 | /* Column 1, product is accumulated in XMM5 */ |
1170 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) |
1171 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1172 | SSE_MULT_PS(XMM5,XMM0) |
1173 | |
1174 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) |
1175 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1176 | SSE_MULT_PS(XMM6,XMM1) |
1177 | SSE_ADD_PS(XMM5,XMM6) |
1178 | |
1179 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) |
1180 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1181 | SSE_MULT_PS(XMM7,XMM2) |
1182 | SSE_ADD_PS(XMM5,XMM7) |
1183 | |
1184 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) |
1185 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1186 | SSE_MULT_PS(XMM6,XMM3) |
1187 | SSE_ADD_PS(XMM5,XMM6) |
1188 | |
1189 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) |
1190 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) |
1191 | |
1192 | SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) |
1193 | |
1194 | /* Column 2, product is accumulated in XMM6 */ |
1195 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) |
1196 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1197 | SSE_MULT_PS(XMM6,XMM0) |
1198 | |
1199 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) |
1200 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1201 | SSE_MULT_PS(XMM7,XMM1) |
1202 | SSE_ADD_PS(XMM6,XMM7) |
1203 | |
1204 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) |
1205 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1206 | SSE_MULT_PS(XMM7,XMM2) |
1207 | SSE_ADD_PS(XMM6,XMM7) |
1208 | |
1209 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) |
1210 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1211 | SSE_MULT_PS(XMM7,XMM3) |
1212 | SSE_ADD_PS(XMM6,XMM7) |
1213 | |
1214 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) |
1215 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
1216 | |
1217 | /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ |
1218 | /* Column 3, product is accumulated in XMM0 */ |
1219 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) |
1220 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1221 | SSE_MULT_PS(XMM0,XMM7) |
1222 | |
1223 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) |
1224 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1225 | SSE_MULT_PS(XMM1,XMM7) |
1226 | SSE_ADD_PS(XMM0,XMM1) |
1227 | |
1228 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) |
1229 | SSE_SHUFFLE(XMM1,XMM1,0x00) |
1230 | SSE_MULT_PS(XMM1,XMM2) |
1231 | SSE_ADD_PS(XMM0,XMM1) |
1232 | |
1233 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) |
1234 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1235 | SSE_MULT_PS(XMM3,XMM7) |
1236 | SSE_ADD_PS(XMM0,XMM3) |
1237 | |
1238 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) |
1239 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1240 | |
1241 | /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ |
1242 | /* This is code to be maintained and read by humans after all. */
1243 | /* Copy Multiplier Col 3 into XMM3 */ |
1244 | SSE_COPY_PS(XMM3,XMM0) |
1245 | /* Copy Multiplier Col 2 into XMM2 */ |
1246 | SSE_COPY_PS(XMM2,XMM6) |
1247 | /* Copy Multiplier Col 1 into XMM1 */ |
1248 | SSE_COPY_PS(XMM1,XMM5) |
1249 | /* Copy Multiplier Col 0 into XMM0 */ |
1250 | SSE_COPY_PS(XMM0,XMM4) |
1251 | SSE_INLINE_END_2; |
1252 | |
1253 | /* Update the row: */ |
1254 | nz = bi[row+1] - diag_offset[row] - 1; |
1255 | pv += 16; |
1256 | for (j=0; j<nz; j++) { |
1257 | PREFETCH_L1(&pv[16]); |
1258 | x = rtmp + 16*((unsigned int)pj[j]); |
1259 | /* x = rtmp + 4*pj[j]; */ |
1260 | |
1261 | /* X:=X-M*PV, One column at a time */ |
1262 | /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ |
1263 | SSE_INLINE_BEGIN_2(x,pv) |
1264 | /* Load First Column of X */
1265 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
1266 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
1267 | |
1268 | /* Matrix-Vector Product: */ |
1269 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) |
1270 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1271 | SSE_MULT_PS(XMM5,XMM0) |
1272 | SSE_SUB_PS(XMM4,XMM5) |
1273 | |
1274 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) |
1275 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1276 | SSE_MULT_PS(XMM6,XMM1) |
1277 | SSE_SUB_PS(XMM4,XMM6) |
1278 | |
1279 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) |
1280 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1281 | SSE_MULT_PS(XMM7,XMM2) |
1282 | SSE_SUB_PS(XMM4,XMM7) |
1283 | |
1284 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) |
1285 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1286 | SSE_MULT_PS(XMM5,XMM3) |
1287 | SSE_SUB_PS(XMM4,XMM5) |
1288 | |
1289 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
1290 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
1291 | |
1292 | /* Second Column */ |
1293 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
1294 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
1295 | |
1296 | /* Matrix-Vector Product: */ |
1297 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) |
1298 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1299 | SSE_MULT_PS(XMM6,XMM0) |
1300 | SSE_SUB_PS(XMM5,XMM6) |
1301 | |
1302 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) |
1303 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1304 | SSE_MULT_PS(XMM7,XMM1) |
1305 | SSE_SUB_PS(XMM5,XMM7) |
1306 | |
1307 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) |
1308 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1309 | SSE_MULT_PS(XMM4,XMM2) |
1310 | SSE_SUB_PS(XMM5,XMM4) |
1311 | |
1312 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) |
1313 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1314 | SSE_MULT_PS(XMM6,XMM3) |
1315 | SSE_SUB_PS(XMM5,XMM6) |
1316 | |
1317 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
1318 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
1319 | |
1320 | SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) |
1321 | |
1322 | /* Third Column */ |
1323 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
1324 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1325 | |
1326 | /* Matrix-Vector Product: */ |
1327 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) |
1328 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1329 | SSE_MULT_PS(XMM7,XMM0) |
1330 | SSE_SUB_PS(XMM6,XMM7) |
1331 | |
1332 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) |
1333 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1334 | SSE_MULT_PS(XMM4,XMM1) |
1335 | SSE_SUB_PS(XMM6,XMM4) |
1336 | |
1337 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) |
1338 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1339 | SSE_MULT_PS(XMM5,XMM2) |
1340 | SSE_SUB_PS(XMM6,XMM5) |
1341 | |
1342 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) |
1343 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1344 | SSE_MULT_PS(XMM7,XMM3) |
1345 | SSE_SUB_PS(XMM6,XMM7) |
1346 | |
1347 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
1348 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1349 | |
1350 | /* Fourth Column */ |
1351 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
1352 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
1353 | |
1354 | /* Matrix-Vector Product: */ |
1355 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) |
1356 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1357 | SSE_MULT_PS(XMM5,XMM0) |
1358 | SSE_SUB_PS(XMM4,XMM5) |
1359 | |
1360 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) |
1361 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1362 | SSE_MULT_PS(XMM6,XMM1) |
1363 | SSE_SUB_PS(XMM4,XMM6) |
1364 | |
1365 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) |
1366 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1367 | SSE_MULT_PS(XMM7,XMM2) |
1368 | SSE_SUB_PS(XMM4,XMM7) |
1369 | |
1370 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) |
1371 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1372 | SSE_MULT_PS(XMM5,XMM3) |
1373 | SSE_SUB_PS(XMM4,XMM5) |
1374 | |
1375 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
1376 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
1377 | SSE_INLINE_END_2; |
1378 | pv += 16; |
1379 | } |
1380 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1381 | } |
1382 | row = (unsigned int)(*bjtmp++); |
1383 | /* row = (*bjtmp++)/4; */ |
1384 | /* bjtmp++; */ |
1385 | } |
1386 | /* finished row so stick it into b->a */ |
1387 | pv = ba + 16*bi[i]; |
1388 | pj = bj + bi[i]; |
1389 | nz = bi[i+1] - bi[i]; |
1390 | |
1391 | /* Copy x block back into pv block */ |
1392 | for (j=0; j<nz; j++) { |
1393 | x = rtmp+16*((unsigned int)pj[j]); |
1394 | /* x = rtmp+4*pj[j]; */ |
1395 | |
1396 | SSE_INLINE_BEGIN_2(x,pv) |
1397 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1398 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) |
1399 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) |
1400 | |
1401 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) |
1402 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) |
1403 | |
1404 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) |
1405 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) |
1406 | |
1407 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) |
1408 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) |
1409 | |
1410 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) |
1411 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) |
1412 | |
1413 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1414 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
1415 | |
1416 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
1417 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) |
1418 | |
1419 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
1420 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1421 | SSE_INLINE_END_2; |
1422 | pv += 16; |
1423 | } |
1424 | /* invert diagonal block */ |
1425 | w = ba + 16*diag_offset[i]; |
1426 | if (pivotinblocks) { |
1427 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1428 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
1429 | } else { |
1430 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1431 | } |
1432 | /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1433 | } |
1434 | |
1435 | ierr = PetscFree(rtmp);CHKERRQ(ierr);
1436 | |
1437 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; |
1438 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; |
1439 | C->assembled = PETSC_TRUE; |
1440 | |
1441 | ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for 4x4 blocks */
1442 | /* Flop Count from inverting diagonal blocks */ |
1443 | SSE_SCOPE_END; |
1444 | PetscFunctionReturn(0);
1445 | } |
1446 | |
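/*
   A minimal sketch (not in the source) of the precondition the _usj variants
   rely on: b->j is read through an unsigned short pointer, which is only
   sound if the block-column indices were repacked to 16 bits (presumably by
   the matching symbolic factorization), i.e. if every index 0..mbs-1 fits in
   an unsigned short.  The helper name Sketch_UsjIndicesFit is hypothetical.
*/
static int Sketch_UsjIndicesFit(int mbs)
{
  /* largest stored block index is mbs-1; it must fit in an unsigned short
     (USHRT_MAX == 65535 on the targets this SSE code assumes) */
  return mbs > 0 && mbs-1 <= 65535;
}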
1447 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) |
1448 | { |
1449 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; |
1450 | PetscErrorCode ierr; |
1451 | int i,j,n = a->mbs; |
1452 | unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; |
1453 | unsigned int row; |
1454 | int *ajtmpold,nz,*bi=b->i; |
1455 | int *diag_offset = b->diag,*ai=a->i,*aj=a->j; |
1456 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
1457 | MatScalar *ba = b->a,*aa = a->a; |
1458 | int nonzero=0; |
1459 | PetscBool pivotinblocks = b->pivotinblocks; |
1460 | PetscReal shift = info->shiftamount; |
1461 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
1462 | |
1463 | PetscFunctionBegin;
1464 | allowzeropivot = PetscNot(A->erroriffailure);
1465 | SSE_SCOPE_BEGIN; |
1466 | |
1467 | if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1468 | if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1469 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1470 | if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1471 | /* if ((unsigned long)bj==(unsigned long)aj) { */ |
1472 | /* colscale = 4; */ |
1473 | /* } */ |
1474 | if ((unsigned long)bj==(unsigned long)aj) { |
1475 | return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1476 | } |
1477 | |
1478 | for (i=0; i<n; i++) { |
1479 | nz = bi[i+1] - bi[i]; |
1480 | bjtmp = bj + bi[i]; |
1481 | /* zero out the 4x4 block accumulators */ |
1482 | /* zero out one register */ |
1483 | XOR_PS(XMM7,XMM7); |
1484 | for (j=0; j<nz; j++) { |
1485 | x = rtmp+16*((unsigned int)bjtmp[j]); |
1486 | /* x = rtmp+4*bjtmp[j]; */ |
1487 | SSE_INLINE_BEGIN_1(x) |
1488 | /* Copy zero register to memory locations */ |
1489 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1490 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) |
1491 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) |
1492 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) |
1493 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) |
1494 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) |
1495 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) |
1496 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
1497 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) |
1498 | SSE_INLINE_END_1; |
1499 | } |
1500 | /* load in initial (unfactored row) */ |
1501 | nz = ai[i+1] - ai[i]; |
1502 | ajtmpold = aj + ai[i]; |
1503 | v = aa + 16*ai[i]; |
1504 | for (j=0; j<nz; j++) { |
1505 | x = rtmp+16*ajtmpold[j]; |
1506 | /* x = rtmp+colscale*ajtmpold[j]; */ |
1507 | /* Copy v block into x block */ |
1508 | SSE_INLINE_BEGIN_2(v,x) |
1509 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1510 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
1511 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) |
1512 | |
1513 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) |
1514 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) |
1515 | |
1516 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) |
1517 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) |
1518 | |
1519 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) |
1520 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) |
1521 | |
1522 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) |
1523 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) |
1524 | |
1525 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) |
1526 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) |
1527 | |
1528 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) |
1529 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) |
1530 | |
1531 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
1532 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1533 | SSE_INLINE_END_2; |
1534 | |
1535 | v += 16; |
1536 | } |
1537 | /* row = (*bjtmp++)/4; */ |
1538 | row = (unsigned int)(*bjtmp++); |
1539 | while (row < i) { |
1540 | pc = rtmp + 16*row; |
1541 | SSE_INLINE_BEGIN_1(pc) |
1542 | /* Load block from lower triangle */ |
1543 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1544 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
1545 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) |
1546 | |
1547 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) |
1548 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) |
1549 | |
1550 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) |
1551 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) |
1552 | |
1553 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) |
1554 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) |
1555 | |
1556 | /* Compare block to zero block */ |
1557 | |
1558 | SSE_COPY_PS(XMM4,XMM7) |
1559 | SSE_CMPNEQ_PS(XMM4,XMM0) |
1560 | |
1561 | SSE_COPY_PS(XMM5,XMM7) |
1562 | SSE_CMPNEQ_PS(XMM5,XMM1) |
1563 | |
1564 | SSE_COPY_PS(XMM6,XMM7) |
1565 | SSE_CMPNEQ_PS(XMM6,XMM2) |
1566 | |
1567 | SSE_CMPNEQ_PS(XMM7,XMM3) |
1568 | |
1569 | /* Reduce the comparisons to one SSE register */ |
1570 | SSE_OR_PS(XMM6,XMM7) |
1571 | SSE_OR_PS(XMM5,XMM4) |
1572 | SSE_OR_PS(XMM5,XMM6) |
1573 | SSE_INLINE_END_1; |
1574 | |
1575 | /* Reduce the one SSE register to an integer register for branching */ |
1576 | /* Note: Since nonzero is an int, there is no INLINE block version of this call */ |
1577 | MOVEMASK(nonzero,XMM5); |
1578 | |
1579 | /* If block is nonzero ... */ |
1580 | if (nonzero) { |
1581 | pv = ba + 16*diag_offset[row]; |
1582 | PREFETCH_L1(&pv[16]); |
1583 | pj = bj + diag_offset[row] + 1; |
1584 | |
1585 | /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ |
1586 | /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ |
1587 | /* but the diagonal was inverted already */ |
1588 | /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ |
1589 | |
1590 | SSE_INLINE_BEGIN_2(pv,pc) |
1591 | /* Column 0, product is accumulated in XMM4 */ |
1592 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) |
1593 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1594 | SSE_MULT_PS(XMM4,XMM0) |
1595 | |
1596 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) |
1597 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1598 | SSE_MULT_PS(XMM5,XMM1) |
1599 | SSE_ADD_PS(XMM4,XMM5) |
1600 | |
1601 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) |
1602 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1603 | SSE_MULT_PS(XMM6,XMM2) |
1604 | SSE_ADD_PS(XMM4,XMM6) |
1605 | |
1606 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) |
1607 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1608 | SSE_MULT_PS(XMM7,XMM3) |
1609 | SSE_ADD_PS(XMM4,XMM7) |
1610 | |
1611 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) |
1612 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) |
1613 | |
1614 | /* Column 1, product is accumulated in XMM5 */ |
1615 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) |
1616 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1617 | SSE_MULT_PS(XMM5,XMM0) |
1618 | |
1619 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) |
1620 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1621 | SSE_MULT_PS(XMM6,XMM1) |
1622 | SSE_ADD_PS(XMM5,XMM6) |
1623 | |
1624 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) |
1625 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1626 | SSE_MULT_PS(XMM7,XMM2) |
1627 | SSE_ADD_PS(XMM5,XMM7) |
1628 | |
1629 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) |
1630 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1631 | SSE_MULT_PS(XMM6,XMM3) |
1632 | SSE_ADD_PS(XMM5,XMM6) |
1633 | |
1634 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) |
1635 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) |
1636 | |
1637 | SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) |
1638 | |
1639 | /* Column 2, product is accumulated in XMM6 */ |
1640 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) |
1641 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1642 | SSE_MULT_PS(XMM6,XMM0) |
1643 | |
1644 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) |
1645 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1646 | SSE_MULT_PS(XMM7,XMM1) |
1647 | SSE_ADD_PS(XMM6,XMM7) |
1648 | |
1649 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) |
1650 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1651 | SSE_MULT_PS(XMM7,XMM2) |
1652 | SSE_ADD_PS(XMM6,XMM7) |
1653 | |
1654 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) |
1655 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1656 | SSE_MULT_PS(XMM7,XMM3) |
1657 | SSE_ADD_PS(XMM6,XMM7) |
1658 | |
1659 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) |
1660 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
1661 | |
1662 | /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ |
1663 | /* Column 3, product is accumulated in XMM0 */ |
1664 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) |
1665 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1666 | SSE_MULT_PS(XMM0,XMM7) |
1667 | |
1668 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) |
1669 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1670 | SSE_MULT_PS(XMM1,XMM7) |
1671 | SSE_ADD_PS(XMM0,XMM1) |
1672 | |
1673 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) |
1674 | SSE_SHUFFLE(XMM1,XMM1,0x00) |
1675 | SSE_MULT_PS(XMM1,XMM2) |
1676 | SSE_ADD_PS(XMM0,XMM1) |
1677 | |
1678 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) |
1679 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1680 | SSE_MULT_PS(XMM3,XMM7) |
1681 | SSE_ADD_PS(XMM0,XMM3) |
1682 | |
1683 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) |
1684 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1685 | |
1686 | /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ |
1687 | /* This is code to be maintained and read by humans after all. */
1688 | /* Copy Multiplier Col 3 into XMM3 */ |
1689 | SSE_COPY_PS(XMM3,XMM0) |
1690 | /* Copy Multiplier Col 2 into XMM2 */ |
1691 | SSE_COPY_PS(XMM2,XMM6) |
1692 | /* Copy Multiplier Col 1 into XMM1 */ |
1693 | SSE_COPY_PS(XMM1,XMM5) |
1694 | /* Copy Multiplier Col 0 into XMM0 */ |
1695 | SSE_COPY_PS(XMM0,XMM4) |
1696 | SSE_INLINE_END_2; |
1697 | |
1698 | /* Update the row: */ |
1699 | nz = bi[row+1] - diag_offset[row] - 1; |
1700 | pv += 16; |
1701 | for (j=0; j<nz; j++) { |
1702 | PREFETCH_L1(&pv[16]); |
1703 | x = rtmp + 16*((unsigned int)pj[j]); |
1704 | /* x = rtmp + 4*pj[j]; */ |
1705 | |
1706 | /* X:=X-M*PV, One column at a time */ |
1707 | /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ |
1708 | SSE_INLINE_BEGIN_2(x,pv) |
1709 | /* Load First Column of X */
1710 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
1711 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
1712 | |
1713 | /* Matrix-Vector Product: */ |
1714 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) |
1715 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1716 | SSE_MULT_PS(XMM5,XMM0) |
1717 | SSE_SUB_PS(XMM4,XMM5) |
1718 | |
1719 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) |
1720 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1721 | SSE_MULT_PS(XMM6,XMM1) |
1722 | SSE_SUB_PS(XMM4,XMM6) |
1723 | |
1724 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) |
1725 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1726 | SSE_MULT_PS(XMM7,XMM2) |
1727 | SSE_SUB_PS(XMM4,XMM7) |
1728 | |
1729 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) |
1730 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1731 | SSE_MULT_PS(XMM5,XMM3) |
1732 | SSE_SUB_PS(XMM4,XMM5) |
1733 | |
1734 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
1735 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
1736 | |
1737 | /* Second Column */ |
1738 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
1739 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
1740 | |
1741 | /* Matrix-Vector Product: */ |
1742 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) |
1743 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1744 | SSE_MULT_PS(XMM6,XMM0) |
1745 | SSE_SUB_PS(XMM5,XMM6) |
1746 | |
1747 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) |
1748 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1749 | SSE_MULT_PS(XMM7,XMM1) |
1750 | SSE_SUB_PS(XMM5,XMM7) |
1751 | |
1752 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) |
1753 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1754 | SSE_MULT_PS(XMM4,XMM2) |
1755 | SSE_SUB_PS(XMM5,XMM4) |
1756 | |
1757 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) |
1758 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1759 | SSE_MULT_PS(XMM6,XMM3) |
1760 | SSE_SUB_PS(XMM5,XMM6) |
1761 | |
1762 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
1763 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
1764 | |
1765 | SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) |
1766 | |
1767 | /* Third Column */ |
1768 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
1769 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1770 | |
1771 | /* Matrix-Vector Product: */ |
1772 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) |
1773 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1774 | SSE_MULT_PS(XMM7,XMM0) |
1775 | SSE_SUB_PS(XMM6,XMM7) |
1776 | |
1777 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) |
1778 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1779 | SSE_MULT_PS(XMM4,XMM1) |
1780 | SSE_SUB_PS(XMM6,XMM4) |
1781 | |
1782 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) |
1783 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1784 | SSE_MULT_PS(XMM5,XMM2) |
1785 | SSE_SUB_PS(XMM6,XMM5) |
1786 | |
1787 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) |
1788 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1789 | SSE_MULT_PS(XMM7,XMM3) |
1790 | SSE_SUB_PS(XMM6,XMM7) |
1791 | |
1792 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
1793 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1794 | |
1795 | /* Fourth Column */ |
1796 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
1797 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
1798 | |
1799 | /* Matrix-Vector Product: */ |
1800 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) |
1801 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1802 | SSE_MULT_PS(XMM5,XMM0) |
1803 | SSE_SUB_PS(XMM4,XMM5) |
1804 | |
1805 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) |
1806 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1807 | SSE_MULT_PS(XMM6,XMM1) |
1808 | SSE_SUB_PS(XMM4,XMM6) |
1809 | |
1810 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) |
1811 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1812 | SSE_MULT_PS(XMM7,XMM2) |
1813 | SSE_SUB_PS(XMM4,XMM7) |
1814 | |
1815 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) |
1816 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1817 | SSE_MULT_PS(XMM5,XMM3) |
1818 | SSE_SUB_PS(XMM4,XMM5) |
1819 | |
1820 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
1821 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
1822 | SSE_INLINE_END_2; |
1823 | pv += 16; |
1824 | } |
1825 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1826 | } |
1827 | row = (unsigned int)(*bjtmp++); |
1828 | /* row = (*bjtmp++)/4; */ |
1829 | /* bjtmp++; */ |
1830 | } |
1831 | /* finished row so stick it into b->a */ |
1832 | pv = ba + 16*bi[i]; |
1833 | pj = bj + bi[i]; |
1834 | nz = bi[i+1] - bi[i]; |
1835 | |
1836 | /* Copy x block back into pv block */ |
1837 | for (j=0; j<nz; j++) { |
1838 | x = rtmp+16*((unsigned int)pj[j]); |
1839 | /* x = rtmp+4*pj[j]; */ |
1840 | |
1841 | SSE_INLINE_BEGIN_2(x,pv) |
1842 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1843 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) |
1844 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) |
1845 | |
1846 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) |
1847 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) |
1848 | |
1849 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) |
1850 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) |
1851 | |
1852 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) |
1853 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) |
1854 | |
1855 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) |
1856 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) |
1857 | |
1858 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1859 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
1860 | |
1861 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
1862 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) |
1863 | |
1864 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
1865 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1866 | SSE_INLINE_END_2; |
1867 | pv += 16; |
1868 | } |
1869 | /* invert diagonal block */ |
1870 | w = ba + 16*diag_offset[i]; |
1871 | if (pivotinblocks) { |
1872 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1873 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
1874 | } else { |
1875 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1876 | } |
1877 | /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ |
1878 | /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1879 | } |
1880 | |
1881 | ierr = PetscFree(rtmp);CHKERRQ(ierr);
1882 | |
1883 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; |
1884 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; |
1885 | C->assembled = PETSC_TRUE; |
1886 | |
1887 | ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for 4x4 blocks */
1888 | /* Flop Count from inverting diagonal blocks */ |
1889 | SSE_SCOPE_END; |
1890 | PetscFunctionReturn(0);
1891 | } |
1892 | |
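/*
   A readable scalar sketch (not part of the source) of what the
   PetscKernel_A_gets_inverse_A_4_nopivot kernel used above does: an in-place
   Gauss-Jordan inversion of a column-major 4x4 block with no row exchanges,
   so it is only safe when every pivot it meets is nonzero; the pivotinblocks
   branch therefore calls the checked PetscKernel_A_gets_inverse_A_4 instead.
   The helper name Sketch_Inverse4NoPivot and the plain float type are
   illustrative assumptions.
*/
static void Sketch_Inverse4NoPivot(float w[16])
{
  int k,r,c;
  for (k=0; k<4; k++) {
    float di = w[4*k+k];                            /* pivot, assumed nonzero */
    float d  = 1.0f/di;
    w[4*k+k] = d;
    for (c=0; c<4; c++) if (c!=k) w[4*c+k] *= -d;   /* scale pivot row    */
    for (r=0; r<4; r++) if (r!=k) w[4*k+r] *=  d;   /* scale pivot column */
    for (c=0; c<4; c++) if (c!=k) {
      for (r=0; r<4; r++) if (r!=k) {
        w[4*c+r] += w[4*c+k]*w[4*k+r]*di;           /* rank-1 update with the old pivot */
      }
    }
  }
}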
1893 | #endif |