File: mat/impls/baij/seq/baijfact11.c
Warning: line 507, column 9: Value stored to 'ierr' is never read
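Note on the warning: PetscKernel_A_gets_A_times_B_4() is a macro whose expansion
begins with "0; { ierr = ...; ... }", so the statement at line 507 first stores
the constant 0 into 'ierr' and then overwrites it inside the block before any
read -- a dead store. A minimal sketch of the flagged pattern (foo() is a
hypothetical stand-in for the memcpy done inside the macro, not part of this file):

    ierr = 0;                         /* dead store: this value is never read ... */
    { ierr = foo(); CHKERRQ(ierr); }  /* ... because it is overwritten here first */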
1 | |
2 | /* |
3 | Factorization code for BAIJ format. |
4 | */ |
5 | #include <../src/mat/impls/baij/seq/baij.h> |
6 | #include <petsc/private/kernels/blockinvert.h> |
7 | |
8 | /* ------------------------------------------------------------*/ |
9 | /* |
10 | Version for when blocks are 4 by 4 |
11 | */ |
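/* Note: the 4x4 blocks are stored column-major -- entry (r,c) of a block lives at |
   index 4*c+r, so pc[0..3] is column 0, pc[4..7] column 1, and so on (this layout |
   can be read off the kernels below, e.g. pc[0] = p1*x1 + p5*x2 + p9*x3 + p13*x4 |
   is entry (0,0) of the block product). */ |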
12 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) |
13 | { |
14 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; |
15 | IS isrow = b->row,isicol = b->icol; |
16 | PetscErrorCode ierr; |
17 | const PetscInt *r,*ic; |
18 | PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; |
19 | PetscInt *ajtmpold,*ajtmp,nz,row; |
20 | PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; |
21 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
22 | MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; |
23 | MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; |
24 | MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; |
25 | MatScalar m13,m14,m15,m16; |
26 | MatScalar *ba = b->a,*aa = a->a; |
27 | PetscBool pivotinblocks = b->pivotinblocks; |
28 | PetscReal shift = info->shiftamount; |
29 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
30 | |
31 | PetscFunctionBegin; |
32 | ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); |
33 | ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); |
34 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); |
35 | allowzeropivot = PetscNot(A->erroriffailure); |
36 | |
37 | for (i=0; i<n; i++) { |
38 | nz = bi[i+1] - bi[i]; |
39 | ajtmp = bj + bi[i]; |
40 | for (j=0; j<nz; j++) { |
41 | x = rtmp+16*ajtmp[j]; |
42 | x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; |
43 | x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; |
44 | } |
45 | /* load in initial (unfactored) row */ |
46 | idx = r[i]; |
47 | nz = ai[idx+1] - ai[idx]; |
48 | ajtmpold = aj + ai[idx]; |
49 | v = aa + 16*ai[idx]; |
50 | for (j=0; j<nz; j++) { |
51 | x = rtmp+16*ic[ajtmpold[j]]; |
52 | x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; |
53 | x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; |
54 | x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; |
55 | x[14] = v[14]; x[15] = v[15]; |
56 | v += 16; |
57 | } |
58 | row = *ajtmp++; |
59 | while (row < i) { |
60 | pc = rtmp + 16*row; |
61 | p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; |
62 | p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; |
63 | p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; |
64 | p15 = pc[14]; p16 = pc[15]; |
65 | if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || |
66 | p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || |
67 | p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 |
68 | || p16 != 0.0) { |
69 | pv = ba + 16*diag_offset[row]; |
70 | pj = bj + diag_offset[row] + 1; |
71 | x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; |
72 | x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; |
73 | x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; |
74 | x15 = pv[14]; x16 = pv[15]; |
75 | pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; |
76 | pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; |
77 | pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; |
78 | pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; |
79 | |
80 | pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; |
81 | pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; |
82 | pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; |
83 | pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; |
84 | |
85 | pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; |
86 | pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; |
87 | pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; |
88 | pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; |
89 | |
90 | pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; |
91 | pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; |
92 | pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; |
93 | pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; |
94 | |
95 | nz = bi[row+1] - diag_offset[row] - 1; |
96 | pv += 16; |
97 | for (j=0; j<nz; j++) { |
98 | x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; |
99 | x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; |
100 | x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; |
101 | x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; |
102 | x = rtmp + 16*pj[j]; |
103 | x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; |
104 | x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; |
105 | x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; |
106 | x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; |
107 | |
108 | x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; |
109 | x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; |
110 | x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; |
111 | x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; |
112 | |
113 | x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; |
114 | x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; |
115 | x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; |
116 | x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; |
117 | |
118 | x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; |
119 | x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; |
120 | x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; |
121 | x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; |
122 | |
123 | pv += 16; |
124 | } |
125 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); |
126 | } |
127 | row = *ajtmp++; |
128 | } |
129 | /* finished row so stick it into b->a */ |
130 | pv = ba + 16*bi[i]; |
131 | pj = bj + bi[i]; |
132 | nz = bi[i+1] - bi[i]; |
133 | for (j=0; j<nz; j++) { |
134 | x = rtmp+16*pj[j]; |
135 | pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; |
136 | pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; |
137 | pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; |
138 | pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; |
139 | pv += 16; |
140 | } |
141 | /* invert diagonal block */ |
142 | w = ba + 16*diag_offset[i]; |
143 | if (pivotinblocks) { |
144 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); |
145 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
146 | } else { |
147 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); |
148 | } |
149 | } |
150 | |
151 | ierr = PetscFree(rtmp);CHKERRQ(ierr); |
152 | ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); |
153 | ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); |
154 | |
155 | C->ops->solve = MatSolve_SeqBAIJ_4_inplace; |
156 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; |
157 | C->assembled = PETSC_TRUE; |
158 | |
159 | ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ |
160 | PetscFunctionReturn(0); |
161 | } |
162 | |
163 | /* MatLUFactorNumeric_SeqBAIJ_4 - |
164 | copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented |
165 | PetscKernel_A_gets_A_times_B() |
166 | PetscKernel_A_gets_A_minus_B_times_C() |
167 | PetscKernel_A_gets_inverse_A() |
168 | */ |
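/* In block terms (per the inline comments in the elimination loop below), the |
   three kernels act on column-major 4x4 blocks as: |
     A_gets_A_times_B:         A <- A*B        (multiplier formation) |
     A_gets_A_minus_B_times_C: A <- A - B*C    (trailing-row update) |
     A_gets_inverse_A:         A <- inv(A)     (diagonal block, with shift) */ |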
169 | |
170 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) |
171 | { |
172 | Mat C = B; |
173 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; |
174 | IS isrow = b->row,isicol = b->icol; |
175 | PetscErrorCode ierr; |
176 | const PetscInt *r,*ic; |
177 | PetscInt i,j,k,nz,nzL,row; |
178 | const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; |
179 | const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; |
180 | MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; |
181 | PetscInt flg; |
182 | PetscReal shift; |
183 | PetscBool allowzeropivot,zeropivotdetected; |
184 | |
185 | PetscFunctionBegin; |
186 | allowzeropivot = PetscNot(A->erroriffailure); |
187 | ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); |
188 | ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); |
189 | |
190 | if (info->shifttype == MAT_SHIFT_NONE) { |
191 | shift = 0; |
192 | } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ |
193 | shift = info->shiftamount; |
194 | } |
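/* Sketch of the shift semantics assumed here: with MAT_SHIFT_INBLOCKS the |
   'shift' amount is handed to PetscKernel_A_gets_inverse_A_4() below, which |
   applies it when a pivot inside a diagonal block would otherwise be zero. */ |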
195 | |
196 | /* generate work space needed by the factorization */ |
197 | ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); |
198 | ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); |
199 | |
200 | for (i=0; i<n; i++) { |
201 | /* zero rtmp */ |
202 | /* L part */ |
203 | nz = bi[i+1] - bi[i]; |
204 | bjtmp = bj + bi[i]; |
205 | for (j=0; j<nz; j++) { |
206 | ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); |
207 | } |
208 | |
209 | /* U part */ |
210 | nz = bdiag[i]-bdiag[i+1]; |
211 | bjtmp = bj + bdiag[i+1]+1; |
212 | for (j=0; j<nz; j++) { |
213 | ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); |
214 | } |
215 | |
216 | /* load in initial (unfactored) row */ |
217 | nz = ai[r[i]+1] - ai[r[i]]; |
218 | ajtmp = aj + ai[r[i]]; |
219 | v = aa + bs2*ai[r[i]]; |
220 | for (j=0; j<nz; j++) { |
221 | ierr = PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);CHKERRQ(ierr); |
222 | } |
223 | |
224 | /* elimination */ |
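/* The loop below is a standard blocked LU sweep: for each nonzero block in |
   row i of L, scale it by the (already inverted) diagonal block of the pivot |
   row to form the multiplier, then subtract multiplier*U(row,:) from the |
   work row rtmp. */ |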
225 | bjtmp = bj + bi[i]; |
226 | nzL = bi[i+1] - bi[i]; |
227 | for (k=0; k < nzL; k++) { |
228 | row = bjtmp[k]; |
229 | pc = rtmp + bs2*row; |
230 | for (flg=0,j=0; j<bs2; j++) { |
231 | if (pc[j]!=0.0) { |
232 | flg = 1; |
233 | break; |
234 | } |
235 | } |
236 | if (flg) { |
237 | pv = b->a + bs2*bdiag[row]; |
238 | /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ |
239 | ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); |
240 | |
241 | pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ |
242 | pv = b->a + bs2*(bdiag[row+1]+1); |
243 | nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */ |
244 | for (j=0; j<nz; j++) { |
245 | /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ |
246 | /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ |
247 | v = rtmp + bs2*pj[j]; |
248 | ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); |
249 | pv += bs2; |
250 | } |
251 | ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */ |
252 | } |
253 | } |
254 | |
255 | /* finished row so stick it into b->a */ |
256 | /* L part */ |
257 | pv = b->a + bs2*bi[i]; |
258 | pj = b->j + bi[i]; |
259 | nz = bi[i+1] - bi[i]; |
260 | for (j=0; j<nz; j++) { |
261 | ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); |
262 | } |
263 | |
264 | /* Mark diagonal and invert diagonal for simpler triangular solves */ |
265 | pv = b->a + bs2*bdiag[i]; |
266 | pj = b->j + bdiag[i]; |
267 | ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); |
268 | ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); |
269 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
270 | |
271 | /* U part */ |
272 | pv = b->a + bs2*(bdiag[i+1]+1); |
273 | pj = b->j + bdiag[i+1]+1; |
274 | nz = bdiag[i] - bdiag[i+1] - 1; |
275 | for (j=0; j<nz; j++) { |
276 | ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); |
277 | } |
278 | } |
279 | |
280 | ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); |
281 | ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); |
282 | ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); |
283 | |
284 | C->ops->solve = MatSolve_SeqBAIJ_4; |
285 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; |
286 | C->assembled = PETSC_TRUE; |
287 | |
288 | ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ |
289 | PetscFunctionReturn(0); |
290 | } |
291 | |
292 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info) |
293 | { |
294 | /* |
295 | Default version for when blocks are 4 by 4, using natural ordering |
296 | */ |
297 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; |
298 | PetscErrorCode ierr; |
299 | PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; |
300 | PetscInt *ajtmpold,*ajtmp,nz,row; |
301 | PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; |
302 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
303 | MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; |
304 | MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; |
305 | MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; |
306 | MatScalar m13,m14,m15,m16; |
307 | MatScalar *ba = b->a,*aa = a->a; |
308 | PetscBool pivotinblocks = b->pivotinblocks; |
309 | PetscReal shift = info->shiftamount; |
310 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
311 | |
312 | PetscFunctionBegin; |
313 | allowzeropivot = PetscNot(A->erroriffailure); |
314 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); |
315 | |
316 | for (i=0; i<n; i++) { |
317 | nz = bi[i+1] - bi[i]; |
318 | ajtmp = bj + bi[i]; |
319 | for (j=0; j<nz; j++) { |
320 | x = rtmp+16*ajtmp[j]; |
321 | x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; |
322 | x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; |
323 | } |
324 | /* load in initial (unfactored) row */ |
325 | nz = ai[i+1] - ai[i]; |
326 | ajtmpold = aj + ai[i]; |
327 | v = aa + 16*ai[i]; |
328 | for (j=0; j<nz; j++) { |
329 | x = rtmp+16*ajtmpold[j]; |
330 | x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; |
331 | x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; |
332 | x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; |
333 | x[14] = v[14]; x[15] = v[15]; |
334 | v += 16; |
335 | } |
336 | row = *ajtmp++; |
337 | while (row < i) { |
338 | pc = rtmp + 16*row; |
339 | p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; |
340 | p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; |
341 | p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; |
342 | p15 = pc[14]; p16 = pc[15]; |
343 | if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || |
344 | p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || |
345 | p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 |
346 | || p16 != 0.0) { |
347 | pv = ba + 16*diag_offset[row]; |
348 | pj = bj + diag_offset[row] + 1; |
349 | x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; |
350 | x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; |
351 | x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; |
352 | x15 = pv[14]; x16 = pv[15]; |
353 | pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; |
354 | pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; |
355 | pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; |
356 | pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; |
357 | |
358 | pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; |
359 | pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; |
360 | pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; |
361 | pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; |
362 | |
363 | pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; |
364 | pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; |
365 | pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; |
366 | pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; |
367 | |
368 | pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; |
369 | pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; |
370 | pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; |
371 | pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; |
372 | nz = bi[row+1] - diag_offset[row] - 1; |
373 | pv += 16; |
374 | for (j=0; j<nz; j++) { |
375 | x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; |
376 | x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; |
377 | x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; |
378 | x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; |
379 | x = rtmp + 16*pj[j]; |
380 | x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; |
381 | x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; |
382 | x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; |
383 | x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; |
384 | |
385 | x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; |
386 | x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; |
387 | x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; |
388 | x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; |
389 | |
390 | x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; |
391 | x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; |
392 | x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; |
393 | x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; |
394 | |
395 | x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; |
396 | x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; |
397 | x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; |
398 | x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; |
399 | |
400 | pv += 16; |
401 | } |
402 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); |
403 | } |
404 | row = *ajtmp++; |
405 | } |
406 | /* finished row so stick it into b->a */ |
407 | pv = ba + 16*bi[i]; |
408 | pj = bj + bi[i]; |
409 | nz = bi[i+1] - bi[i]; |
410 | for (j=0; j<nz; j++) { |
411 | x = rtmp+16*pj[j]; |
412 | pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; |
413 | pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; |
414 | pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; |
415 | pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; |
416 | pv += 16; |
417 | } |
418 | /* invert diagonal block */ |
419 | w = ba + 16*diag_offset[i]; |
420 | if (pivotinblocks) { |
421 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); |
422 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
423 | } else { |
424 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); |
425 | } |
426 | } |
427 | |
428 | ierr = PetscFree(rtmp);CHKERRQ(ierr); |
429 | |
430 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; |
431 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; |
432 | C->assembled = PETSC_TRUE; |
433 | |
434 | ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ |
435 | PetscFunctionReturn(0); |
436 | } |
437 | |
438 | /* |
439 | MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - |
440 | copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() |
441 | */ |
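/* As in MatLUFactorNumeric_SeqBAIJ_4() above, L and U are stored separately: |
   bi/bj walk the L part forward, while bdiag decreases with i, so |
   bdiag[i]-bdiag[i+1]-1 counts the off-diagonal blocks of U(i,:) (see the |
   index arithmetic below). */ |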
442 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) |
443 | { |
444 | Mat C =B; |
445 | Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; |
446 | PetscErrorCode ierr; |
447 | PetscInt i,j,k,nz,nzL,row; |
448 | const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; |
449 | const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; |
450 | MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; |
451 | PetscInt flg; |
452 | PetscReal shift; |
453 | PetscBool allowzeropivot,zeropivotdetected; |
454 | |
455 | PetscFunctionBegin; |
456 | allowzeropivot = PetscNot(A->erroriffailure); |
457 | |
458 | /* generate work space needed by the factorization */ |
459 | ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); |
460 | ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); |
461 | |
462 | if (info->shifttype == MAT_SHIFT_NONE) { |
463 | shift = 0; |
464 | } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ |
465 | shift = info->shiftamount; |
466 | } |
467 | |
468 | for (i=0; i<n; i++) { |
469 | /* zero rtmp */ |
470 | /* L part */ |
471 | nz = bi[i+1] - bi[i]; |
472 | bjtmp = bj + bi[i]; |
473 | for (j=0; j<nz; j++) { |
474 | ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); |
475 | } |
476 | |
477 | /* U part */ |
478 | nz = bdiag[i] - bdiag[i+1]; |
479 | bjtmp = bj + bdiag[i+1]+1; |
480 | for (j=0; j<nz; j++) { |
481 | ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); |
482 | } |
483 | |
484 | /* load in initial (unfactored) row */ |
485 | nz = ai[i+1] - ai[i]; |
486 | ajtmp = aj + ai[i]; |
487 | v = aa + bs2*ai[i]; |
488 | for (j=0; j<nz; j++) { |
489 | ierr = PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);CHKERRQ(ierr); |
490 | } |
491 | |
492 | /* elimination */ |
493 | bjtmp = bj + bi[i]; |
494 | nzL = bi[i+1] - bi[i]; |
495 | for (k=0; k < nzL; k++) { |
496 | row = bjtmp[k]; |
497 | pc = rtmp + bs2*row; |
498 | for (flg=0,j=0; j<bs2; j++) { |
499 | if (pc[j]!=0.0) { |
500 | flg = 1; |
501 | break; |
502 | } |
503 | } |
504 | if (flg) { |
505 | pv = b->a + bs2*bdiag[row]; |
506 | /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ |
507 | ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); |
Value stored to 'ierr' is never read | |
508 | |
509 | pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ |
510 | pv = b->a + bs2*(bdiag[row+1]+1); |
511 | nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */ |
512 | for (j=0; j<nz; j++) { |
513 | /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ |
514 | /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ |
515 | v = rtmp + bs2*pj[j]; |
516 | ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); |
517 | pv += bs2; |
518 | } |
519 | ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */ |
520 | } |
521 | } |
522 | |
523 | /* finished row so stick it into b->a */ |
524 | /* L part */ |
525 | pv = b->a + bs2*bi[i]; |
526 | pj = b->j + bi[i]; |
527 | nz = bi[i+1] - bi[i]; |
528 | for (j=0; j<nz; j++) { |
529 | ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); |
530 | } |
531 | |
532 | /* Mark diagonal and invert diagonal for simpler triangular solves */ |
533 | pv = b->a + bs2*bdiag[i]; |
534 | pj = b->j + bdiag[i]; |
535 | ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); |
536 | ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); |
537 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
538 | |
539 | /* U part */ |
540 | pv = b->a + bs2*(bdiag[i+1]+1); |
541 | pj = b->j + bdiag[i+1]+1; |
542 | nz = bdiag[i] - bdiag[i+1] - 1; |
543 | for (j=0; j<nz; j++) { |
544 | ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); |
545 | } |
546 | } |
547 | ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); |
548 | |
549 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; |
550 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; |
551 | C->assembled = PETSC_TRUE; |
552 | |
553 | ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ |
554 | PetscFunctionReturn(0); |
555 | } |
556 | |
557 | #if defined(PETSC_HAVE_SSE) |
558 | |
559 | #include PETSC_HAVE_SSE |
560 | |
561 | /* SSE version for when blocks are 4 by 4, using natural ordering */ |
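/* Sketch of the SSE scheme below (single precision): a 4x4 block occupies four |
   XMM registers, one column each; LOADL/LOADH and STOREL/STOREH move the low and |
   high float pairs of a column, and XMM7 is zeroed once per row so the |
   block-vs-zero test reduces to packed compares plus a MOVEMASK into an int. */ |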
562 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) |
563 | { |
564 | Mat C = B; Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; /* C aliases the factor matrix B, as in the variants above */ |
565 | PetscErrorCode ierr; |
566 | int i,j,n = a->mbs; |
567 | int *bj = b->j,*bjtmp,*pj; |
568 | int row; |
569 | int *ajtmpold,nz,*bi=b->i; |
570 | int *diag_offset = b->diag,*ai=a->i,*aj=a->j; |
571 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
572 | MatScalar *ba = b->a,*aa = a->a; |
573 | int nonzero=0; |
574 | PetscBool pivotinblocks = b->pivotinblocks; |
575 | PetscReal shift = info->shiftamount; |
576 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
577 | |
578 | PetscFunctionBegin; |
579 | allowzeropivot = PetscNot(A->erroriffailure); |
580 | SSE_SCOPE_BEGIN; |
581 | |
582 | if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); |
583 | if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); |
584 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); |
585 | if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); |
586 | /* if ((unsigned long)bj==(unsigned long)aj) { */ |
587 | /* colscale = 4; */ |
588 | /* } */ |
589 | for (i=0; i<n; i++) { |
590 | nz = bi[i+1] - bi[i]; |
591 | bjtmp = bj + bi[i]; |
592 | /* zero out the 4x4 block accumulators */ |
593 | /* zero out one register */ |
594 | XOR_PS(XMM7,XMM7); |
595 | for (j=0; j<nz; j++) { |
596 | x = rtmp+16*bjtmp[j]; |
597 | /* x = rtmp+4*bjtmp[j]; */ |
598 | SSE_INLINE_BEGIN_1(x) |
599 | /* Copy zero register to memory locations */ |
600 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
601 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) |
602 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) |
603 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) |
604 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) |
605 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) |
606 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) |
607 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
608 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) |
609 | SSE_INLINE_END_1; |
610 | } |
611 | /* load in initial (unfactored) row */ |
612 | nz = ai[i+1] - ai[i]; |
613 | ajtmpold = aj + ai[i]; |
614 | v = aa + 16*ai[i]; |
615 | for (j=0; j<nz; j++) { |
616 | x = rtmp+16*ajtmpold[j]; |
617 | /* x = rtmp+colscale*ajtmpold[j]; */ |
618 | /* Copy v block into x block */ |
619 | SSE_INLINE_BEGIN_2(v,x) |
620 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
621 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
622 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) |
623 | |
624 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) |
625 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) |
626 | |
627 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) |
628 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) |
629 | |
630 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) |
631 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) |
632 | |
633 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) |
634 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) |
635 | |
636 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) |
637 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) |
638 | |
639 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) |
640 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) |
641 | |
642 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
643 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
644 | SSE_INLINE_END_2; |
645 | |
646 | v += 16; |
647 | } |
648 | /* row = (*bjtmp++)/4; */ |
649 | row = *bjtmp++; |
650 | while (row < i) { |
651 | pc = rtmp + 16*row; |
652 | SSE_INLINE_BEGIN_1(pc) |
653 | /* Load block from lower triangle */ |
654 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
655 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
656 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) |
657 | |
658 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) |
659 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) |
660 | |
661 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) |
662 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) |
663 | |
664 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) |
665 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) |
666 | |
667 | /* Compare block to zero block */ |
668 | |
669 | SSE_COPY_PS(XMM4,XMM7) |
670 | SSE_CMPNEQ_PS(XMM4,XMM0) |
671 | |
672 | SSE_COPY_PS(XMM5,XMM7) |
673 | SSE_CMPNEQ_PS(XMM5,XMM1) |
674 | |
675 | SSE_COPY_PS(XMM6,XMM7) |
676 | SSE_CMPNEQ_PS(XMM6,XMM2) |
677 | |
678 | SSE_CMPNEQ_PS(XMM7,XMM3) |
679 | |
680 | /* Reduce the comparisons to one SSE register */ |
681 | SSE_OR_PS(XMM6,XMM7) |
682 | SSE_OR_PS(XMM5,XMM4) |
683 | SSE_OR_PS(XMM5,XMM6) |
684 | SSE_INLINE_END_1; |
685 | |
686 | /* Reduce the one SSE register to an integer register for branching */ |
687 | /* Note: Since nonzero is an int, there is no INLINE block version of this call */ |
688 | MOVEMASK(nonzero,XMM5); |
689 | |
690 | /* If block is nonzero ... */ |
691 | if (nonzero) { |
692 | pv = ba + 16*diag_offset[row]; |
693 | PREFETCH_L1(&pv[16]); |
694 | pj = bj + diag_offset[row] + 1; |
695 | |
696 | /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ |
697 | /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ |
698 | /* but the diagonal was inverted already */ |
699 | /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ |
700 | |
701 | SSE_INLINE_BEGIN_2(pv,pc) |
702 | /* Column 0, product is accumulated in XMM4 */ |
703 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) |
704 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
705 | SSE_MULT_PS(XMM4,XMM0) |
706 | |
707 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) |
708 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
709 | SSE_MULT_PS(XMM5,XMM1) |
710 | SSE_ADD_PS(XMM4,XMM5) |
711 | |
712 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) |
713 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
714 | SSE_MULT_PS(XMM6,XMM2) |
715 | SSE_ADD_PS(XMM4,XMM6) |
716 | |
717 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) |
718 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
719 | SSE_MULT_PS(XMM7,XMM3) |
720 | SSE_ADD_PS(XMM4,XMM7) |
721 | |
722 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) |
723 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) |
724 | |
725 | /* Column 1, product is accumulated in XMM5 */ |
726 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) |
727 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
728 | SSE_MULT_PS(XMM5,XMM0) |
729 | |
730 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) |
731 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
732 | SSE_MULT_PS(XMM6,XMM1) |
733 | SSE_ADD_PS(XMM5,XMM6) |
734 | |
735 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) |
736 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
737 | SSE_MULT_PS(XMM7,XMM2) |
738 | SSE_ADD_PS(XMM5,XMM7) |
739 | |
740 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) |
741 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
742 | SSE_MULT_PS(XMM6,XMM3) |
743 | SSE_ADD_PS(XMM5,XMM6) |
744 | |
745 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) |
746 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) |
747 | |
748 | SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) |
749 | |
750 | /* Column 2, product is accumulated in XMM6 */ |
751 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) |
752 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
753 | SSE_MULT_PS(XMM6,XMM0) |
754 | |
755 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) |
756 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
757 | SSE_MULT_PS(XMM7,XMM1) |
758 | SSE_ADD_PS(XMM6,XMM7) |
759 | |
760 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) |
761 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
762 | SSE_MULT_PS(XMM7,XMM2) |
763 | SSE_ADD_PS(XMM6,XMM7) |
764 | |
765 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) |
766 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
767 | SSE_MULT_PS(XMM7,XMM3) |
768 | SSE_ADD_PS(XMM6,XMM7) |
769 | |
770 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) |
771 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
772 | |
773 | /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ |
774 | /* Column 3, product is accumulated in XMM0 */ |
775 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) |
776 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
777 | SSE_MULT_PS(XMM0,XMM7) |
778 | |
779 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) |
780 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
781 | SSE_MULT_PS(XMM1,XMM7) |
782 | SSE_ADD_PS(XMM0,XMM1) |
783 | |
784 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) |
785 | SSE_SHUFFLE(XMM1,XMM1,0x00) |
786 | SSE_MULT_PS(XMM1,XMM2) |
787 | SSE_ADD_PS(XMM0,XMM1) |
788 | |
789 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) |
790 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
791 | SSE_MULT_PS(XMM3,XMM7) |
792 | SSE_ADD_PS(XMM0,XMM3) |
793 | |
794 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) |
795 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
796 | |
797 | /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ |
798 | /* This is code to be maintained and read by humans after all. */ |
799 | /* Copy Multiplier Col 3 into XMM3 */ |
800 | SSE_COPY_PS(XMM3,XMM0) |
801 | /* Copy Multiplier Col 2 into XMM2 */ |
802 | SSE_COPY_PS(XMM2,XMM6) |
803 | /* Copy Multiplier Col 1 into XMM1 */ |
804 | SSE_COPY_PS(XMM1,XMM5) |
805 | /* Copy Multiplier Col 0 into XMM0 */ |
806 | SSE_COPY_PS(XMM0,XMM4) |
807 | SSE_INLINE_END_2; |
808 | |
809 | /* Update the row: */ |
810 | nz = bi[row+1] - diag_offset[row] - 1; |
811 | pv += 16; |
812 | for (j=0; j<nz; j++) { |
813 | PREFETCH_L1(&pv[16]); |
814 | x = rtmp + 16*pj[j]; |
815 | /* x = rtmp + 4*pj[j]; */ |
816 | |
817 | /* X:=X-M*PV, One column at a time */ |
818 | /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ |
819 | SSE_INLINE_BEGIN_2(x,pv) |
820 | /* Load First Column of X */
821 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
822 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
823 | |
824 | /* Matrix-Vector Product: */ |
825 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) |
826 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
827 | SSE_MULT_PS(XMM5,XMM0) |
828 | SSE_SUB_PS(XMM4,XMM5) |
829 | |
830 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) |
831 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
832 | SSE_MULT_PS(XMM6,XMM1) |
833 | SSE_SUB_PS(XMM4,XMM6) |
834 | |
835 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) |
836 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
837 | SSE_MULT_PS(XMM7,XMM2) |
838 | SSE_SUB_PS(XMM4,XMM7) |
839 | |
840 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) |
841 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
842 | SSE_MULT_PS(XMM5,XMM3) |
843 | SSE_SUB_PS(XMM4,XMM5) |
844 | |
845 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
846 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
847 | |
848 | /* Second Column */ |
849 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
850 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
851 | |
852 | /* Matrix-Vector Product: */ |
853 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) |
854 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
855 | SSE_MULT_PS(XMM6,XMM0) |
856 | SSE_SUB_PS(XMM5,XMM6) |
857 | |
858 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) |
859 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
860 | SSE_MULT_PS(XMM7,XMM1) |
861 | SSE_SUB_PS(XMM5,XMM7) |
862 | |
863 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) |
864 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
865 | SSE_MULT_PS(XMM4,XMM2) |
866 | SSE_SUB_PS(XMM5,XMM4) |
867 | |
868 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) |
869 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
870 | SSE_MULT_PS(XMM6,XMM3) |
871 | SSE_SUB_PS(XMM5,XMM6) |
872 | |
873 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
874 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
875 | |
876 | SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) |
877 | |
878 | /* Third Column */ |
879 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
880 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
881 | |
882 | /* Matrix-Vector Product: */ |
883 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) |
884 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
885 | SSE_MULT_PS(XMM7,XMM0) |
886 | SSE_SUB_PS(XMM6,XMM7) |
887 | |
888 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) |
889 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
890 | SSE_MULT_PS(XMM4,XMM1) |
891 | SSE_SUB_PS(XMM6,XMM4) |
892 | |
893 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) |
894 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
895 | SSE_MULT_PS(XMM5,XMM2) |
896 | SSE_SUB_PS(XMM6,XMM5) |
897 | |
898 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) |
899 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
900 | SSE_MULT_PS(XMM7,XMM3) |
901 | SSE_SUB_PS(XMM6,XMM7) |
902 | |
903 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
904 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
905 | |
906 | /* Fourth Column */ |
907 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
908 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
909 | |
910 | /* Matrix-Vector Product: */ |
911 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) |
912 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
913 | SSE_MULT_PS(XMM5,XMM0) |
914 | SSE_SUB_PS(XMM4,XMM5) |
915 | |
916 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) |
917 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
918 | SSE_MULT_PS(XMM6,XMM1) |
919 | SSE_SUB_PS(XMM4,XMM6) |
920 | |
921 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) |
922 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
923 | SSE_MULT_PS(XMM7,XMM2) |
924 | SSE_SUB_PS(XMM4,XMM7) |
925 | |
926 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) |
927 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
928 | SSE_MULT_PS(XMM5,XMM3) |
929 | SSE_SUB_PS(XMM4,XMM5) |
930 | |
931 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
932 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
933 | SSE_INLINE_END_2; |
934 | pv += 16; |
935 | } |
936 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
937 | } |
938 | row = *bjtmp++; |
939 | /* row = (*bjtmp++)/4; */ |
940 | } |
941 | /* finished row so stick it into b->a */ |
942 | pv = ba + 16*bi[i]; |
943 | pj = bj + bi[i]; |
944 | nz = bi[i+1] - bi[i]; |
945 | |
946 | /* Copy x block back into pv block */ |
947 | for (j=0; j<nz; j++) { |
948 | x = rtmp+16*pj[j]; |
949 | /* x = rtmp+4*pj[j]; */ |
950 | |
951 | SSE_INLINE_BEGIN_2(x,pv) |
952 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
953 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) |
954 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) |
955 | |
956 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) |
957 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) |
958 | |
959 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) |
960 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) |
961 | |
962 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) |
963 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) |
964 | |
965 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) |
966 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) |
967 | |
968 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
969 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
970 | |
971 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
972 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) |
973 | |
974 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
975 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
976 | SSE_INLINE_END_2; |
977 | pv += 16; |
978 | } |
979 | /* invert diagonal block */ |
980 | w = ba + 16*diag_offset[i]; |
981 | if (pivotinblocks) { |
982 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
983 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
984 | } else { |
985 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
986 | } |
987 | /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ |
988 | /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
989 | } |
990 | |
991 | ierr = PetscFree(rtmp);CHKERRQ(ierr);
992 | |
993 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; |
994 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; |
995 | C->assembled = PETSC_TRUE; |
996 | |
997 | ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for 4x4 blocks */
998 | /* Flop Count from inverting diagonal blocks */ |
999 | SSE_SCOPE_END; |
1000 | PetscFunctionReturn(0);
1001 | } |
1002 | |
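/*
   A minimal scalar sketch (not part of the PETSc source) of the multiplier
   formation the SSE macro blocks above implement: with 4x4 blocks stored
   column-major (entry (r,c) at [4*c+r]), pc holds L_ij and pv holds the
   already-inverted diagonal block inv(L_jj), and the update is
   pc := pc * inv(L_jj), built one column of the product at a time.  The
   helper name Sketch_MultiplierFormation4 and the use of plain float
   (matching the single-precision SSE kernels) are illustrative assumptions.
*/
static void Sketch_MultiplierFormation4(float pc[16],const float pv[16])
{
  float m[16];
  int   r,c,k;
  for (c=0; c<4; c++) {                          /* column c of the product */
    for (r=0; r<4; r++) {
      m[4*c+r] = 0.0f;
      for (k=0; k<4; k++) m[4*c+r] += pv[4*c+k]*pc[4*k+r];
    }
  }
  /* the SSE version can overwrite pc in place because the old columns stay
     in XMM0-XMM3; a scalar version needs the temporary m */
  for (k=0; k<16; k++) pc[k] = m[k];
}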
1003 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1004 | {
1005 | Mat A = C;
1006 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; |
1007 | PetscErrorCode ierr; |
1008 | int i,j,n = a->mbs; |
1009 | unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; |
1010 | unsigned short *aj = (unsigned short*)(a->j),*ajtmp; |
1011 | unsigned int row; |
1012 | int nz,*bi=b->i; |
1013 | int *diag_offset = b->diag,*ai=a->i; |
1014 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
1015 | MatScalar *ba = b->a,*aa = a->a; |
1016 | int nonzero=0; |
1017 | PetscBool pivotinblocks = b->pivotinblocks; |
1018 | PetscReal shift = info->shiftamount; |
1019 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
1020 | |
1021 | PetscFunctionBegin;
1022 | allowzeropivot = PetscNot(A->erroriffailure);
1023 | SSE_SCOPE_BEGIN; |
1024 | |
1025 | if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1026 | if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1027 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1028 | if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1029 | /* if ((unsigned long)bj==(unsigned long)aj) { */ |
1030 | /* colscale = 4; */ |
1031 | /* } */ |
1032 | |
1033 | for (i=0; i<n; i++) { |
1034 | nz = bi[i+1] - bi[i]; |
1035 | bjtmp = bj + bi[i]; |
1036 | /* zero out the 4x4 block accumulators */ |
1037 | /* zero out one register */ |
1038 | XOR_PS(XMM7,XMM7); |
1039 | for (j=0; j<nz; j++) { |
1040 | x = rtmp+16*((unsigned int)bjtmp[j]); |
1041 | /* x = rtmp+4*bjtmp[j]; */ |
1042 | SSE_INLINE_BEGIN_1(x) |
1043 | /* Copy zero register to memory locations */ |
1044 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1045 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) |
1046 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) |
1047 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) |
1048 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) |
1049 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) |
1050 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) |
1051 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
1052 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) |
1053 | SSE_INLINE_END_1; |
1054 | } |
1055 | /* load in initial (unfactored row) */ |
1056 | nz = ai[i+1] - ai[i]; |
1057 | ajtmp = aj + ai[i]; |
1058 | v = aa + 16*ai[i]; |
1059 | for (j=0; j<nz; j++) { |
1060 | x = rtmp+16*((unsigned int)ajtmp[j]); |
1061 | /* x = rtmp+colscale*ajtmp[j]; */ |
1062 | /* Copy v block into x block */ |
1063 | SSE_INLINE_BEGIN_2(v,x) |
1064 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1065 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
1066 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) |
1067 | |
1068 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) |
1069 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) |
1070 | |
1071 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) |
1072 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) |
1073 | |
1074 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) |
1075 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) |
1076 | |
1077 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) |
1078 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) |
1079 | |
1080 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) |
1081 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) |
1082 | |
1083 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) |
1084 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) |
1085 | |
1086 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
1087 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1088 | SSE_INLINE_END_2; |
1089 | |
1090 | v += 16; |
1091 | } |
1092 | /* row = (*bjtmp++)/4; */ |
1093 | row = (unsigned int)(*bjtmp++); |
1094 | while (row < i) { |
1095 | pc = rtmp + 16*row; |
1096 | SSE_INLINE_BEGIN_1(pc) |
1097 | /* Load block from lower triangle */ |
1098 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1099 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
1100 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) |
1101 | |
1102 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) |
1103 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) |
1104 | |
1105 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) |
1106 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) |
1107 | |
1108 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) |
1109 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) |
1110 | |
1111 | /* Compare block to zero block */ |
1112 | |
1113 | SSE_COPY_PS(XMM4,XMM7) |
1114 | SSE_CMPNEQ_PS(XMM4,XMM0) |
1115 | |
1116 | SSE_COPY_PS(XMM5,XMM7) |
1117 | SSE_CMPNEQ_PS(XMM5,XMM1) |
1118 | |
1119 | SSE_COPY_PS(XMM6,XMM7) |
1120 | SSE_CMPNEQ_PS(XMM6,XMM2) |
1121 | |
1122 | SSE_CMPNEQ_PS(XMM7,XMM3) |
1123 | |
1124 | /* Reduce the comparisons to one SSE register */ |
1125 | SSE_OR_PS(XMM6,XMM7) |
1126 | SSE_OR_PS(XMM5,XMM4) |
1127 | SSE_OR_PS(XMM5,XMM6) |
1128 | SSE_INLINE_END_1; |
1129 | |
1130 | /* Reduce the one SSE register to an integer register for branching */ |
1131 | /* Note: Since nonzero is an int, there is no INLINE block version of this call */ |
1132 | MOVEMASK(nonzero,XMM5); |
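/* MOVEMASK (presumably wrapping movmskps/_mm_movemask_ps) gathers the sign
   bit of each of the four packed floats in XMM5 into the low four bits of
   nonzero; after the CMPNEQ/OR reduction above, a lane is all-ones exactly
   when some entry in that row of the block differed from zero, so
   nonzero == 0 means the whole 4x4 block is zero and the update is skipped. */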
1133 | |
1134 | /* If block is nonzero ... */ |
1135 | if (nonzero) { |
1136 | pv = ba + 16*diag_offset[row]; |
1137 | PREFETCH_L1(&pv[16]); |
1138 | pj = bj + diag_offset[row] + 1; |
1139 | |
1140 | /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ |
1141 | /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ |
1142 | /* but the diagonal was inverted already */ |
1143 | /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ |
1144 | |
1145 | SSE_INLINE_BEGIN_2(pv,pc) |
1146 | /* Column 0, product is accumulated in XMM4 */ |
1147 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) |
1148 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1149 | SSE_MULT_PS(XMM4,XMM0) |
1150 | |
1151 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) |
1152 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1153 | SSE_MULT_PS(XMM5,XMM1) |
1154 | SSE_ADD_PS(XMM4,XMM5) |
1155 | |
1156 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) |
1157 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1158 | SSE_MULT_PS(XMM6,XMM2) |
1159 | SSE_ADD_PS(XMM4,XMM6) |
1160 | |
1161 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) |
1162 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1163 | SSE_MULT_PS(XMM7,XMM3) |
1164 | SSE_ADD_PS(XMM4,XMM7) |
1165 | |
1166 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) |
1167 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) |
1168 | |
1169 | /* Column 1, product is accumulated in XMM5 */ |
1170 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) |
1171 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1172 | SSE_MULT_PS(XMM5,XMM0) |
1173 | |
1174 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) |
1175 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1176 | SSE_MULT_PS(XMM6,XMM1) |
1177 | SSE_ADD_PS(XMM5,XMM6) |
1178 | |
1179 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) |
1180 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1181 | SSE_MULT_PS(XMM7,XMM2) |
1182 | SSE_ADD_PS(XMM5,XMM7) |
1183 | |
1184 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) |
1185 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1186 | SSE_MULT_PS(XMM6,XMM3) |
1187 | SSE_ADD_PS(XMM5,XMM6) |
1188 | |
1189 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) |
1190 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) |
1191 | |
1192 | SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) |
1193 | |
1194 | /* Column 2, product is accumulated in XMM6 */ |
1195 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) |
1196 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1197 | SSE_MULT_PS(XMM6,XMM0) |
1198 | |
1199 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) |
1200 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1201 | SSE_MULT_PS(XMM7,XMM1) |
1202 | SSE_ADD_PS(XMM6,XMM7) |
1203 | |
1204 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) |
1205 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1206 | SSE_MULT_PS(XMM7,XMM2) |
1207 | SSE_ADD_PS(XMM6,XMM7) |
1208 | |
1209 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) |
1210 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1211 | SSE_MULT_PS(XMM7,XMM3) |
1212 | SSE_ADD_PS(XMM6,XMM7) |
1213 | |
1214 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) |
1215 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
1216 | |
1217 | /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ |
1218 | /* Column 3, product is accumulated in XMM0 */ |
1219 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) |
1220 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1221 | SSE_MULT_PS(XMM0,XMM7) |
1222 | |
1223 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) |
1224 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1225 | SSE_MULT_PS(XMM1,XMM7) |
1226 | SSE_ADD_PS(XMM0,XMM1) |
1227 | |
1228 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) |
1229 | SSE_SHUFFLE(XMM1,XMM1,0x00) |
1230 | SSE_MULT_PS(XMM1,XMM2) |
1231 | SSE_ADD_PS(XMM0,XMM1) |
1232 | |
1233 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) |
1234 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1235 | SSE_MULT_PS(XMM3,XMM7) |
1236 | SSE_ADD_PS(XMM0,XMM3) |
1237 | |
1238 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) |
1239 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1240 | |
1241 | /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ |
1242 | /* This is code to be maintained and read by humans after all. */
1243 | /* Copy Multiplier Col 3 into XMM3 */ |
1244 | SSE_COPY_PS(XMM3,XMM0) |
1245 | /* Copy Multiplier Col 2 into XMM2 */ |
1246 | SSE_COPY_PS(XMM2,XMM6) |
1247 | /* Copy Multiplier Col 1 into XMM1 */ |
1248 | SSE_COPY_PS(XMM1,XMM5) |
1249 | /* Copy Multiplier Col 0 into XMM0 */ |
1250 | SSE_COPY_PS(XMM0,XMM4) |
1251 | SSE_INLINE_END_2; |
1252 | |
1253 | /* Update the row: */ |
1254 | nz = bi[row+1] - diag_offset[row] - 1; |
1255 | pv += 16; |
1256 | for (j=0; j<nz; j++) { |
1257 | PREFETCH_L1(&pv[16]); |
1258 | x = rtmp + 16*((unsigned int)pj[j]); |
1259 | /* x = rtmp + 4*pj[j]; */ |
1260 | |
1261 | /* X:=X-M*PV, One column at a time */ |
1262 | /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ |
1263 | SSE_INLINE_BEGIN_2(x,pv) |
1264 | /* Load First Column of X */
1265 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
1266 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
1267 | |
1268 | /* Matrix-Vector Product: */ |
1269 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) |
1270 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1271 | SSE_MULT_PS(XMM5,XMM0) |
1272 | SSE_SUB_PS(XMM4,XMM5) |
1273 | |
1274 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) |
1275 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1276 | SSE_MULT_PS(XMM6,XMM1) |
1277 | SSE_SUB_PS(XMM4,XMM6) |
1278 | |
1279 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) |
1280 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1281 | SSE_MULT_PS(XMM7,XMM2) |
1282 | SSE_SUB_PS(XMM4,XMM7) |
1283 | |
1284 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) |
1285 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1286 | SSE_MULT_PS(XMM5,XMM3) |
1287 | SSE_SUB_PS(XMM4,XMM5) |
1288 | |
1289 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
1290 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
1291 | |
1292 | /* Second Column */ |
1293 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
1294 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
1295 | |
1296 | /* Matrix-Vector Product: */ |
1297 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) |
1298 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1299 | SSE_MULT_PS(XMM6,XMM0) |
1300 | SSE_SUB_PS(XMM5,XMM6) |
1301 | |
1302 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) |
1303 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1304 | SSE_MULT_PS(XMM7,XMM1) |
1305 | SSE_SUB_PS(XMM5,XMM7) |
1306 | |
1307 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) |
1308 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1309 | SSE_MULT_PS(XMM4,XMM2) |
1310 | SSE_SUB_PS(XMM5,XMM4) |
1311 | |
1312 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) |
1313 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1314 | SSE_MULT_PS(XMM6,XMM3) |
1315 | SSE_SUB_PS(XMM5,XMM6) |
1316 | |
1317 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
1318 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
1319 | |
1320 | SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) |
1321 | |
1322 | /* Third Column */ |
1323 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
1324 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1325 | |
1326 | /* Matrix-Vector Product: */ |
1327 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) |
1328 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1329 | SSE_MULT_PS(XMM7,XMM0) |
1330 | SSE_SUB_PS(XMM6,XMM7) |
1331 | |
1332 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) |
1333 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1334 | SSE_MULT_PS(XMM4,XMM1) |
1335 | SSE_SUB_PS(XMM6,XMM4) |
1336 | |
1337 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) |
1338 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1339 | SSE_MULT_PS(XMM5,XMM2) |
1340 | SSE_SUB_PS(XMM6,XMM5) |
1341 | |
1342 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) |
1343 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1344 | SSE_MULT_PS(XMM7,XMM3) |
1345 | SSE_SUB_PS(XMM6,XMM7) |
1346 | |
1347 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
1348 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1349 | |
1350 | /* Fourth Column */ |
1351 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
1352 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
1353 | |
1354 | /* Matrix-Vector Product: */ |
1355 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) |
1356 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1357 | SSE_MULT_PS(XMM5,XMM0) |
1358 | SSE_SUB_PS(XMM4,XMM5) |
1359 | |
1360 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) |
1361 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1362 | SSE_MULT_PS(XMM6,XMM1) |
1363 | SSE_SUB_PS(XMM4,XMM6) |
1364 | |
1365 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) |
1366 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1367 | SSE_MULT_PS(XMM7,XMM2) |
1368 | SSE_SUB_PS(XMM4,XMM7) |
1369 | |
1370 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) |
1371 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1372 | SSE_MULT_PS(XMM5,XMM3) |
1373 | SSE_SUB_PS(XMM4,XMM5) |
1374 | |
1375 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
1376 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
1377 | SSE_INLINE_END_2; |
1378 | pv += 16; |
1379 | } |
1380 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1381 | } |
1382 | row = (unsigned int)(*bjtmp++); |
1383 | /* row = (*bjtmp++)/4; */ |
1384 | /* bjtmp++; */ |
1385 | } |
1386 | /* finished row so stick it into b->a */ |
1387 | pv = ba + 16*bi[i]; |
1388 | pj = bj + bi[i]; |
1389 | nz = bi[i+1] - bi[i]; |
1390 | |
1391 | /* Copy x block back into pv block */ |
1392 | for (j=0; j<nz; j++) { |
1393 | x = rtmp+16*((unsigned int)pj[j]); |
1394 | /* x = rtmp+4*pj[j]; */ |
1395 | |
1396 | SSE_INLINE_BEGIN_2(x,pv) |
1397 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1398 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) |
1399 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) |
1400 | |
1401 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) |
1402 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) |
1403 | |
1404 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) |
1405 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) |
1406 | |
1407 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) |
1408 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) |
1409 | |
1410 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) |
1411 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) |
1412 | |
1413 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1414 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
1415 | |
1416 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
1417 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) |
1418 | |
1419 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
1420 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1421 | SSE_INLINE_END_2; |
1422 | pv += 16; |
1423 | } |
1424 | /* invert diagonal block */ |
1425 | w = ba + 16*diag_offset[i]; |
1426 | if (pivotinblocks) { |
1427 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1428 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
1429 | } else { |
1430 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1431 | } |
1432 | /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1433 | } |
1434 | |
1435 | ierr = PetscFree(rtmp);CHKERRQ(ierr);
1436 | |
1437 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; |
1438 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; |
1439 | C->assembled = PETSC_TRUE; |
1440 | |
1441 | ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for 4x4 blocks */
1442 | /* Flop Count from inverting diagonal blocks */ |
1443 | SSE_SCOPE_END; |
1444 | PetscFunctionReturn(0);
1445 | } |
1446 | |
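/*
   A minimal sketch (not in the source) of the precondition the _usj variants
   rely on: b->j is read through an unsigned short pointer, which is only
   sound if the block-column indices were repacked to 16 bits (presumably by
   the matching symbolic factorization), i.e. if every index 0..mbs-1 fits in
   an unsigned short.  The helper name Sketch_UsjIndicesFit is hypothetical.
*/
static int Sketch_UsjIndicesFit(int mbs)
{
  /* largest stored block index is mbs-1; it must fit in an unsigned short
     (USHRT_MAX == 65535 on the targets this SSE code assumes) */
  return mbs > 0 && mbs-1 <= 65535;
}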
1447 | PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) |
1448 | { |
1449 | Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; |
1450 | PetscErrorCode ierr; |
1451 | int i,j,n = a->mbs; |
1452 | unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; |
1453 | unsigned int row; |
1454 | int *ajtmpold,nz,*bi=b->i; |
1455 | int *diag_offset = b->diag,*ai=a->i,*aj=a->j; |
1456 | MatScalar *pv,*v,*rtmp,*pc,*w,*x; |
1457 | MatScalar *ba = b->a,*aa = a->a; |
1458 | int nonzero=0; |
1459 | PetscBool pivotinblocks = b->pivotinblocks; |
1460 | PetscReal shift = info->shiftamount; |
1461 | PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; |
1462 | |
1463 | PetscFunctionBegin;
1464 | allowzeropivot = PetscNot(A->erroriffailure);
1465 | SSE_SCOPE_BEGIN; |
1466 | |
1467 | if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1468 | if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1469 | ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1470 | if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1471 | /* if ((unsigned long)bj==(unsigned long)aj) { */ |
1472 | /* colscale = 4; */ |
1473 | /* } */ |
1474 | if ((unsigned long)bj==(unsigned long)aj) { |
1475 | return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1476 | } |
1477 | |
1478 | for (i=0; i<n; i++) { |
1479 | nz = bi[i+1] - bi[i]; |
1480 | bjtmp = bj + bi[i]; |
1481 | /* zero out the 4x4 block accumulators */ |
1482 | /* zero out one register */ |
1483 | XOR_PS(XMM7,XMM7); |
1484 | for (j=0; j<nz; j++) { |
1485 | x = rtmp+16*((unsigned int)bjtmp[j]); |
1486 | /* x = rtmp+4*bjtmp[j]; */ |
1487 | SSE_INLINE_BEGIN_1(x) |
1488 | /* Copy zero register to memory locations */ |
1489 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1490 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) |
1491 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) |
1492 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) |
1493 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) |
1494 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) |
1495 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) |
1496 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
1497 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) |
1498 | SSE_INLINE_END_1; |
1499 | } |
1500 | /* load in initial (unfactored row) */ |
1501 | nz = ai[i+1] - ai[i]; |
1502 | ajtmpold = aj + ai[i]; |
1503 | v = aa + 16*ai[i]; |
1504 | for (j=0; j<nz; j++) { |
1505 | x = rtmp+16*ajtmpold[j]; |
1506 | /* x = rtmp+colscale*ajtmpold[j]; */ |
1507 | /* Copy v block into x block */ |
1508 | SSE_INLINE_BEGIN_2(v,x) |
1509 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1510 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
1511 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) |
1512 | |
1513 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) |
1514 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) |
1515 | |
1516 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) |
1517 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) |
1518 | |
1519 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) |
1520 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) |
1521 | |
1522 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) |
1523 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) |
1524 | |
1525 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) |
1526 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) |
1527 | |
1528 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) |
1529 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) |
1530 | |
1531 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
1532 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1533 | SSE_INLINE_END_2; |
1534 | |
1535 | v += 16; |
1536 | } |
1537 | /* row = (*bjtmp++)/4; */ |
1538 | row = (unsigned int)(*bjtmp++); |
1539 | while (row < i) { |
1540 | pc = rtmp + 16*row; |
1541 | SSE_INLINE_BEGIN_1(pc) |
1542 | /* Load block from lower triangle */ |
1543 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1544 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) |
1545 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) |
1546 | |
1547 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) |
1548 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) |
1549 | |
1550 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) |
1551 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) |
1552 | |
1553 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) |
1554 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) |
1555 | |
1556 | /* Compare block to zero block */ |
1557 | |
1558 | SSE_COPY_PS(XMM4,XMM7) |
1559 | SSE_CMPNEQ_PS(XMM4,XMM0) |
1560 | |
1561 | SSE_COPY_PS(XMM5,XMM7) |
1562 | SSE_CMPNEQ_PS(XMM5,XMM1) |
1563 | |
1564 | SSE_COPY_PS(XMM6,XMM7) |
1565 | SSE_CMPNEQ_PS(XMM6,XMM2) |
1566 | |
1567 | SSE_CMPNEQ_PS(XMM7,XMM3) |
1568 | |
1569 | /* Reduce the comparisons to one SSE register */ |
1570 | SSE_OR_PS(XMM6,XMM7) |
1571 | SSE_OR_PS(XMM5,XMM4) |
1572 | SSE_OR_PS(XMM5,XMM6) |
1573 | SSE_INLINE_END_1; |
1574 | |
1575 | /* Reduce the one SSE register to an integer register for branching */ |
1576 | /* Note: Since nonzero is an int, there is no INLINE block version of this call */ |
1577 | MOVEMASK(nonzero,XMM5); |
1578 | |
1579 | /* If block is nonzero ... */ |
1580 | if (nonzero) { |
1581 | pv = ba + 16*diag_offset[row]; |
1582 | PREFETCH_L1(&pv[16]); |
1583 | pj = bj + diag_offset[row] + 1; |
1584 | |
1585 | /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ |
1586 | /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ |
1587 | /* but the diagonal was inverted already */ |
1588 | /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ |
1589 | |
1590 | SSE_INLINE_BEGIN_2(pv,pc) |
1591 | /* Column 0, product is accumulated in XMM4 */ |
1592 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) |
1593 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1594 | SSE_MULT_PS(XMM4,XMM0) |
1595 | |
1596 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) |
1597 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1598 | SSE_MULT_PS(XMM5,XMM1) |
1599 | SSE_ADD_PS(XMM4,XMM5) |
1600 | |
1601 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) |
1602 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1603 | SSE_MULT_PS(XMM6,XMM2) |
1604 | SSE_ADD_PS(XMM4,XMM6) |
1605 | |
1606 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) |
1607 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1608 | SSE_MULT_PS(XMM7,XMM3) |
1609 | SSE_ADD_PS(XMM4,XMM7) |
1610 | |
1611 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) |
1612 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) |
1613 | |
1614 | /* Column 1, product is accumulated in XMM5 */ |
1615 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) |
1616 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1617 | SSE_MULT_PS(XMM5,XMM0) |
1618 | |
1619 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) |
1620 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1621 | SSE_MULT_PS(XMM6,XMM1) |
1622 | SSE_ADD_PS(XMM5,XMM6) |
1623 | |
1624 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) |
1625 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1626 | SSE_MULT_PS(XMM7,XMM2) |
1627 | SSE_ADD_PS(XMM5,XMM7) |
1628 | |
1629 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) |
1630 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1631 | SSE_MULT_PS(XMM6,XMM3) |
1632 | SSE_ADD_PS(XMM5,XMM6) |
1633 | |
1634 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) |
1635 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) |
1636 | |
1637 | SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) |
1638 | |
1639 | /* Column 2, product is accumulated in XMM6 */ |
1640 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) |
1641 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1642 | SSE_MULT_PS(XMM6,XMM0) |
1643 | |
1644 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) |
1645 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1646 | SSE_MULT_PS(XMM7,XMM1) |
1647 | SSE_ADD_PS(XMM6,XMM7) |
1648 | |
1649 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) |
1650 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1651 | SSE_MULT_PS(XMM7,XMM2) |
1652 | SSE_ADD_PS(XMM6,XMM7) |
1653 | |
1654 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) |
1655 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1656 | SSE_MULT_PS(XMM7,XMM3) |
1657 | SSE_ADD_PS(XMM6,XMM7) |
1658 | |
1659 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) |
1660 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
1661 | |
1662 | /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ |
1663 | /* Column 3, product is accumulated in XMM0 */ |
1664 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) |
1665 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1666 | SSE_MULT_PS(XMM0,XMM7) |
1667 | |
1668 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) |
1669 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1670 | SSE_MULT_PS(XMM1,XMM7) |
1671 | SSE_ADD_PS(XMM0,XMM1) |
1672 | |
1673 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) |
1674 | SSE_SHUFFLE(XMM1,XMM1,0x00) |
1675 | SSE_MULT_PS(XMM1,XMM2) |
1676 | SSE_ADD_PS(XMM0,XMM1) |
1677 | |
1678 | SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) |
1679 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1680 | SSE_MULT_PS(XMM3,XMM7) |
1681 | SSE_ADD_PS(XMM0,XMM3) |
1682 | |
1683 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) |
1684 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1685 | |
1686 | /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ |
1687 | /* This is code to be maintained and read by humans after all. */
1688 | /* Copy Multiplier Col 3 into XMM3 */ |
1689 | SSE_COPY_PS(XMM3,XMM0) |
1690 | /* Copy Multiplier Col 2 into XMM2 */ |
1691 | SSE_COPY_PS(XMM2,XMM6) |
1692 | /* Copy Multiplier Col 1 into XMM1 */ |
1693 | SSE_COPY_PS(XMM1,XMM5) |
1694 | /* Copy Multiplier Col 0 into XMM0 */ |
1695 | SSE_COPY_PS(XMM0,XMM4) |
1696 | SSE_INLINE_END_2; |
1697 | |
1698 | /* Update the row: */ |
1699 | nz = bi[row+1] - diag_offset[row] - 1; |
1700 | pv += 16; |
1701 | for (j=0; j<nz; j++) { |
1702 | PREFETCH_L1(&pv[16]); |
1703 | x = rtmp + 16*((unsigned int)pj[j]); |
1704 | /* x = rtmp + 4*pj[j]; */ |
1705 | |
1706 | /* X:=X-M*PV, One column at a time */ |
1707 | /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ |
1708 | SSE_INLINE_BEGIN_2(x,pv) |
1709 | /* Load First Column of X */
1710 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
1711 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
1712 | |
1713 | /* Matrix-Vector Product: */ |
1714 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) |
1715 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1716 | SSE_MULT_PS(XMM5,XMM0) |
1717 | SSE_SUB_PS(XMM4,XMM5) |
1718 | |
1719 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) |
1720 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1721 | SSE_MULT_PS(XMM6,XMM1) |
1722 | SSE_SUB_PS(XMM4,XMM6) |
1723 | |
1724 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) |
1725 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1726 | SSE_MULT_PS(XMM7,XMM2) |
1727 | SSE_SUB_PS(XMM4,XMM7) |
1728 | |
1729 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) |
1730 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1731 | SSE_MULT_PS(XMM5,XMM3) |
1732 | SSE_SUB_PS(XMM4,XMM5) |
1733 | |
1734 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) |
1735 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) |
1736 | |
1737 | /* Second Column */ |
1738 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
1739 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
1740 | |
1741 | /* Matrix-Vector Product: */ |
1742 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) |
1743 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1744 | SSE_MULT_PS(XMM6,XMM0) |
1745 | SSE_SUB_PS(XMM5,XMM6) |
1746 | |
1747 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) |
1748 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1749 | SSE_MULT_PS(XMM7,XMM1) |
1750 | SSE_SUB_PS(XMM5,XMM7) |
1751 | |
1752 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) |
1753 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1754 | SSE_MULT_PS(XMM4,XMM2) |
1755 | SSE_SUB_PS(XMM5,XMM4) |
1756 | |
1757 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) |
1758 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1759 | SSE_MULT_PS(XMM6,XMM3) |
1760 | SSE_SUB_PS(XMM5,XMM6) |
1761 | |
1762 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) |
1763 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) |
1764 | |
1765 | SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) |
1766 | |
1767 | /* Third Column */ |
1768 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
1769 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1770 | |
1771 | /* Matrix-Vector Product: */ |
1772 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) |
1773 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1774 | SSE_MULT_PS(XMM7,XMM0) |
1775 | SSE_SUB_PS(XMM6,XMM7) |
1776 | |
1777 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) |
1778 | SSE_SHUFFLE(XMM4,XMM4,0x00) |
1779 | SSE_MULT_PS(XMM4,XMM1) |
1780 | SSE_SUB_PS(XMM6,XMM4) |
1781 | |
1782 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) |
1783 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1784 | SSE_MULT_PS(XMM5,XMM2) |
1785 | SSE_SUB_PS(XMM6,XMM5) |
1786 | |
1787 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) |
1788 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1789 | SSE_MULT_PS(XMM7,XMM3) |
1790 | SSE_SUB_PS(XMM6,XMM7) |
1791 | |
1792 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) |
1793 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1794 | |
1795 | /* Fourth Column */ |
1796 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
1797 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
1798 | |
1799 | /* Matrix-Vector Product: */ |
1800 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) |
1801 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1802 | SSE_MULT_PS(XMM5,XMM0) |
1803 | SSE_SUB_PS(XMM4,XMM5) |
1804 | |
1805 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) |
1806 | SSE_SHUFFLE(XMM6,XMM6,0x00) |
1807 | SSE_MULT_PS(XMM6,XMM1) |
1808 | SSE_SUB_PS(XMM4,XMM6) |
1809 | |
1810 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) |
1811 | SSE_SHUFFLE(XMM7,XMM7,0x00) |
1812 | SSE_MULT_PS(XMM7,XMM2) |
1813 | SSE_SUB_PS(XMM4,XMM7) |
1814 | |
1815 | SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) |
1816 | SSE_SHUFFLE(XMM5,XMM5,0x00) |
1817 | SSE_MULT_PS(XMM5,XMM3) |
1818 | SSE_SUB_PS(XMM4,XMM5) |
1819 | |
1820 | SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) |
1821 | SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) |
1822 | SSE_INLINE_END_2; |
1823 | pv += 16; |
1824 | } |
1825 | ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1826 | } |
1827 | row = (unsigned int)(*bjtmp++); |
1828 | /* row = (*bjtmp++)/4; */ |
1829 | /* bjtmp++; */ |
1830 | } |
1831 | /* finished row so stick it into b->a */ |
1832 | pv = ba + 16*bi[i]; |
1833 | pj = bj + bi[i]; |
1834 | nz = bi[i+1] - bi[i]; |
1835 | |
1836 | /* Copy x block back into pv block */ |
1837 | for (j=0; j<nz; j++) { |
1838 | x = rtmp+16*((unsigned int)pj[j]); |
1839 | /* x = rtmp+4*pj[j]; */ |
1840 | |
1841 | SSE_INLINE_BEGIN_2(x,pv) |
1842 | /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ |
1843 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) |
1844 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) |
1845 | |
1846 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) |
1847 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) |
1848 | |
1849 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) |
1850 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) |
1851 | |
1852 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) |
1853 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) |
1854 | |
1855 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) |
1856 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) |
1857 | |
1858 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) |
1859 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) |
1860 | |
1861 | SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) |
1862 | SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) |
1863 | |
1864 | SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) |
1865 | SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) |
1866 | SSE_INLINE_END_2; |
1867 | pv += 16; |
1868 | } |
1869 | /* invert diagonal block */ |
1870 | w = ba + 16*diag_offset[i]; |
1871 | if (pivotinblocks) { |
1872 | ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1873 | if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; |
1874 | } else { |
1875 | ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1876 | } |
1877 | /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ |
1878 | /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1879 | } |
1880 | |
1881 | ierr = PetscFree(rtmp);CHKERRQ(ierr);
1882 | |
1883 | C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; |
1884 | C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; |
1885 | C->assembled = PETSC_TRUE; |
1886 | |
1887 | ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for 4x4 blocks */
1888 | /* Flop Count from inverting diagonal blocks */ |
1889 | SSE_SCOPE_END; |
1890 | PetscFunctionReturn(0);
1891 | } |
1892 | |
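/*
   A readable scalar sketch (not part of the source) of what the
   PetscKernel_A_gets_inverse_A_4_nopivot kernel used above does: an in-place
   Gauss-Jordan inversion of a column-major 4x4 block with no row exchanges,
   so it is only safe when every pivot it meets is nonzero; the pivotinblocks
   branch therefore calls the checked PetscKernel_A_gets_inverse_A_4 instead.
   The helper name Sketch_Inverse4NoPivot and the plain float type are
   illustrative assumptions.
*/
static void Sketch_Inverse4NoPivot(float w[16])
{
  int k,r,c;
  for (k=0; k<4; k++) {
    float di = w[4*k+k];                            /* pivot, assumed nonzero */
    float d  = 1.0f/di;
    w[4*k+k] = d;
    for (c=0; c<4; c++) if (c!=k) w[4*c+k] *= -d;   /* scale pivot row    */
    for (r=0; r<4; r++) if (r!=k) w[4*k+r] *=  d;   /* scale pivot column */
    for (c=0; c<4; c++) if (c!=k) {
      for (r=0; r<4; r++) if (r!=k) {
        w[4*c+r] += w[4*c+k]*w[4*k+r]*di;           /* rank-1 update with the old pivot */
      }
    }
  }
}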
1893 | #endif |