Actual source code: fsolvebaij.F

  1: !
  2: !  "$Id: fsolvebaij.F,v 1.9 2001/08/07 03:05:24 balay Exp $";
  3: !
  4: !    Fortran kernel for sparse triangular solve in the BAIJ matrix format
  5: ! This ONLY works for factorizations in the NATURAL ORDERING, i.e.
  6: ! with MatSolve_SeqBAIJ_4_NaturalOrdering()
  7: !
 8:  #include include/finclude/petscdef.h
  9: !
 10:       subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b)
 11:       implicit none
 12:       MatScalar        a(0:*)
 13:       PetscScalar      x(0:*),b(0:*)
 14:       integer          n,ai(0:*),aj(0:*),adiag(0:*)

 16:       integer          i,j,jstart,jend,idx,ax,jdx
 17:       PetscScalar      s1,s2,s3,s4
 18:       PetscScalar      x1,x2,x3,x4
 19: !
 20: !     Forward Solve
 21: !

 23:       x(0) = b(0)
 24:       x(1) = b(1)
 25:       x(2) = b(2)
 26:       x(3) = b(3)
 27:       idx  = 0
 28:       do 20 i=1,n-1
 29:          jstart = ai(i)
 30:          jend   = adiag(i) - 1
 31:          ax    = 16*jstart
 32:          idx    = idx + 4
 33:          s1     = b(idx)
 34:          s2     = b(idx+1)
 35:          s3     = b(idx+2)
 36:          s4     = b(idx+3)
 37:          do 30 j=jstart,jend
 38:            jdx   = 4*aj(j)
 39: 
 40:            x1    = x(jdx)
 41:            x2    = x(jdx+1)
 42:            x3    = x(jdx+2)
 43:            x4    = x(jdx+3)
 44:            s1 = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
 45:            s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
 46:            s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
 47:            s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
 48:            ax = ax + 16
 49:  30      continue
 50:          x(idx)   = s1
 51:          x(idx+1) = s2
 52:          x(idx+2) = s3
 53:          x(idx+3) = s4
 54:  20   continue
 55: 
 56: !
 57: !     Backward solve the upper triangular
 58: !
 59:       do 40 i=n-1,0,-1
 60:          jstart  = adiag(i) + 1
 61:          jend    = ai(i+1) - 1
 62:          ax     = 16*jstart
 63:          s1      = x(idx)
 64:          s2      = x(idx+1)
 65:          s3      = x(idx+2)
 66:          s4      = x(idx+3)
 67:          do 50 j=jstart,jend
 68:            jdx   = 4*aj(j)
 69:            x1    = x(jdx)
 70:            x2    = x(jdx+1)
 71:            x3    = x(jdx+2)
 72:            x4    = x(jdx+3)
 73:            s1 = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
 74:            s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
 75:            s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
 76:            s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
 77:            ax = ax + 16
 78:  50      continue
 79:          ax      = 16*adiag(i)
 80:          x(idx)   = a(ax)*s1  +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4
 81:          x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4
 82:          x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4
 83:          x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4
 84:          idx      = idx - 4
 85:  40   continue
 86:       return
 87:       end
 88: 
 89: !
 90: !   version that calls BLAS 2 operation for each row block
 91: !
 92:       subroutine FortranSolveBAIJ4BLAS(n,x,ai,aj,adiag,a,b,w)
 93:       implicit none
 94:       MatScalar        a(0:*)
 95:       PetscScalar      x(0:*),b(0:*),w(0:*)
 96:       integer          n,ai(0:*),aj(0:*),adiag(0:*)

 98:       integer          i,j,jstart,jend,idx,ax,jdx,kdx
 99:       PetscScalar      s(0:3)
100: !
101: !     Forward Solve
102: !

104:       x(0) = b(0)
105:       x(1) = b(1)
106:       x(2) = b(2)
107:       x(3) = b(3)
108:       idx  = 0
109:       do 20 i=1,n-1
110: !
111: !        Pack required part of vector into work array
112: !
113:          kdx    = 0
114:          jstart = ai(i)
115:          jend   = adiag(i) - 1
116:          if (jend - jstart .ge. 500) then
117:            write(6,*) 'Overflowing vector FortranSolveBAIJ4BLAS()'
118:          endif
119:          do 30 j=jstart,jend
120: 
121:            jdx       = 4*aj(j)
122: 
123:            w(kdx)    = x(jdx)
124:            w(kdx+1)  = x(jdx+1)
125:            w(kdx+2)  = x(jdx+2)
126:            w(kdx+3)  = x(jdx+3)
127:            kdx       = kdx + 4
128:  30      continue

130:          ax      = 16*jstart
131:          idx      = idx + 4
132:          s(0)     = b(idx)
133:          s(1)     = b(idx+1)
134:          s(2)     = b(idx+2)
135:          s(3)     = b(idx+3)
136: !
137: !    s = s - a(ax:)*w
138: !
139:          call dgemv('n',4,4*(jend-jstart+1),-1.d0,a(ax),4,w,1,1.d0,s,1)

141:          x(idx)   = s(0)
142:          x(idx+1) = s(1)
143:          x(idx+2) = s(2)
144:          x(idx+3) = s(3)
145:  20   continue
146: 
147: !
148: !     Backward solve the upper triangular
149: !
150:       do 40 i=n-1,0,-1
151:          jstart    = adiag(i) + 1
152:          jend      = ai(i+1) - 1
153:          ax       = 16*jstart
154:          s(0)      = x(idx)
155:          s(1)      = x(idx+1)
156:          s(2)      = x(idx+2)
157:          s(3)      = x(idx+3)
158: !
159: !   Pack each chunk of vector needed
160: !
161:          kdx = 0
162:          if (jend - jstart .ge. 500) then
163:            write(6,*) 'Overflowing vector FortranSolveBAIJ4BLAS()'
164:          endif
165:          do 50 j=jstart,jend
166:            jdx      = 4*aj(j)
167:            w(kdx)   = x(jdx)
168:            w(kdx+1) = x(jdx+1)
169:            w(kdx+2) = x(jdx+2)
170:            w(kdx+3) = x(jdx+3)
171:            kdx      = kdx + 4
172:  50      continue
173:          call dgemv('n',4,4*(jend-jstart+1),-1.d0,a(ax),4,w,1,1.d0,s,1)

175:          ax      = 16*adiag(i)
176:          x(idx)  = a(ax)*s(0)  +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
177:          x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
178:          x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
179:          x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
180:          idx     = idx - 4
181:  40   continue
182:       return
183:       end
184: 

186: !
187: !   version that does not call BLAS 2 operation for each row block
188: !
189:       subroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w)
190:       implicit none
191:       MatScalar        a(0:*)
192:       PetscScalar      x(0:*),b(0:*),w(0:*)
193:       integer          n,ai(0:*),aj(0:*),adiag(0:*),ii,jj

195:       integer          i,j,jstart,jend,idx,ax,jdx,kdx,nn
196:       PetscScalar      s(0:3)
197: !
198: !     Forward Solve
199: !

201:       x(0) = b(0)
202:       x(1) = b(1)
203:       x(2) = b(2)
204:       x(3) = b(3)
205:       idx  = 0
206:       do 20 i=1,n-1
207: !
208: !        Pack required part of vector into work array
209: !
210:          kdx    = 0
211:          jstart = ai(i)
212:          jend   = adiag(i) - 1
213:          if (jend - jstart .ge. 500) then
214:            write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
215:          endif
216:          do 30 j=jstart,jend
217: 
218:            jdx       = 4*aj(j)
219: 
220:            w(kdx)    = x(jdx)
221:            w(kdx+1)  = x(jdx+1)
222:            w(kdx+2)  = x(jdx+2)
223:            w(kdx+3)  = x(jdx+3)
224:            kdx       = kdx + 4
225:  30      continue

227:          ax       = 16*jstart
228:          idx      = idx + 4
229:          s(0)     = b(idx)
230:          s(1)     = b(idx+1)
231:          s(2)     = b(idx+2)
232:          s(3)     = b(idx+3)
233: !
234: !    s = s - a(ax:)*w
235: !
236:          nn = 4*(jend - jstart + 1) - 1
237:          do 100, ii=0,3
238:            do 110, jj=0,nn
239:              s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
240:  110       continue
241:  100     continue

243:          x(idx)   = s(0)
244:          x(idx+1) = s(1)
245:          x(idx+2) = s(2)
246:          x(idx+3) = s(3)
247:  20   continue
248: 
249: !
250: !     Backward solve the upper triangular
251: !
252:       do 40 i=n-1,0,-1
253:          jstart    = adiag(i) + 1
254:          jend      = ai(i+1) - 1
255:          ax        = 16*jstart
256:          s(0)      = x(idx)
257:          s(1)      = x(idx+1)
258:          s(2)      = x(idx+2)
259:          s(3)      = x(idx+3)
260: !
261: !   Pack each chunk of vector needed
262: !
263:          kdx = 0
264:          if (jend - jstart .ge. 500) then
265:            write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
266:          endif
267:          do 50 j=jstart,jend
268:            jdx      = 4*aj(j)
269:            w(kdx)   = x(jdx)
270:            w(kdx+1) = x(jdx+1)
271:            w(kdx+2) = x(jdx+2)
272:            w(kdx+3) = x(jdx+3)
273:            kdx      = kdx + 4
274:  50      continue
275:          nn = 4*(jend - jstart + 1) - 1
276:          do 200, ii=0,3
277:            do 210, jj=0,nn
278:              s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
279:  210       continue
280:  200     continue

282:          ax      = 16*adiag(i)
283:          x(idx)  = a(ax)*s(0)  +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
284:          x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
285:          x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
286:          x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
287:          idx     = idx - 4
288:  40   continue
289:       return
290:       end
291: