#include "petscda.h"
#include "petscsnes.h"
#include "ad_deriv.h"

extern int ad_localfunction2d(DERIV_TYPE **,DERIV_TYPE **,int,int,int,int,int,int,void *);

/* ------------------------------------------------------------------- */
#undef __FUNCT__
#define __FUNCT__ "FormJacobian"
/*
   FormJacobian - Evaluates Jacobian matrix.

   Input Parameters:
.  snes - the SNES context
.  x - input vector
.  ptr - optional user-defined context, as set by SNESSetJacobian()

   Output Parameters:
.  A - Jacobian matrix
.  B - optionally different preconditioning matrix
.  flag - flag indicating matrix structure

   Notes:
   Due to grid point reordering with DAs, we must always work
   with the local grid points, and then transform them to the new
   global numbering with the local-to-global mapping.  We cannot work
   directly with the global numbers for the original uniprocessor grid!  

   Two methods are available for imposing this transformation
   when setting matrix entries:
     (A) MatSetValuesLocal(), using the local ordering (including
         ghost points!)
         - Do the following two steps once, before calling SNESSolve()
           - Use DAGetISLocalToGlobalMapping() to extract the
             local-to-global map from the DA
           - Associate this map with the matrix by calling
             MatSetLocalToGlobalMapping() 
         - Then set matrix entries using the local ordering
           by calling MatSetValuesLocal()
     (B) MatSetValues(), using the global ordering 
         - Use DAGetGlobalIndices() to extract the local-to-global map
         - Then apply this map explicitly yourself
         - Set matrix entries using the global ordering by calling
           MatSetValues()
   Option (A) seems cleaner/easier in many cases, and is the procedure
   used in this example.

   This is the version that uses Sparse AD via coloring.
   Assumptions:
      -#dims = 2
      -stencil width = 1
      -stencil type = DA_STENCIL_STAR
      -periodicity = DA_NONPERIODIC
      -col[], av[] are big enough
   For now, focus is on correctness rather than efficiency
      -mallocs and frees on each call
      -recomputation of permutation on each call
      -conditionals in loops during assembly
*/
int FormJacobian(SNES snes,Vec X,Mat *J,Mat *B,MatStructure *flag,void *ptr)
{
  DMMG    dmmg = (DMMG) ptr;
  int     ierr,i,j,k,l,row,col[25];
  int     xs,ys,xm,ym;
  int     gxs,gys,gxm,gym;
  int     mx,my;
  Vec     localX;
  Scalar  **x;
  Mat     jac = *B;                /* Jacobian matrix */

  DERIV_TYPE **ad_x, **ad_f, *ad_ptr, dummy;
  int color, permutation[5][5];
  Scalar av[25];
  int dim,dof,stencil_size,stencil_width;
  DAStencilType stencil_type;
  DAPeriodicType periodicity;
  int procid;

  DA      da= (DA) dmmg->dm;

  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&procid);

  ierr = DAGetLocalVector(da,&localX);CHKERRQ(ierr);
  /*
     Scatter ghost points to local vector, using the 2-step process
        DAGlobalToLocalBegin(), DAGlobalToLocalEnd().
     By placing code between these two statements, computations can be
     done while messages are in transition.
  */
  ierr = DAGlobalToLocalBegin(da,X,INSERT_VALUES,localX);CHKERRQ(ierr);
  ierr = DAGlobalToLocalEnd(da,X,INSERT_VALUES,localX);CHKERRQ(ierr);

  ierr = DAGetInfo(da,&dim,&mx,&my,0,0,0,0,&dof,&stencil_width,&periodicity,&stencil_type); CHKERRQ(ierr);

  /* Verify that this DA type is supported */
  if ((dim != 2) || (stencil_width != 1) || (stencil_type != DA_STENCIL_STAR)
      || (periodicity != DA_NONPERIODIC)) {
    SETERRQ(0,"This DA type is not yet supported. Sorry.\n");
  }

  stencil_size = 5;

  /*
     Get local grid boundaries
  */
  ierr = DAGetCorners(da,&xs,&ys,PETSC_NULL,&xm,&ym,PETSC_NULL);CHKERRQ(ierr);
  ierr = DAGetGhostCorners(da,&gxs,&gys,PETSC_NULL,&gxm,&gym,PETSC_NULL);CHKERRQ(ierr);

  /*  printf("myid = %d x[%d,%d], y[%d,%d], gx[%d,%d], gy[%d,%d]\n",myid); */



  /*
     Get pointer to vector data
  */
  ierr = DAVecGetArray(da,localX,(void**)&x);CHKERRQ(ierr);

  /* allocate space for derivative objects.  Should be able to use something 
     like:

      ierr = DAVecGetStructArray(da,localX,(void **)&ad_x,sizeof(DERIV_TYPE));CHKERRQ(ierr);
      ierr = DAVecGetStructArray(da,F,(void **)&ad_f,sizeof(DERIV_TYPE));CHKERRQ(ierr);

      instead. */

  ierr = PetscMalloc((gym)*sizeof(DERIV_TYPE *),(void **)&ad_x); CHKERRQ(ierr);
  ad_x -= gys;
  for(j=gys;j<gys+gym;j++) {
    ierr = PetscMalloc((dof*gxm)*sizeof(DERIV_TYPE),(void **)&ad_ptr); CHKERRQ(ierr);
    ad_x[j] = ad_ptr - dof*gxs;
    for(i=dof*gxs;i<dof*(gxs+gxm);i++) DERIV_val(ad_x[j][i]) = x[j][i];
  }

  ierr = PetscMalloc((ym)*sizeof(DERIV_TYPE *),(void**)&ad_f); CHKERRQ(ierr); 
  ad_f -= ys; 
  for(j=ys;j<ys+ym;j++) {
    ierr = PetscMalloc((dof*xm)*sizeof(DERIV_TYPE),(void**)&ad_ptr);  CHKERRQ(ierr);
    ad_f[j] = ad_ptr - dof*xs;
  }

  ad_AD_Init(stencil_size*dof);

  /* Fake ADIC into thinking there are stencil_size*dof independent variables - fix this */
  for(i=0;i<stencil_size*dof;i++) ad_AD_SetIndep(dummy);
  ad_AD_SetIndepDone();

  /* Could use PETSc facilities for coloring, but these are column
     oriented, to facilitate finite differencing; since AD can compute
     all columns at once, it seems better to be a bit more row oriented */

  /* Initialize permutation array - could be done once, outside this routine*/

  /* This is specific to STENCIL_STAR with width 1 */
  for(color=0;color<5;color++) {
    /* % doesn't seem to behave as it should for negative numbers, so
       we add 2 instead of subtracting 3 and add 4 instead of
       subtracting 1 */
    permutation[color][0] = (color + 2) % 5; /* South */
    permutation[color][1] = (color + 4) % 5; /* West */
    permutation[color][2] = color;           /* Self */
    permutation[color][3] = (color + 1) % 5; /* East */
    permutation[color][4] = (color + 3) % 5; /* North */
  }


  /* Initialize seed matrix, using coloring */

  for(j=gys;j<gys+gym;j++) {
    for(i=gxs;i<gxs+gxm;i++) {
      color = (3*j+i) % 5; /* specific to STENCIL_STAR with width 1 */ 
      for(k=0;k<dof;k++) {
	ad_AD_ClearGrad(DERIV_grad(ad_x[j][i*dof+k]));
	DERIV_grad(ad_x[j][i*dof+k])[color*dof+k] = 1.0;
      }
    }
  }

  /* 
     Compute entries for the locally owned part of the Jacobian.
      - Currently, all PETSc parallel matrix formats are partitioned by
        contiguous chunks of rows across the processors. 
      - Each processor needs to insert only elements that it owns
        locally (but any non-local elements will be sent to the
        appropriate processor during matrix assembly). 
      - Here, we set all entries for a particular row at once.
      - We can set matrix entries either using either
        MatSetValuesLocal() or MatSetValues(), as discussed above.
  */

  ierr = ad_localfunction2d(ad_x,ad_f,xs,xm,ys,ym,mx,my,dmmg->user); CHKERRQ(ierr);

  /* Assemble Jacobian - assumes star stencil of width 1
     Could probably be simplified by using Block operations */

  /* might also be simpler if introduced a loop over dof and repalce row with 
     vertex, row=vertex+idof */

  for (j=ys; j<ys+ym; j++) {
    row = (j - gys)*gxm*dof + (xs - gxs)*dof - 1;
    for (i=dof*xs; i<dof*(xs+xm); i++) {
      row++;
      /* Extract row of AD Jacobian and permute */
      color = (3*j+i/dof) % 5;
      /* printf("Proc %d Row %d:",procid,row); */
      for(k=0;k<5;k++) {
	for(l=0;l<dof;l++) {
	  av[k*dof+l] = DERIV_grad(ad_f[j][i])[permutation[color][k]*dof+l];
	  /* printf("%g ",DERIV_grad(ad_f[j][i])[permutation[color][k]*dof+l]); */
	}
      }
      /* printf("\n"); */
      for(k=0;k<dof;k++){
	col[0*dof+k] = (row/dof - gxm)*dof + k;
	col[1*dof+k] = (row/dof - 1)*dof + k;
	col[2*dof+k] = (row/dof)*dof + k;
	col[3*dof+k] = (row/dof + 1)*dof + k;
	col[4*dof+k] = (row/dof + gxm)*dof + k;
	/* Tell PETSc not to insert a value if we're at a boundary */
        if (j==0)        col[0*dof+k] = -1; /* Boundary: no south neighbor */ 
	if (i/dof==0)    col[1*dof+k] = -1; /* Boundary: no west neighbor */
	if (i/dof==mx-1) col[3*dof+k] = -1; /* Boundary: no east neighbor */
	if (j==my-1)     col[4*dof+k] = -1; /* Boundary: no north neighbor */ 
      }
      /* Insert a row */
      ierr = MatSetValuesLocal(jac,1,&row,stencil_size*dof,col,av,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
 
  /* Could have actually done this right after ad_x was initialized */

  ierr = DAVecRestoreArray(da,localX,(void**)&x);CHKERRQ(ierr);
  ierr = DARestoreLocalVector(da,&localX);CHKERRQ(ierr);

  /* Free DERIV_TYPE arrays - again, this should be a Restore-type operation*/

  for(j=gys;j<gys+gym;j++) {
    ierr = PetscFree(ad_x[j]+dof*gxs); CHKERRQ(ierr);
  }
  ierr = PetscFree(ad_x + gys); CHKERRQ(ierr);

  for(j=ys;j<ys+ym;j++) {
    ierr = PetscFree(ad_f[j]+dof*xs); CHKERRQ(ierr);
  }
  ierr = PetscFree(ad_f + ys); CHKERRQ(ierr);

  /* 
     Assemble matrix, using the 2-step process:
       MatAssemblyBegin(), MatAssemblyEnd().
  */
  ierr = MatAssemblyBegin(jac,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(jac,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /*
     Normally since the matrix has already been assembled above; this
     would do nothing. But in the matrix free mode -snes_mf_operator
     this tells the "matrix-free" matrix that a new linear system solve
     is about to be done.
  */

  ierr = MatAssemblyBegin(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*J,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /*
     Set flag to indicate that the Jacobian matrix retains an identical
     nonzero structure throughout all nonlinear iterations (although the
     values of the entries change). Thus, we can save some work in setting
     up the preconditioner (e.g., no need to redo symbolic factorization for
     ILU/ICC preconditioners).
      - If the nonzero structure of the matrix is different during
        successive linear solves, then the flag DIFFERENT_NONZERO_PATTERN
        must be used instead.  If you are unsure whether the matrix
        structure has changed or not, use the flag DIFFERENT_NONZERO_PATTERN.
      - Caution:  If you specify SAME_NONZERO_PATTERN, PETSc
        believes your assertion and does not check the structure
        of the matrix.  If you erroneously claim that the structure
        is the same when it actually is not, the new preconditioner
        will not function correctly.  Thus, use this optimization
        feature with caution!
  */
  *flag = SAME_NONZERO_PATTERN;


  /*
     Tell the matrix we will never add a new nonzero location to the
     matrix. If we do, it will generate an error.
  */
  ierr = MatSetOption(jac,MAT_NEW_NONZERO_LOCATION_ERR);CHKERRQ(ierr);
  return 0;
}

