Actual source code: ex52_integrateElementOpenCL.c

petsc-dev 2014-02-02
Report Typos and Errors
  1: #include <petscsys.h>
  2: #include <stdlib.h>

  4: #ifdef __APPLE__
  5: #include <OpenCL/cl.h>
  6: #else
  7: #include <CL/cl.h>
  8: #endif

 10: //#define SPATIAL_DIM_0 2

 12: typedef enum {LAPLACIAN = 0, ELASTICITY} OpType;

 14: /* Put the OpenCL program into a source string.
 15:  * This allows to generate all the code at runtime, no need for external Python magic as for CUDA
 16:  *
 17:  * The code uses snprintf() to concatenate strings, as this is safer than strcat().
 18:  */
 21: PetscErrorCode generateOpenCLSource(char **string_buffer, PetscInt buffer_length, PetscInt spatial_dim, PetscInt N_bl, PetscInt pde_op)
 22: {
 23:   char            *string_tail   = *string_buffer;
 24:   char            *end_of_buffer = *string_buffer + buffer_length;
 25:   PetscInt        num_quadrature_points = 1;
 26:   PetscInt        num_basis_components = (pde_op == LAPLACIAN) ? 1 : spatial_dim;
 27:   PetscInt        num_basis_functions = 3;
 28:   PetscInt        num_threads = num_basis_functions * num_basis_components * num_quadrature_points * N_bl; /* N_t */

 30: /* dim     Number of spatial dimensions:          2                   */
 31: /* N_b     Number of basis functions:             generated           */
 32: /* N_{bt}  Number of total basis functions:       N_b * N_{comp}      */
 33: /* N_q     Number of quadrature points:           generated           */
 34: /* N_{bs}  Number of block cells                  LCM(N_b, N_q)       */
 35: /* N_{bst} Number of block cell components        LCM(N_{bt}, N_q)    */
 36: /* N_{bl}  Number of concurrent blocks            generated           */
 37: /* N_t     Number of threads:                     N_{bl} * N_{bs}     */
 38: /* N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q        */
 39: /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b        */
 40: /* N_{sbc} Number of serial     basis      cells: N_{bs} / N_q        */
 41: /* N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b        */
 42: /* N_{cb}  Number of serial cell batches:         input               */
 43: /* N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp} */

 45: #define STRING_ERROR_CHECK(MSG) \
 46:   if (string_tail == end_of_buffer) {\
 47:     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, MSG);\
 48:   }

 51:   char float_str[] = "float";
 52:   char double_str[] = "double";
 53:   char *numeric_str = &(float_str[0]);

 55:   /* Enable device extension for double precision */
 56:   if (sizeof(PetscReal) == sizeof(double)) {
 57:     string_tail += snprintf(string_tail, end_of_buffer - string_tail,
 58: "#if defined(cl_khr_fp64)\n"
 59: "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
 60: "#elif defined(cl_amd_fp64)\n"
 61: "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
 62: "#endif\n");
 63:     numeric_str = &(double_str[0]);
 64:   }

 66:   string_tail += snprintf(string_tail, end_of_buffer - string_tail,
 67: "\n"
 68: "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
 69: "{\n", numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short");

 71:   if (spatial_dim == 2) {
 72:     string_tail += snprintf(string_tail, end_of_buffer - string_tail,
 73: "  const int numQuadraturePoints_0 = %d;\n"
 74: "\n"
 75: "  /* Quadrature points\n"
 76: "   - (x1,y1,x2,y2,...) */\n"
 77: "  const %s points_0[2] = {\n"
 78: "    -0.333333333333,\n"
 79: "    -0.333333333333};\n"
 80: "\n"
 81: "  /* Quadrature weights\n"
 82: "   - (v1,v2,...) */\n"
 83: "  const %s weights_0[1] = {2.0};\n"
 84: "\n"
 85: "  const int numBasisFunctions_0 = %d;\n"
 86: "  const int numBasisComponents_0 = %d;\n", num_quadrature_points, numeric_str, numeric_str, num_basis_functions, num_basis_components);STRING_ERROR_CHECK("Message to short");

 88:     if (pde_op == LAPLACIAN) {
 89:       string_tail += snprintf(string_tail, end_of_buffer - string_tail,
 90: "\n"
 91: "  /* Nodal basis function evaluations\n"
 92: "    - basis function is fastest varying, then point */\n"
 93: "  const %s Basis_0[3] = {\n"
 94: "    0.333333333333,\n"
 95: "    0.333333333333,\n"
 96: "    0.333333333333};\n"
 97: "\n"
 98: "  /* Nodal basis function derivative evaluations,\n"
 99: "      - derivative direction fastest varying, then basis function, then point */\n"
100: "  const %s2 BasisDerivatives_0[3] = {\n"
101: "    (%s2)(-0.5, -0.5),\n"
102: "    (%s2)(0.5, 0.0),\n"
103: "    (%s2)(0.0, 0.5)};\n"
104: "\n", numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short");
105:     } else if (pde_op == ELASTICITY) {
106:       string_tail += snprintf(string_tail, end_of_buffer - string_tail,
107: "\n"
108: "  /* Nodal basis function evaluations\n"
109: "    - basis function is fastest varying, then point */\n"
110: "  const %s Basis_0[6] = {\n"
111: "    0.333333333333,\n"
112: "    0.333333333333,\n"
113: "    0.333333333333,\n"
114: "    0.333333333333,\n"
115: "    0.333333333333,\n"
116: "    0.333333333333};\n"
117: "\n"
118: "  /* Nodal basis function derivative evaluations,\n"
119: "      - derivative direction fastest varying, then basis function, then point */\n"
120: "  const %s2 BasisDerivatives_0[6] = {\n"
121: "    (%s2)(-0.5, -0.5),\n"
122: "    (%s2)(-0.5, -0.5),\n"
123: "    (%s2)(0.5, 0.0),\n"
124: "    (%s2)(0.5, 0.0),\n"
125: "    (%s2)(0.0, 0.5),\n"
126: "    (%s2)(0.0, 0.5)};\n"
127: "\n", numeric_str, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short");
128:     }
129:   } else if (spatial_dim == 3) {
130:   }

132:   string_tail += snprintf(string_tail, end_of_buffer - string_tail,
133: "  /* Number of concurrent blocks */\n"
134: "  const int N_bl = %d;\n"
135: "\n"
136: /* Argument */
137: "  const int dim    = %d;\n"
138: /* Argument */
139: "  const int N_b    = numBasisFunctions_0;           // The number of basis functions\n"
140: "  const int N_comp = numBasisComponents_0;          // The number of basis function components\n"
141: "  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions\n"
142: "  const int N_q    = numQuadraturePoints_0;         // The number of quadrature points\n"
143: "  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
144: "  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl\n"
145: "  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)\n"
146: "  const int N_c    = N_cb * N_bc;\n"
147: "  const int N_sbc  = N_bst / (N_q * N_comp);\n"
148: "  const int N_sqc  = N_bst / N_bt;\n"
149: "\n"
150: "  /* Calculated indices */\n"
151: "  const int tidx    = get_local_id(0) + get_local_size(0)*get_local_id(1);\n"
152: "  const int blidx   = tidx / N_bst;                  // Block number for this thread\n"
153: "  const int bidx    = tidx %% N_bt;                   // Basis function mapped to this thread\n"
154: "  const int cidx    = tidx %% N_comp;                 // Basis component mapped to this thread\n"
155: "  const int qidx    = tidx %% N_q;                    // Quadrature point mapped to this thread\n"
156: "  const int blbidx  = tidx %% N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase\n"
157: "  const int blqidx  = tidx %% N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase\n"
158: "  const int gidx    = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
159: "  const int Goffset = gidx*N_c;\n"
160: "  const int Coffset = gidx*N_c*N_bt;\n"
161: "  const int Eoffset = gidx*N_c*N_bt;\n", N_bl, spatial_dim);STRING_ERROR_CHECK("Message to short");

163:   string_tail += snprintf(string_tail, end_of_buffer - string_tail,
164: "\n"
165: "  /* Quadrature data */\n"
166: "  %s                w;                   // $w_q$, Quadrature weight at $x_q$\n"
167: "  __local %s%d       phiDer_i[%d]; //[N_bt*N_q];  // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
168: "  /* Geometric data */\n"
169: "  __local %s        detJ[%d]; //[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
170: "  __local %s        invJ[%d];//[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n"
171: "  /* FEM data */\n"
172: "  __local %s        u_i[%d]; //[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n"
173: "  /* Intermediate calculations */\n"
174: "  __local %s%d       f_1[%d]; //[N_t*N_sqc];      // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n"
175: "  /* Output data */\n"
176: "  %s                e_i;                 // Coefficient $e_i$ of the residual\n"
177: "\n", numeric_str,
178:       numeric_str, spatial_dim,
179:       num_basis_functions * num_basis_components * num_quadrature_points,     /* size of PhiDer_i */
180:       numeric_str, num_threads, /* size of detJ */
181:       numeric_str, num_threads * spatial_dim * spatial_dim, /* size of invJ */
182:       numeric_str, num_threads * num_basis_functions * num_basis_components, /* size of u_i */
183:       numeric_str, spatial_dim, num_threads * num_quadrature_points /* size of f_1 */,
184:       numeric_str);STRING_ERROR_CHECK("Message to short");

186:   string_tail += snprintf(string_tail, end_of_buffer - string_tail,
187: "  /* These should be generated inline */\n"
188: "  /* Load quadrature weights */\n"
189: "  w = weights_0[qidx];\n"
190: "  /* Load basis tabulation \\phi_i for this cell */\n"
191: "  if (tidx < N_bt*N_q) {\n"
192: " // phi_i[tidx]    = Basis_0[tidx];\n"
193: "    phiDer_i[tidx] = BasisDerivatives_0[tidx];\n"
194: "  }\n"
195: "\n"
196: "  for (int batch = 0; batch < N_cb; ++batch) {\n"
197: "    /* Load geometry */\n"
198: "    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
199: "    for (int n = 0; n < dim*dim; ++n) {\n"
200: "      const int offset = n*N_t;\n"
201: "      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
202: "    }\n"
203: "    /* Load coefficients u_i for this cell */\n"
204: "    for (int n = 0; n < N_bt; ++n) {\n"
205: "      const int offset = n*N_t;\n"
206: "      u_i[offset+tidx] = coefficients[Coffset+batch*N_t*N_b+offset+tidx];\n"
207: "    }\n"
208: "\n"
209: "    /* Map coefficients to values at quadrature points */\n"
210: "    for (int c = 0; c < N_sqc; ++c) {\n"
211: "      %s  u[%d]; //[N_comp];     // $u(x_q)$, Value of the field at $x_q$\n"
212: "      %s%d   gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n"
213: "      const int cell          = c*N_bl*N_b + blqidx;\n"
214: "      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;\n"
215: "\n"
216: "      for (int comp = 0; comp < N_comp; ++comp) {\n"
217: "        gradU[comp].x = 0.0; gradU[comp].y = 0.0;", numeric_str, num_basis_components, numeric_str, spatial_dim, num_basis_components);STRING_ERROR_CHECK("Message to short");

219:   if (spatial_dim == 3) {
220:     string_tail += snprintf(string_tail, end_of_buffer - string_tail, " gradU[comp].z = 0.0;");
221:     if (string_tail == end_of_buffer) {
222:       SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "String too short!");
223:     }
224:   }

226:   string_tail += snprintf(string_tail, end_of_buffer - string_tail,
227: "\n"
228: "      }\n"
229: "      /* Get field and derivatives at this quadrature point */\n"
230: "      for (int i = 0; i < N_b; ++i) {\n"
231: "        for (int comp = 0; comp < N_comp; ++comp) {\n"
232: "          const int b    = i*N_comp+comp;\n"
233: "          const int pidx = qidx*N_bt + b;\n"
234: "          const int uidx = cell*N_bt + b;\n"
235: "          %s%d   realSpaceDer;\n"
236: "\n", numeric_str, spatial_dim);STRING_ERROR_CHECK("Message to short");

238:   if (spatial_dim == 2) {
239:     string_tail += snprintf(string_tail, end_of_buffer - string_tail,
240: "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
241: "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
242: "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
243: "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n");STRING_ERROR_CHECK("Message to short");
244:   } else {
245:     string_tail += snprintf(string_tail, end_of_buffer - string_tail,
246: "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
247: "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
248: "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
249: "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
250: "          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
251: "          gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n");STRING_ERROR_CHECK("Message to short");
252:   }

254:   string_tail += snprintf(string_tail, end_of_buffer - string_tail,
255: "        }\n"
256: "      }\n"
257: "      /* Process values at quadrature points */\n");STRING_ERROR_CHECK("Message to short");

259:   /* Process values at quadrature points as induced by the PDE operator */
260:   if (pde_op == LAPLACIAN) {
261:     string_tail += snprintf(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = gradU[cidx];\n");STRING_ERROR_CHECK("Message to short");
262:   } else if (spatial_dim == 2 && pde_op == ELASTICITY) {
263:     string_tail += snprintf(string_tail, end_of_buffer - string_tail,
264: "      switch (cidx) {\n"
265: "      case 0:\n"
266: "        f_1[fidx].x = 0.5*(gradU[0].x + gradU[0].x);\n"
267: "        f_1[fidx].y = 0.5*(gradU[0].y + gradU[1].x);\n"
268: "        break;\n"
269: "      case 1:\n"
270: "        f_1[fidx].x = 0.5*(gradU[1].x + gradU[0].y);\n"
271: "        f_1[fidx].y = 0.5*(gradU[1].y + gradU[1].y);\n"
272: "      }\n");STRING_ERROR_CHECK("Message to short");
273:   } else if (spatial_dim == 3 && pde_op == ELASTICITY) {
274:     string_tail += snprintf(string_tail, end_of_buffer - string_tail,
275: "      switch (cidx) {\n"
276: "      case 0:\n"
277: "        f_1[fidx].x = 0.5*(gradU[0].x + gradU[0].x);\n"
278: "        f_1[fidx].y = 0.5*(gradU[0].y + gradU[1].x);\n"
279: "        f_1[fidx].z = 0.5*(gradU[0].z + gradU[2].x);\n"
280: "        break;\n"
281: "      case 1:\n"
282: "        f_1[fidx].x = 0.5*(gradU[1].x + gradU[0].y);\n"
283: "        f_1[fidx].y = 0.5*(gradU[1].y + gradU[1].y);\n"
284: "        f_1[fidx].z = 0.5*(gradU[1].y + gradU[2].y);\n"
285: "        break;\n"
286: "      case 2:\n"
287: "        f_1[fidx].x = 0.5*(gradU[2].x + gradU[0].z);\n"
288: "        f_1[fidx].y = 0.5*(gradU[2].y + gradU[1].z);\n"
289: "        f_1[fidx].z = 0.5*(gradU[2].y + gradU[2].z);\n"
290: "      }\n");STRING_ERROR_CHECK("Message to short");
291:   } else {
292:     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Combination of spatial dimension and PDE operator invalid");
293:   }

295:   if (spatial_dim == 2) {
296:     string_tail += snprintf(string_tail, end_of_buffer - string_tail, "      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; \n");STRING_ERROR_CHECK("Message to short");
297:   } else if (spatial_dim == 2) {
298:     string_tail += snprintf(string_tail, end_of_buffer - string_tail, "      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n");STRING_ERROR_CHECK("Message to short");
299:   }

301:   string_tail += snprintf(string_tail, end_of_buffer - string_tail,
302: "\n"
303: "    }\n"
304: "\n"
305: "    /* ==== TRANSPOSE THREADS ==== */\n"
306: "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
307: "\n"
308: "    /* Map values at quadrature points to coefficients */\n"
309: "    for (int c = 0; c < N_sbc; ++c) {\n"
310: "      const int cell = c*N_bl*N_q + blbidx;\n"
311: "\n"
312: "      e_i = 0.0;\n"
313: "      for (int q = 0; q < N_q; ++q) {\n"
314: "        const int pidx = q*N_bt + bidx;\n"
315: "        const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
316: "        %s%d   realSpaceDer;\n"
317: "\n"
318: "        // e_i += phi_i[pidx]*f_0[fidx];\n", numeric_str, spatial_dim);STRING_ERROR_CHECK("Message to short");

320:   if (spatial_dim == 2) {
321:     string_tail += snprintf(string_tail, end_of_buffer - string_tail,
322: "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
323: "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
324: "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
325: "        e_i           += realSpaceDer.y*f_1[fidx].y;\n");STRING_ERROR_CHECK("Message to short");
326:   } else {
327:     string_tail += snprintf(string_tail, end_of_buffer - string_tail,
328: "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
329: "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
330: "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
331: "        e_i           += realSpaceDer.y*f_1[fidx].y;\n"
332: "        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
333: "        e_i           += realSpaceDer.z*f_1[fidx].z;\n");STRING_ERROR_CHECK("Message to short");
334:   }

336:   string_tail += snprintf(string_tail, end_of_buffer - string_tail,
337: "      }\n"
338: "      /* Write element vector for N_{cbc} cells at a time */\n"
339: "      elemVec[Eoffset+(batch*N_sbc+c)*N_t+tidx] = e_i;\n"
340: "    }\n"
341: "    /* ==== Could do one write per batch ==== */\n"
342: "  }\n"
343: "  return;\n"
344: "}  \n");STRING_ERROR_CHECK("Message to short");

346:   return(0);
347: }


350: /* Struct collecting information for a typical OpenCL environment (one platform, one device, one context, one queue) */
351: typedef struct OpenCLEnvironment_s
352: {
353:   cl_platform_id    pf_id;
354:   cl_device_id      dev_id;
355:   cl_context        ctx_id;
356:   cl_command_queue  queue_id;
357: } OpenCLEnvironment;

359: // Calculate a conforming thread grid for N kernels
362: PetscErrorCode initializeOpenCL(OpenCLEnvironment * ocl_env)
363: {
364:   cl_uint            num_platforms;
365:   cl_platform_id     platform_ids[42];
366:   cl_uint            num_devices;
367:   cl_device_id       device_ids[42];
368:   cl_int             ierr;

371:   /* Init Platform */
372:   clGetPlatformIDs(42, platform_ids, &num_platforms);
373:   if (num_platforms == 0) {
374:     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "No OpenCL platform found.");
375:   }
376:   ocl_env->pf_id = platform_ids[0];

378:   /* Init Device */
379:   clGetDeviceIDs(ocl_env->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);
380:   if (num_platforms == 0) {
381:     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "No OpenCL device found.");
382:   }
383:   ocl_env->dev_id = device_ids[0];

385:   /* Create context with one command queue */
386:   ocl_env->ctx_id   = clCreateContext(0, 1, &(device_ids[0]), NULL, NULL, &ierr);
387:   ocl_env->queue_id = clCreateCommandQueue(ocl_env->ctx_id, ocl_env->dev_id, CL_QUEUE_PROFILING_ENABLE, &ierr);
388:   return(0);
389: }

393: PetscErrorCode destroyOpenCL(OpenCLEnvironment * ocl_env)
394: {
395:   cl_int             ierr;

398:   clReleaseCommandQueue(ocl_env->queue_id);
399:   ocl_env->queue_id = 0;

401:   clReleaseContext(ocl_env->ctx_id);
402:   ocl_env->ctx_id = 0;
403:   return(0);
404: }

406: // Calculate a conforming thread grid for N kernels
409: PetscErrorCode calculateGridOpenCL(const int N, const int blockSize, unsigned int * x, unsigned int * y, unsigned int * z)
410: {
412:   *z = 1;
413:   if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
414:   const int Nblocks = N/blockSize;
415:   for (*x = (int) (sqrt(Nblocks) + 0.5); *x > 0; --*x) {
416:     *y = Nblocks / *x;
417:     if (*x * *y == Nblocks) break;
418:   }
419:   if (*x * *y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
420:   return(0);
421: }

425: /*
426:   IntegrateElementBatchOpenCL - Produces element vectors from input element solution and geometric information via quadrature

428:   Input Parameters:
429: + Ne - The total number of cells, Nchunk * Ncb * Nbc
430: . Ncb - The number of serial cell batches
431: . Nbc - The number of cells per batch
432: . Nbl - The number of concurrent cells blocks per thread block
433: . coefficients - An array of the solution vector for each cell
434: . jacobianInverses - An array of the inverse Jacobian for each cell
435: . jacobianDeterminants - An array of the Jacobian determinant for each cell
436: . event - A PetscEvent, used to log flops
437: - debug - A flag for debugging information

439:   Output Parameter:
440: . elemVec - An array of the element vectors for each cell
441: */
442: PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt spatial_dim, PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt N_bl, const PetscScalar coefficients[],
443:                                                      const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[],
444:                                                      PetscLogEvent event, PetscInt debug, PetscInt pde_op)
445: {
446:   const cl_int numQuadraturePoints_0 = 1;

448:   const cl_int numBasisFunctions_0 = 3;
449:   const cl_int numBasisComponents_0 = (pde_op == LAPLACIAN) ? 1 : spatial_dim;

451:   const cl_int dim    = spatial_dim;
452:   const cl_int N_b    = numBasisFunctions_0;   /* The number of basis functions */
453:   const cl_int N_comp = numBasisComponents_0;  /* The number of basis function components */
454:   const cl_int N_bt   = N_b*N_comp;            /* The total number of scalar basis functions */
455:   const cl_int N_q    = numQuadraturePoints_0; /* The number of quadrature points */
456:   const cl_int N_bst  = N_bt*N_q;              /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
457:   const cl_int N_t    = N_bst*N_bl;            /* The number of threads, N_bst * N_bl */

459:   char            *program_buffer;
460:   char            build_buffer[8192];
461:   cl_build_status status;

463:   cl_event          ocl_ev;         /* The event for tracking kernel execution */
464:   cl_ulong          ns_start;       /* Nanoseconds counter on GPU at kernel start */
465:   cl_ulong          ns_end;         /* Nanoseconds counter on GPU at kernel stop */

467:   cl_mem            d_coefficients;
468:   cl_mem            d_jacobianInverses;
469:   cl_mem            d_jacobianDeterminants;
470:   cl_mem            d_elemVec;

472:   OpenCLEnvironment ocl_env;
473:   cl_program        ocl_prog;
474:   cl_kernel         ocl_kernel;
475:   size_t            ocl_source_length;
476:   size_t            local_work_size[3];
477:   size_t            global_work_size[3];
478:   size_t            i;
479:   unsigned int      x, y, z;
480:   PetscErrorCode    ierr;
481:   cl_int            ierr2;


485:   initializeOpenCL(&ocl_env);
486:   PetscMalloc1(8192, &program_buffer);
487:   generateOpenCLSource(&program_buffer, 8192, dim, N_bl, pde_op);
488:   ocl_source_length = strlen(program_buffer);
489:   ocl_prog = clCreateProgramWithSource(ocl_env.ctx_id, 1, (const char**)&program_buffer, &ocl_source_length, &ierr2);CHKERRQ(ierr2);
490:   clBuildProgram(ocl_prog, 0, NULL, NULL, NULL, NULL);
491:   if (ierr != CL_SUCCESS) {
492:     clGetProgramBuildInfo(ocl_prog, ocl_env.dev_id, CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &build_buffer, NULL);
493:     printf("Build failed! Log:\n %s", build_buffer);
494:   }
495: 
496:   PetscFree(program_buffer);

498:   ocl_kernel = clCreateKernel(ocl_prog, "integrateElementQuadrature", &ierr);

500:   if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
501:   if (!Ne) {
502:     PetscStageLog     stageLog;
503:     PetscEventPerfLog eventLog = NULL;
504:     PetscInt          stage;

506:     PetscLogGetStageLog(&stageLog);
507:     PetscStageLogGetCurrent(stageLog, &stage);
508:     PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
509:     /* Log performance info */
510:     eventLog->eventInfo[event].count++;
511:     eventLog->eventInfo[event].time  += 0.0;
512:     eventLog->eventInfo[event].flops += 0;
513:     return(0);
514:   }

516:   /* Create buffers on the device and send data over */
517:   d_coefficients         = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*N_bt    * sizeof(PetscReal), (void*)coefficients,         &ierr);
518:   d_jacobianInverses     = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * sizeof(PetscReal), (void*)jacobianInverses,     &ierr);
519:   d_jacobianDeterminants = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne         * sizeof(PetscReal), (void*)jacobianDeterminants, &ierr);
520:   d_elemVec              = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE,                        Ne*N_bt    * sizeof(PetscReal), NULL,                        &ierr);

522:   /* Work size preparations */
523:   calculateGridOpenCL(Ne, Ncb*Nbc, &x, &y, &z);
524:   local_work_size[0] = Nbc*N_comp;
525:   local_work_size[1] = 1;
526:   local_work_size[2] = 1;
527:   global_work_size[0] = x * local_work_size[0];
528:   global_work_size[1] = y * local_work_size[1];
529:   global_work_size[2] = z * local_work_size[2];

531:   /* if (debug) { */
532:   PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n",
533:                      x, y, z,
534:                      local_work_size[0], local_work_size[1], local_work_size[2], Ncb);
535:   PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
536:   /* } */

538:   /* Kernel launch */
539:   /* integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec); */
540:   clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*)&Ncb);
541:   clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*)&d_coefficients);
542:   clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*)&d_jacobianInverses);
543:   clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*)&d_jacobianDeterminants);
544:   clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*)&d_elemVec);

546:   clEnqueueNDRangeKernel(ocl_env.queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);

548:   /* Read data back from device */
549:   clEnqueueReadBuffer(ocl_env.queue_id, d_elemVec, CL_TRUE, 0, Ne*N_bt * sizeof(PetscReal), elemVec, 0, NULL, NULL);

551:   {
552:     PetscStageLog     stageLog;
553:     PetscEventPerfLog eventLog = NULL;
554:     PetscInt          stage;

556:     PetscLogGetStageLog(&stageLog);
557:     PetscStageLogGetCurrent(stageLog, &stage);
558:     PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
559:     /* Log performance info */
560:     clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);
561:     clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ns_end,   NULL);
562:     eventLog->eventInfo[event].count++;
563:     eventLog->eventInfo[event].time  += (ns_end - ns_start)*1.0e-9;
564:     eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
565:   }

567:   /* We are done, clean up */
568:   clReleaseMemObject(d_coefficients);
569:   clReleaseMemObject(d_jacobianInverses);
570:   clReleaseMemObject(d_jacobianDeterminants);
571:   clReleaseMemObject(d_elemVec);
572:   clReleaseKernel(ocl_kernel);
573:   clReleaseProgram(ocl_prog);
574:   destroyOpenCL(&ocl_env);
575:   return(0);
576: }