Actual source code: ex52_integrateElementOpenCL.c
petsc-dev 2014-02-02
1: #include <petscsys.h>
2: #include <stdlib.h>
4: #ifdef __APPLE__
5: #include <OpenCL/cl.h>
6: #else
7: #include <CL/cl.h>
8: #endif
10: //#define SPATIAL_DIM_0 2
/* Selector for the PDE operator; the pde_op arguments in this file are compared against these values */
typedef enum {
  LAPLACIAN = 0,
  ELASTICITY
} OpType;
14: /* Put the OpenCL program into a source string.
15: * This allows to generate all the code at runtime, no need for external Python magic as for CUDA
16: *
17: * The code uses snprintf() to concatenate strings, as this is safer than strcat().
18: */
21: PetscErrorCode generateOpenCLSource(char **string_buffer, PetscInt buffer_length, PetscInt spatial_dim, PetscInt N_bl, PetscInt pde_op)
22: {
23: char *string_tail = *string_buffer;
24: char *end_of_buffer = *string_buffer + buffer_length;
25: PetscInt num_quadrature_points = 1;
26: PetscInt num_basis_components = (pde_op == LAPLACIAN) ? 1 : spatial_dim;
27: PetscInt num_basis_functions = 3;
28: PetscInt num_threads = num_basis_functions * num_basis_components * num_quadrature_points * N_bl; /* N_t */
30: /* dim Number of spatial dimensions: 2 */
31: /* N_b Number of basis functions: generated */
32: /* N_{bt} Number of total basis functions: N_b * N_{comp} */
33: /* N_q Number of quadrature points: generated */
34: /* N_{bs} Number of block cells LCM(N_b, N_q) */
35: /* N_{bst} Number of block cell components LCM(N_{bt}, N_q) */
36: /* N_{bl} Number of concurrent blocks generated */
37: /* N_t Number of threads: N_{bl} * N_{bs} */
38: /* N_{cbc} Number of concurrent basis cells: N_{bl} * N_q */
39: /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b */
40: /* N_{sbc} Number of serial basis cells: N_{bs} / N_q */
41: /* N_{sqc} Number of serial quadrature cells: N_{bs} / N_b */
42: /* N_{cb} Number of serial cell batches: input */
43: /* N_c Number of total cells: N_{cb}*N_{t}/N_{comp} */
45: #define STRING_ERROR_CHECK(MSG) \
46: if (string_tail == end_of_buffer) {\
47: SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, MSG);\
48: }
51: char float_str[] = "float";
52: char double_str[] = "double";
53: char *numeric_str = &(float_str[0]);
55: /* Enable device extension for double precision */
56: if (sizeof(PetscReal) == sizeof(double)) {
57: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
58: "#if defined(cl_khr_fp64)\n"
59: "# pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
60: "#elif defined(cl_amd_fp64)\n"
61: "# pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
62: "#endif\n");
63: numeric_str = &(double_str[0]);
64: }
66: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
67: "\n"
68: "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
69: "{\n", numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short");
71: if (spatial_dim == 2) {
72: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
73: " const int numQuadraturePoints_0 = %d;\n"
74: "\n"
75: " /* Quadrature points\n"
76: " - (x1,y1,x2,y2,...) */\n"
77: " const %s points_0[2] = {\n"
78: " -0.333333333333,\n"
79: " -0.333333333333};\n"
80: "\n"
81: " /* Quadrature weights\n"
82: " - (v1,v2,...) */\n"
83: " const %s weights_0[1] = {2.0};\n"
84: "\n"
85: " const int numBasisFunctions_0 = %d;\n"
86: " const int numBasisComponents_0 = %d;\n", num_quadrature_points, numeric_str, numeric_str, num_basis_functions, num_basis_components);STRING_ERROR_CHECK("Message to short");
88: if (pde_op == LAPLACIAN) {
89: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
90: "\n"
91: " /* Nodal basis function evaluations\n"
92: " - basis function is fastest varying, then point */\n"
93: " const %s Basis_0[3] = {\n"
94: " 0.333333333333,\n"
95: " 0.333333333333,\n"
96: " 0.333333333333};\n"
97: "\n"
98: " /* Nodal basis function derivative evaluations,\n"
99: " - derivative direction fastest varying, then basis function, then point */\n"
100: " const %s2 BasisDerivatives_0[3] = {\n"
101: " (%s2)(-0.5, -0.5),\n"
102: " (%s2)(0.5, 0.0),\n"
103: " (%s2)(0.0, 0.5)};\n"
104: "\n", numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short");
105: } else if (pde_op == ELASTICITY) {
106: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
107: "\n"
108: " /* Nodal basis function evaluations\n"
109: " - basis function is fastest varying, then point */\n"
110: " const %s Basis_0[6] = {\n"
111: " 0.333333333333,\n"
112: " 0.333333333333,\n"
113: " 0.333333333333,\n"
114: " 0.333333333333,\n"
115: " 0.333333333333,\n"
116: " 0.333333333333};\n"
117: "\n"
118: " /* Nodal basis function derivative evaluations,\n"
119: " - derivative direction fastest varying, then basis function, then point */\n"
120: " const %s2 BasisDerivatives_0[6] = {\n"
121: " (%s2)(-0.5, -0.5),\n"
122: " (%s2)(-0.5, -0.5),\n"
123: " (%s2)(0.5, 0.0),\n"
124: " (%s2)(0.5, 0.0),\n"
125: " (%s2)(0.0, 0.5),\n"
126: " (%s2)(0.0, 0.5)};\n"
127: "\n", numeric_str, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);STRING_ERROR_CHECK("Message to short");
128: }
129: } else if (spatial_dim == 3) {
130: }
132: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
133: " /* Number of concurrent blocks */\n"
134: " const int N_bl = %d;\n"
135: "\n"
136: /* Argument */
137: " const int dim = %d;\n"
138: /* Argument */
139: " const int N_b = numBasisFunctions_0; // The number of basis functions\n"
140: " const int N_comp = numBasisComponents_0; // The number of basis function components\n"
141: " const int N_bt = N_b*N_comp; // The total number of scalar basis functions\n"
142: " const int N_q = numQuadraturePoints_0; // The number of quadrature points\n"
143: " const int N_bst = N_bt*N_q; // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
144: " const int N_t = N_bst*N_bl; // The number of threads, N_bst * N_bl\n"
145: " const int N_bc = N_t/N_comp; // The number of cells per batch (N_b*N_q*N_bl)\n"
146: " const int N_c = N_cb * N_bc;\n"
147: " const int N_sbc = N_bst / (N_q * N_comp);\n"
148: " const int N_sqc = N_bst / N_bt;\n"
149: "\n"
150: " /* Calculated indices */\n"
151: " const int tidx = get_local_id(0) + get_local_size(0)*get_local_id(1);\n"
152: " const int blidx = tidx / N_bst; // Block number for this thread\n"
153: " const int bidx = tidx %% N_bt; // Basis function mapped to this thread\n"
154: " const int cidx = tidx %% N_comp; // Basis component mapped to this thread\n"
155: " const int qidx = tidx %% N_q; // Quadrature point mapped to this thread\n"
156: " const int blbidx = tidx %% N_q + blidx*N_q; // Cell mapped to this thread in the basis phase\n"
157: " const int blqidx = tidx %% N_b + blidx*N_b; // Cell mapped to this thread in the quadrature phase\n"
158: " const int gidx = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
159: " const int Goffset = gidx*N_c;\n"
160: " const int Coffset = gidx*N_c*N_bt;\n"
161: " const int Eoffset = gidx*N_c*N_bt;\n", N_bl, spatial_dim);STRING_ERROR_CHECK("Message to short");
163: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
164: "\n"
165: " /* Quadrature data */\n"
166: " %s w; // $w_q$, Quadrature weight at $x_q$\n"
167: " __local %s%d phiDer_i[%d]; //[N_bt*N_q]; // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
168: " /* Geometric data */\n"
169: " __local %s detJ[%d]; //[N_t]; // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
170: " __local %s invJ[%d];//[N_t*dim*dim]; // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n"
171: " /* FEM data */\n"
172: " __local %s u_i[%d]; //[N_t*N_bt]; // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n"
173: " /* Intermediate calculations */\n"
174: " __local %s%d f_1[%d]; //[N_t*N_sqc]; // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n"
175: " /* Output data */\n"
176: " %s e_i; // Coefficient $e_i$ of the residual\n"
177: "\n", numeric_str,
178: numeric_str, spatial_dim,
179: num_basis_functions * num_basis_components * num_quadrature_points, /* size of PhiDer_i */
180: numeric_str, num_threads, /* size of detJ */
181: numeric_str, num_threads * spatial_dim * spatial_dim, /* size of invJ */
182: numeric_str, num_threads * num_basis_functions * num_basis_components, /* size of u_i */
183: numeric_str, spatial_dim, num_threads * num_quadrature_points /* size of f_1 */,
184: numeric_str);STRING_ERROR_CHECK("Message to short");
186: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
187: " /* These should be generated inline */\n"
188: " /* Load quadrature weights */\n"
189: " w = weights_0[qidx];\n"
190: " /* Load basis tabulation \\phi_i for this cell */\n"
191: " if (tidx < N_bt*N_q) {\n"
192: " // phi_i[tidx] = Basis_0[tidx];\n"
193: " phiDer_i[tidx] = BasisDerivatives_0[tidx];\n"
194: " }\n"
195: "\n"
196: " for (int batch = 0; batch < N_cb; ++batch) {\n"
197: " /* Load geometry */\n"
198: " detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
199: " for (int n = 0; n < dim*dim; ++n) {\n"
200: " const int offset = n*N_t;\n"
201: " invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
202: " }\n"
203: " /* Load coefficients u_i for this cell */\n"
204: " for (int n = 0; n < N_bt; ++n) {\n"
205: " const int offset = n*N_t;\n"
206: " u_i[offset+tidx] = coefficients[Coffset+batch*N_t*N_b+offset+tidx];\n"
207: " }\n"
208: "\n"
209: " /* Map coefficients to values at quadrature points */\n"
210: " for (int c = 0; c < N_sqc; ++c) {\n"
211: " %s u[%d]; //[N_comp]; // $u(x_q)$, Value of the field at $x_q$\n"
212: " %s%d gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n"
213: " const int cell = c*N_bl*N_b + blqidx;\n"
214: " const int fidx = (cell*N_q + qidx)*N_comp + cidx;\n"
215: "\n"
216: " for (int comp = 0; comp < N_comp; ++comp) {\n"
217: " gradU[comp].x = 0.0; gradU[comp].y = 0.0;", numeric_str, num_basis_components, numeric_str, spatial_dim, num_basis_components);STRING_ERROR_CHECK("Message to short");
219: if (spatial_dim == 3) {
220: string_tail += snprintf(string_tail, end_of_buffer - string_tail, " gradU[comp].z = 0.0;");
221: if (string_tail == end_of_buffer) {
222: SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "String too short!");
223: }
224: }
226: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
227: "\n"
228: " }\n"
229: " /* Get field and derivatives at this quadrature point */\n"
230: " for (int i = 0; i < N_b; ++i) {\n"
231: " for (int comp = 0; comp < N_comp; ++comp) {\n"
232: " const int b = i*N_comp+comp;\n"
233: " const int pidx = qidx*N_bt + b;\n"
234: " const int uidx = cell*N_bt + b;\n"
235: " %s%d realSpaceDer;\n"
236: "\n", numeric_str, spatial_dim);STRING_ERROR_CHECK("Message to short");
238: if (spatial_dim == 2) {
239: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
240: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
241: " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
242: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
243: " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n");STRING_ERROR_CHECK("Message to short");
244: } else {
245: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
246: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
247: " gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
248: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
249: " gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
250: " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
251: " gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n");STRING_ERROR_CHECK("Message to short");
252: }
254: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
255: " }\n"
256: " }\n"
257: " /* Process values at quadrature points */\n");STRING_ERROR_CHECK("Message to short");
259: /* Process values at quadrature points as induced by the PDE operator */
260: if (pde_op == LAPLACIAN) {
261: string_tail += snprintf(string_tail, end_of_buffer - string_tail, " f_1[fidx] = gradU[cidx];\n");STRING_ERROR_CHECK("Message to short");
262: } else if (spatial_dim == 2 && pde_op == ELASTICITY) {
263: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
264: " switch (cidx) {\n"
265: " case 0:\n"
266: " f_1[fidx].x = 0.5*(gradU[0].x + gradU[0].x);\n"
267: " f_1[fidx].y = 0.5*(gradU[0].y + gradU[1].x);\n"
268: " break;\n"
269: " case 1:\n"
270: " f_1[fidx].x = 0.5*(gradU[1].x + gradU[0].y);\n"
271: " f_1[fidx].y = 0.5*(gradU[1].y + gradU[1].y);\n"
272: " }\n");STRING_ERROR_CHECK("Message to short");
273: } else if (spatial_dim == 3 && pde_op == ELASTICITY) {
274: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
275: " switch (cidx) {\n"
276: " case 0:\n"
277: " f_1[fidx].x = 0.5*(gradU[0].x + gradU[0].x);\n"
278: " f_1[fidx].y = 0.5*(gradU[0].y + gradU[1].x);\n"
279: " f_1[fidx].z = 0.5*(gradU[0].z + gradU[2].x);\n"
280: " break;\n"
281: " case 1:\n"
282: " f_1[fidx].x = 0.5*(gradU[1].x + gradU[0].y);\n"
283: " f_1[fidx].y = 0.5*(gradU[1].y + gradU[1].y);\n"
284: " f_1[fidx].z = 0.5*(gradU[1].y + gradU[2].y);\n"
285: " break;\n"
286: " case 2:\n"
287: " f_1[fidx].x = 0.5*(gradU[2].x + gradU[0].z);\n"
288: " f_1[fidx].y = 0.5*(gradU[2].y + gradU[1].z);\n"
289: " f_1[fidx].z = 0.5*(gradU[2].y + gradU[2].z);\n"
290: " }\n");STRING_ERROR_CHECK("Message to short");
291: } else {
292: SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "Combination of spatial dimension and PDE operator invalid");
293: }
295: if (spatial_dim == 2) {
296: string_tail += snprintf(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; \n");STRING_ERROR_CHECK("Message to short");
297: } else if (spatial_dim == 2) {
298: string_tail += snprintf(string_tail, end_of_buffer - string_tail, " f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n");STRING_ERROR_CHECK("Message to short");
299: }
301: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
302: "\n"
303: " }\n"
304: "\n"
305: " /* ==== TRANSPOSE THREADS ==== */\n"
306: " barrier(CLK_GLOBAL_MEM_FENCE);\n"
307: "\n"
308: " /* Map values at quadrature points to coefficients */\n"
309: " for (int c = 0; c < N_sbc; ++c) {\n"
310: " const int cell = c*N_bl*N_q + blbidx;\n"
311: "\n"
312: " e_i = 0.0;\n"
313: " for (int q = 0; q < N_q; ++q) {\n"
314: " const int pidx = q*N_bt + bidx;\n"
315: " const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
316: " %s%d realSpaceDer;\n"
317: "\n"
318: " // e_i += phi_i[pidx]*f_0[fidx];\n", numeric_str, spatial_dim);STRING_ERROR_CHECK("Message to short");
320: if (spatial_dim == 2) {
321: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
322: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
323: " e_i += realSpaceDer.x*f_1[fidx].x;\n"
324: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
325: " e_i += realSpaceDer.y*f_1[fidx].y;\n");STRING_ERROR_CHECK("Message to short");
326: } else {
327: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
328: " realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
329: " e_i += realSpaceDer.x*f_1[fidx].x;\n"
330: " realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
331: " e_i += realSpaceDer.y*f_1[fidx].y;\n"
332: " realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
333: " e_i += realSpaceDer.z*f_1[fidx].z;\n");STRING_ERROR_CHECK("Message to short");
334: }
336: string_tail += snprintf(string_tail, end_of_buffer - string_tail,
337: " }\n"
338: " /* Write element vector for N_{cbc} cells at a time */\n"
339: " elemVec[Eoffset+(batch*N_sbc+c)*N_t+tidx] = e_i;\n"
340: " }\n"
341: " /* ==== Could do one write per batch ==== */\n"
342: " }\n"
343: " return;\n"
344: "} \n");STRING_ERROR_CHECK("Message to short");
346: return(0);
347: }
350: /* Struct collecting information for a typical OpenCL environment (one platform, one device, one context, one queue) */
351: typedef struct OpenCLEnvironment_s
352: {
353: cl_platform_id pf_id;
354: cl_device_id dev_id;
355: cl_context ctx_id;
356: cl_command_queue queue_id;
357: } OpenCLEnvironment;
359: // Calculate a conforming thread grid for N kernels
362: PetscErrorCode initializeOpenCL(OpenCLEnvironment * ocl_env)
363: {
364: cl_uint num_platforms;
365: cl_platform_id platform_ids[42];
366: cl_uint num_devices;
367: cl_device_id device_ids[42];
368: cl_int ierr;
371: /* Init Platform */
372: clGetPlatformIDs(42, platform_ids, &num_platforms);
373: if (num_platforms == 0) {
374: SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "No OpenCL platform found.");
375: }
376: ocl_env->pf_id = platform_ids[0];
378: /* Init Device */
379: clGetDeviceIDs(ocl_env->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);
380: if (num_platforms == 0) {
381: SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_SUP, "No OpenCL device found.");
382: }
383: ocl_env->dev_id = device_ids[0];
385: /* Create context with one command queue */
386: ocl_env->ctx_id = clCreateContext(0, 1, &(device_ids[0]), NULL, NULL, &ierr);
387: ocl_env->queue_id = clCreateCommandQueue(ocl_env->ctx_id, ocl_env->dev_id, CL_QUEUE_PROFILING_ENABLE, &ierr);
388: return(0);
389: }
393: PetscErrorCode destroyOpenCL(OpenCLEnvironment * ocl_env)
394: {
395: cl_int ierr;
398: clReleaseCommandQueue(ocl_env->queue_id);
399: ocl_env->queue_id = 0;
401: clReleaseContext(ocl_env->ctx_id);
402: ocl_env->ctx_id = 0;
403: return(0);
404: }
406: // Calculate a conforming thread grid for N kernels
409: PetscErrorCode calculateGridOpenCL(const int N, const int blockSize, unsigned int * x, unsigned int * y, unsigned int * z)
410: {
412: *z = 1;
413: if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
414: const int Nblocks = N/blockSize;
415: for (*x = (int) (sqrt(Nblocks) + 0.5); *x > 0; --*x) {
416: *y = Nblocks / *x;
417: if (*x * *y == Nblocks) break;
418: }
419: if (*x * *y != Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %d with block size %d", N, blockSize);
420: return(0);
421: }
425: /*
426: IntegrateElementBatchOpenCL - Produces element vectors from input element solution and geometric information via quadrature
428: Input Parameters:
429: + Ne - The total number of cells, Nchunk * Ncb * Nbc
430: . Ncb - The number of serial cell batches
431: . Nbc - The number of cells per batch
432: . Nbl - The number of concurrent cells blocks per thread block
433: . coefficients - An array of the solution vector for each cell
434: . jacobianInverses - An array of the inverse Jacobian for each cell
435: . jacobianDeterminants - An array of the Jacobian determinant for each cell
436: . event - A PetscEvent, used to log flops
437: - debug - A flag for debugging information
439: Output Parameter:
440: . elemVec - An array of the element vectors for each cell
441: */
442: PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt spatial_dim, PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt N_bl, const PetscScalar coefficients[],
443: const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[],
444: PetscLogEvent event, PetscInt debug, PetscInt pde_op)
445: {
446: const cl_int numQuadraturePoints_0 = 1;
448: const cl_int numBasisFunctions_0 = 3;
449: const cl_int numBasisComponents_0 = (pde_op == LAPLACIAN) ? 1 : spatial_dim;
451: const cl_int dim = spatial_dim;
452: const cl_int N_b = numBasisFunctions_0; /* The number of basis functions */
453: const cl_int N_comp = numBasisComponents_0; /* The number of basis function components */
454: const cl_int N_bt = N_b*N_comp; /* The total number of scalar basis functions */
455: const cl_int N_q = numQuadraturePoints_0; /* The number of quadrature points */
456: const cl_int N_bst = N_bt*N_q; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
457: const cl_int N_t = N_bst*N_bl; /* The number of threads, N_bst * N_bl */
459: char *program_buffer;
460: char build_buffer[8192];
461: cl_build_status status;
463: cl_event ocl_ev; /* The event for tracking kernel execution */
464: cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */
465: cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */
467: cl_mem d_coefficients;
468: cl_mem d_jacobianInverses;
469: cl_mem d_jacobianDeterminants;
470: cl_mem d_elemVec;
472: OpenCLEnvironment ocl_env;
473: cl_program ocl_prog;
474: cl_kernel ocl_kernel;
475: size_t ocl_source_length;
476: size_t local_work_size[3];
477: size_t global_work_size[3];
478: size_t i;
479: unsigned int x, y, z;
480: PetscErrorCode ierr;
481: cl_int ierr2;
485: initializeOpenCL(&ocl_env);
486: PetscMalloc1(8192, &program_buffer);
487: generateOpenCLSource(&program_buffer, 8192, dim, N_bl, pde_op);
488: ocl_source_length = strlen(program_buffer);
489: ocl_prog = clCreateProgramWithSource(ocl_env.ctx_id, 1, (const char**)&program_buffer, &ocl_source_length, &ierr2);CHKERRQ(ierr2);
490: clBuildProgram(ocl_prog, 0, NULL, NULL, NULL, NULL);
491: if (ierr != CL_SUCCESS) {
492: clGetProgramBuildInfo(ocl_prog, ocl_env.dev_id, CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &build_buffer, NULL);
493: printf("Build failed! Log:\n %s", build_buffer);
494: }
495:
496: PetscFree(program_buffer);
498: ocl_kernel = clCreateKernel(ocl_prog, "integrateElementQuadrature", &ierr);
500: if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
501: if (!Ne) {
502: PetscStageLog stageLog;
503: PetscEventPerfLog eventLog = NULL;
504: PetscInt stage;
506: PetscLogGetStageLog(&stageLog);
507: PetscStageLogGetCurrent(stageLog, &stage);
508: PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
509: /* Log performance info */
510: eventLog->eventInfo[event].count++;
511: eventLog->eventInfo[event].time += 0.0;
512: eventLog->eventInfo[event].flops += 0;
513: return(0);
514: }
516: /* Create buffers on the device and send data over */
517: d_coefficients = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*N_bt * sizeof(PetscReal), (void*)coefficients, &ierr);
518: d_jacobianInverses = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * sizeof(PetscReal), (void*)jacobianInverses, &ierr);
519: d_jacobianDeterminants = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne * sizeof(PetscReal), (void*)jacobianDeterminants, &ierr);
520: d_elemVec = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE, Ne*N_bt * sizeof(PetscReal), NULL, &ierr);
522: /* Work size preparations */
523: calculateGridOpenCL(Ne, Ncb*Nbc, &x, &y, &z);
524: local_work_size[0] = Nbc*N_comp;
525: local_work_size[1] = 1;
526: local_work_size[2] = 1;
527: global_work_size[0] = x * local_work_size[0];
528: global_work_size[1] = y * local_work_size[1];
529: global_work_size[2] = z * local_work_size[2];
531: /* if (debug) { */
532: PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n",
533: x, y, z,
534: local_work_size[0], local_work_size[1], local_work_size[2], Ncb);
535: PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
536: /* } */
538: /* Kernel launch */
539: /* integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec); */
540: clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*)&Ncb);
541: clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*)&d_coefficients);
542: clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*)&d_jacobianInverses);
543: clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*)&d_jacobianDeterminants);
544: clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*)&d_elemVec);
546: clEnqueueNDRangeKernel(ocl_env.queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);
548: /* Read data back from device */
549: clEnqueueReadBuffer(ocl_env.queue_id, d_elemVec, CL_TRUE, 0, Ne*N_bt * sizeof(PetscReal), elemVec, 0, NULL, NULL);
551: {
552: PetscStageLog stageLog;
553: PetscEventPerfLog eventLog = NULL;
554: PetscInt stage;
556: PetscLogGetStageLog(&stageLog);
557: PetscStageLogGetCurrent(stageLog, &stage);
558: PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
559: /* Log performance info */
560: clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);
561: clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);
562: eventLog->eventInfo[event].count++;
563: eventLog->eventInfo[event].time += (ns_end - ns_start)*1.0e-9;
564: eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
565: }
567: /* We are done, clean up */
568: clReleaseMemObject(d_coefficients);
569: clReleaseMemObject(d_jacobianInverses);
570: clReleaseMemObject(d_jacobianDeterminants);
571: clReleaseMemObject(d_elemVec);
572: clReleaseKernel(ocl_kernel);
573: clReleaseProgram(ocl_prog);
574: destroyOpenCL(&ocl_env);
575: return(0);
576: }