Actual source code: landaucu.cu

/*
  Implements the Landau kernel
*/
#include <petscconf.h>
#include <petsc/private/dmpleximpl.h>
#include <petsclandau.h>
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
#include <../src/mat/impls/aij/seq/aij.h>
#include <petscmat.h>
#include <petsccublas.h>

// hack to avoid configure problems in CI. Delete when resolved
#if !defined (PETSC_HAVE_CUDA_ATOMIC)
#define atomicAdd(e, f) (*e) += f
#endif
#define PETSC_DEVICE_FUNC_DECL __device__
#include "../land_tensors.h"
#include <petscaijdevice.h>

#define CHECK_LAUNCH_ERROR() \
  do { \
    /* Check synchronous errors, i.e. pre-launch */ \
    cudaError_t err = cudaGetLastError(); \
    if (cudaSuccess != err) { \
      SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
    } \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
    err = cudaDeviceSynchronize(); \
    if (cudaSuccess != err) { \
      SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
    } \
  } while (0)
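
/* Usage sketch (illustrative only): the macro is meant to be invoked immediately after a kernel
   launch inside a function returning PetscErrorCode, so that both synchronous launch errors and
   asynchronous kernel failures are reported as PETSc errors, e.g.

     some_kernel<<<grid,block,shmem>>>(args);   // hypothetical kernel
     CHECK_LAUNCH_ERROR();
*/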

PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps *maps, pointInterpolationP4est (*points)[LANDAU_MAX_Q_FACE], PetscInt Nf, PetscInt Nq)
{
  P4estVertexMaps h_maps;
  cudaError_t     cerr;

  h_maps.num_elements = maps->num_elements;
  h_maps.num_face     = maps->num_face;
  h_maps.num_reduced  = maps->num_reduced;
  h_maps.deviceType   = maps->deviceType;
  h_maps.Nf           = Nf;
  h_maps.Nq           = Nq;
  cerr = cudaMalloc((void **)&h_maps.c_maps, maps->num_reduced * sizeof *points);CHKERRCUDA(cerr);
  cerr = cudaMemcpy( h_maps.c_maps, maps->c_maps, maps->num_reduced * sizeof *points, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&h_maps.gIdx, maps->num_elements * sizeof *maps->gIdx);CHKERRCUDA(cerr);
  cerr = cudaMemcpy( h_maps.gIdx, maps->gIdx, maps->num_elements * sizeof *maps->gIdx, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&maps->data, sizeof(P4estVertexMaps));CHKERRCUDA(cerr);
  cerr = cudaMemcpy( maps->data, &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  return(0);
}

PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps *pMaps)
{
  P4estVertexMaps *d_maps = pMaps->data, h_maps;
  cudaError_t     cerr;

  cerr = cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = cudaFree(h_maps.c_maps);CHKERRCUDA(cerr);
  cerr = cudaFree(h_maps.gIdx);CHKERRCUDA(cerr);
  cerr = cudaFree(d_maps);CHKERRCUDA(cerr);
  return(0);
}
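
/* Illustrative pairing (hypothetical caller): LandauCUDACreateMatMaps() deep-copies the host maps
   into device memory and stores the device pointer in the struct's data field;
   LandauCUDADestroyMatMaps() must later be handed the same struct so it can free the nested device
   arrays and the device copy itself, e.g.

     ierr = LandauCUDACreateMatMaps(&maps, points, Nf, Nq);CHKERRQ(ierr);
     // ... kernels read the device-side P4estVertexMaps through maps.data ...
     ierr = LandauCUDADestroyMatMaps(&maps);CHKERRQ(ierr);
*/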

// The GPU Landau kernel
//
__global__
void landau_form_fdf(const PetscInt nip, const PetscInt dim, const PetscInt Nf, const PetscInt Nb, const PetscReal invJ_a[],
                     const PetscReal * const BB, const PetscReal * const DD, LandauIPReal *IPDataRaw, LandauIPReal d_f[], LandauIPReal d_dfdx[], LandauIPReal d_dfdy[],
#if LANDAU_DIM==3
                     LandauIPReal d_dfdz[],
#endif
                     PetscErrorCode *ierr) // output
{
  const PetscInt  Nq = blockDim.x, myelem = blockIdx.x;
  const PetscInt  myQi = threadIdx.x;
  const PetscInt  jpidx = myQi + myelem * Nq;
  const PetscReal *invJ = &invJ_a[jpidx*dim*dim];
  const PetscReal *Bq = &BB[myQi*Nb], *Dq = &DD[myQi*Nb*dim];
  // unpack IPData
  LandauIPReal    *IPData_coefs = &IPDataRaw[nip*(dim+1)];
  LandauIPReal    *coef = &IPData_coefs[myelem*Nb*Nf];
  PetscInt        f,d,b,e;
  PetscScalar     u_x[LANDAU_MAX_SPECIES][LANDAU_DIM];
  *ierr = 0;
  /* get f and df */
  for (f = 0; f < Nf; ++f) {
    PetscScalar refSpaceDer[LANDAU_DIM];
    d_f[jpidx + f*nip] = 0.0;
    for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
    for (b = 0; b < Nb; ++b) {
      const PetscInt cidx = b;
      d_f[jpidx + f*nip] += Bq[cidx]*coef[f*Nb+cidx];
      for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx*dim+d]*coef[f*Nb+cidx];
    }
    for (d = 0; d < dim; ++d) {
      for (e = 0, u_x[f][d] = 0.0; e < dim; ++e) {
        u_x[f][d] += invJ[e*dim+d]*refSpaceDer[e];
      }
    }
  }
  for (f = 0; f < Nf; f++) {
    d_dfdx[jpidx + f*nip] = PetscRealPart(u_x[f][0]);
    d_dfdy[jpidx + f*nip] = PetscRealPart(u_x[f][1]);
#if LANDAU_DIM==3
    d_dfdz[jpidx + f*nip] = PetscRealPart(u_x[f][2]);
#endif
  }
}
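
/* Launch sketch: one thread block per element with blockDim.x == Nq, mirroring the host-side launch
   in LandauCUDAJacobian() below (shown here for orientation only):

     dim3 dimBlock(Nq,1);
     landau_form_fdf<<<numGCells,dimBlock>>>(nip, dim, Nf, Nb, d_invJj, d_BB, d_DD, d_IPDataRaw,
                                             d_f, d_dfdx, d_dfdy, d_ierr);   // 2D build
*/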

__device__ void
landau_inner_integral_v2(const PetscInt myQi, const PetscInt jpidx, PetscInt nip, const PetscInt Nq, const PetscInt Nf, const PetscInt Nb,
                         const PetscInt dim, LandauIPReal *IPDataRaw, const PetscReal invJj[], const PetscReal nu_alpha[],
                         const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
                         const PetscReal * const BB, const PetscReal * const DD,
                         PetscScalar *elemMat, P4estVertexMaps *d_maps, PetscSplitCSRDataStructure *d_mat, // output
                         PetscScalar fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
                         PetscReal g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
                         PetscReal g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
                         PetscReal gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
                         PetscReal gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
                         PetscReal s_nu_alpha[],
                         PetscReal s_nu_beta[],
                         PetscReal s_invMass[],
                         PetscReal s_f[],
                         PetscReal s_dfx[],
                         PetscReal s_dfy[],
                         LandauIPReal d_f[], LandauIPReal d_dfdx[], LandauIPReal d_dfdy[], // global memory
#if LANDAU_DIM==3
                         PetscReal s_dfz[], LandauIPReal d_dfdz[],
#endif
                         PetscReal d_mass_w[], PetscReal shift,
                         PetscInt myelem, PetscErrorCode *ierr)
{
  int delta,d,f,g,d2,dp,d3,fieldA,ipidx_b,nip_pad = nip; // vectorization padding not supported
  *ierr = 0;
  if (!d_mass_w) { // get g2 & g3
    PetscReal    gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
    LandauIPData IPData;
    // create g2 & g3
    for (f=threadIdx.x; f<Nf; f+=blockDim.x) {
      for (d=0;d<dim;d++) { // clear accumulation data D & K
        gg2[d][myQi][f] = 0;
        for (d2=0;d2<dim;d2++) gg3[d][d2][myQi][f] = 0;
      }
    }
    if (threadIdx.y == 0) {
      for (int i = threadIdx.x; i < Nf; i += blockDim.x) {
        s_nu_alpha[i] = nu_alpha[i];
        s_nu_beta[i]  = nu_beta[i];
        s_invMass[i]  = invMass[i];
      }
    }
    for (d2 = 0; d2 < dim; d2++) {
      gg2_temp[d2] = 0;
      for (d3 = 0; d3 < dim; d3++) {
        gg3_temp[d2][d3] = 0;
      }
    }
    __syncthreads();
    // unpack IPData
    IPData.w = IPDataRaw;
    IPData.x = IPDataRaw + 1*nip_pad;
    IPData.y = IPDataRaw + 2*nip_pad;
    IPData.z = IPDataRaw + 3*nip_pad;

    for (ipidx_b = 0; ipidx_b < nip; ipidx_b += blockDim.x) {
      const PetscReal vj[3] = {IPData.x[jpidx], IPData.y[jpidx], IPData.z ? IPData.z[jpidx] : 0};
      int             ipidx = ipidx_b + threadIdx.x;

      __syncthreads();
      if (ipidx < nip) {
        for (fieldA = threadIdx.y; fieldA < Nf; fieldA += blockDim.y) {
          s_f  [fieldA*blockDim.x+threadIdx.x] = d_f[ipidx + fieldA*nip_pad];
          s_dfx[fieldA*blockDim.x+threadIdx.x] = d_dfdx[ipidx + fieldA*nip_pad];
          s_dfy[fieldA*blockDim.x+threadIdx.x] = d_dfdy[ipidx + fieldA*nip_pad];
#if LANDAU_DIM==3
          s_dfz[fieldA*blockDim.x+threadIdx.x] = d_dfdz[ipidx + fieldA*nip_pad];
#endif
        }
      }
      __syncthreads();
      if (ipidx < nip) {
        const PetscReal wi = IPData.w[ipidx], x = IPData.x[ipidx], y = IPData.y[ipidx];
        PetscReal       temp1[3] = {0, 0, 0}, temp2 = 0;
#if LANDAU_DIM==2
        PetscReal       Ud[2][2], Uk[2][2];
        LandauTensor2D(vj, x, y, Ud, Uk, (ipidx==jpidx) ? 0. : 1.);
#else
        PetscReal       U[3][3], z = IPData.z[ipidx];
        LandauTensor3D(vj, x, y, z, U, (ipidx==jpidx) ? 0. : 1.);
#endif
        for (fieldA = 0; fieldA < Nf; fieldA++) {
          temp1[0] += s_dfx[fieldA*blockDim.x+threadIdx.x]*s_nu_beta[fieldA]*s_invMass[fieldA];
          temp1[1] += s_dfy[fieldA*blockDim.x+threadIdx.x]*s_nu_beta[fieldA]*s_invMass[fieldA];
#if LANDAU_DIM==3
          temp1[2] += s_dfz[fieldA*blockDim.x+threadIdx.x]*s_nu_beta[fieldA]*s_invMass[fieldA];
#endif
          temp2    += s_f  [fieldA*blockDim.x+threadIdx.x]*s_nu_beta[fieldA];
        }
        temp1[0] *= wi;
        temp1[1] *= wi;
#if LANDAU_DIM==3
        temp1[2] *= wi;
#endif
        temp2 *= wi;
#if LANDAU_DIM==2
        for (d2 = 0; d2 < 2; d2++) {
          for (d3 = 0; d3 < 2; ++d3) {
            /* K = U * grad(f): g2=e: i,A */
            gg2_temp[d2] += Uk[d2][d3]*temp1[d3];
            /* D = -U * (I \kron (fx)): g3=f: i,j,A */
            gg3_temp[d2][d3] += Ud[d2][d3]*temp2;
          }
        }
#else
        for (d2 = 0; d2 < 3; ++d2) {
          for (d3 = 0; d3 < 3; ++d3) {
            /* K = U * grad(f): g2 = e: i,A */
            gg2_temp[d2] += U[d2][d3]*temp1[d3];
            /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
            gg3_temp[d2][d3] += U[d2][d3]*temp2;
          }
        }
#endif
      }
    } /* IPs */

    /* reduce gg temp sums across threads */
    for (delta = blockDim.x/2; delta > 0; delta /= 2) {
      for (d2 = 0; d2 < dim; d2++) {
        gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
        for (d3 = 0; d3 < dim; d3++) {
          gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
        }
      }
    }
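    // After this XOR butterfly, every thread along x holds the full gg2_temp/gg3_temp sums over all
    // integration points; blockDim.x is kept a power of two no larger than 16 by the host (see the
    // nnn computation in LandauCUDAJacobian below), so the shuffle stays within a single warp.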

    // add alpha and put in gg2/3
    for (fieldA = threadIdx.x; fieldA < Nf; fieldA += blockDim.x) {
      for (d2 = 0; d2 < dim; d2++) {
        gg2[d2][myQi][fieldA] += gg2_temp[d2]*s_nu_alpha[fieldA];
        for (d3 = 0; d3 < dim; d3++) {
          gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3]*s_nu_alpha[fieldA]*s_invMass[fieldA];
        }
      }
    }
    __syncthreads();

    /* add electric field term once per IP */
    for (fieldA = threadIdx.x; fieldA < Nf; fieldA += blockDim.x) {
      gg2[dim-1][myQi][fieldA] += Eq_m[fieldA];
    }
    __syncthreads();
    //printf("%d %d gg2[1][1]=%g\n",myelem,qj_start,gg2[1][dim-1]);
    /* Jacobian transform - g2 */
    for (fieldA = threadIdx.x; fieldA < Nf; fieldA += blockDim.x) {
      PetscReal wj = IPData.w[jpidx];
      for (d = 0; d < dim; ++d) {
        g2[d][myQi][fieldA] = 0.0;
        for (d2 = 0; d2 < dim; ++d2) {
          g2[d][myQi][fieldA] += invJj[d*dim+d2]*gg2[d2][myQi][fieldA];
          g3[d][d2][myQi][fieldA] = 0.0;
          for (d3 = 0; d3 < dim; ++d3) {
            for (dp = 0; dp < dim; ++dp) {
              g3[d][d2][myQi][fieldA] += invJj[d*dim + d3]*gg3[d3][dp][myQi][fieldA]*invJj[d2*dim + dp];
            }
          }
          g3[d][d2][myQi][fieldA] *= wj;
        }
        g2[d][myQi][fieldA] *= wj;
      }
    }
    __syncthreads(); // Synchronize (ensure all the data is available) and sum IP matrices
  } // !mass_w
  /* FE matrix construction */
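  /* Two output paths below: with GPU assembly (d_maps and fieldMats provided) the scaled values are
     added directly into the split-CSR device matrix via MatSetValuesDevice(); otherwise the dense
     element matrix elemMat is accumulated here and assembled later on the host with
     DMPlexMatSetClosure() (see LandauCUDAJacobian below). */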
  {
    int fieldA,d,qj,d2,q,idx,totDim=Nb*Nf;
    /* assemble */
    for (fieldA = 0; fieldA < Nf; fieldA++) {
      if (fieldMats) {
        for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
          for (g = threadIdx.x; g < Nb; g += blockDim.x) {
            fieldMats[f][g] = 0;
          }
        }
      }
      for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
        const PetscInt i = fieldA*Nb + f; /* Element matrix row */
        for (g = threadIdx.x; g < Nb; g += blockDim.x) {
          const PetscInt j    = fieldA*Nb + g; /* Element matrix column */
          const PetscInt fOff = i*totDim + j;
          PetscScalar    t    = elemMat ? elemMat[fOff] : fieldMats[f][g];
          for (qj = 0 ; qj < Nq ; qj++) {
            const PetscReal *BJq = &BB[qj*Nb], *DIq = &DD[qj*Nb*dim];
            if (!d_mass_w) {
              for (d = 0; d < dim; ++d) {
                t += DIq[f*dim+d]*g2[d][qj][fieldA]*BJq[g];
                for (d2 = 0; d2 < dim; ++d2) {
                  t += DIq[f*dim + d]*g3[d][d2][qj][fieldA]*DIq[g*dim + d2];
                }
              }
            } else {
              const PetscInt jpidx = qj + myelem * Nq;
              t += BJq[f] * d_mass_w[jpidx]*shift * BJq[g];
            }
          }
          if (elemMat) elemMat[fOff] = t;
          else fieldMats[f][g] = t;
        }
      }
      if (fieldMats) {
        PetscScalar vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
        PetscReal   row_scale[LANDAU_MAX_Q_FACE],col_scale[LANDAU_MAX_Q_FACE];
        PetscInt    nr,nc,rows0[LANDAU_MAX_Q_FACE],cols0[LANDAU_MAX_Q_FACE],rows[LANDAU_MAX_Q_FACE],cols[LANDAU_MAX_Q_FACE];
        const LandauIdx *const Idxs = &d_maps->gIdx[myelem][fieldA][0];
        for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
          idx = Idxs[f];
          if (idx >= 0) {
            nr           = 1;
            rows0[0]     = idx;
            row_scale[0] = 1.;
          } else {
            idx = -idx - 1;
            nr  = d_maps->num_face;
            for (q = 0; q < d_maps->num_face; q++) {
              rows0[q]     = d_maps->c_maps[idx][q].gid;
              row_scale[q] = d_maps->c_maps[idx][q].scale;
            }
          }
          for (g = threadIdx.x; g < Nb; g += blockDim.x) {
            idx = Idxs[g];
            if (idx >= 0) {
              nc           = 1;
              cols0[0]     = idx;
              col_scale[0] = 1.;
            } else {
              idx = -idx - 1;
              nc  = d_maps->num_face;
              for (q = 0; q < d_maps->num_face; q++) {
                cols0[q]     = d_maps->c_maps[idx][q].gid;
                col_scale[q] = d_maps->c_maps[idx][q].scale;
              }
            }
            for (q = 0; q < nr; q++) rows[q] = rows0[q];
            for (q = 0; q < nc; q++) cols[q] = cols0[q];
            for (q = 0; q < nr; q++) {
              for (d = 0; d < nc; d++) {
                vals[q*nc + d] = row_scale[q]*col_scale[d]*fieldMats[f][g];
              }
            }
            MatSetValuesDevice(d_mat,nr,rows,nc,cols,vals,ADD_VALUES,ierr);
            if (*ierr) return;
          }
        }
      }
    }
  }
}

//
// The GPU Landau kernel
//
__global__
void __launch_bounds__(256,1) landau_kernel_v2(const PetscInt nip, const PetscInt dim, const PetscInt totDim, const PetscInt Nf, const PetscInt Nb, const PetscReal invJj[],
                                               const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
                                               const PetscReal * const BB, const PetscReal * const DD, LandauIPReal *IPDataRaw,
                                               PetscScalar elemMats_out[], P4estVertexMaps *d_maps, PetscSplitCSRDataStructure *d_mat, LandauIPReal d_f[], LandauIPReal d_dfdx[], LandauIPReal d_dfdy[],
#if LANDAU_DIM==3
                                               LandauIPReal d_dfdz[],
#endif
                                               PetscReal d_mass_w[], PetscReal shift,
                                               PetscErrorCode *ierr)
{
  const PetscInt Nq = blockDim.y, myelem = blockIdx.x;
  extern __shared__ PetscReal smem[];
  int size = 0;
  PetscReal (*g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] = // shared mem not needed when mass_w
    (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
  size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
  PetscReal (*g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
    (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
  size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
  PetscReal (*gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
    (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
  size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
  PetscReal (*gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
    (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
  size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
  PetscReal *s_nu_alpha = &smem[size];
  size += LANDAU_MAX_SPECIES;
  PetscReal *s_nu_beta = &smem[size];
  size += LANDAU_MAX_SPECIES;
  PetscReal *s_invMass = &smem[size];
  size += LANDAU_MAX_SPECIES;
  PetscReal *s_f = &smem[size];
  size += blockDim.x*LANDAU_MAX_SPECIES;
  PetscReal *s_dfx = &smem[size];
  size += blockDim.x*LANDAU_MAX_SPECIES;
  PetscReal *s_dfy = &smem[size];
  size += blockDim.x*LANDAU_MAX_SPECIES;
#if LANDAU_DIM==3
  PetscReal *s_dfz = &smem[size];
  size += blockDim.x*LANDAU_MAX_SPECIES;
#endif
  PetscScalar (*fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ] = d_maps ?
    (PetscScalar (*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) &smem[size] : NULL;
  if (d_maps) size += LANDAU_MAX_NQ*LANDAU_MAX_NQ;
  const PetscInt myQi = threadIdx.y;
  const PetscInt jpidx = myQi + myelem * Nq;
  //const PetscInt subblocksz = nip/nSubBlks + !!(nip%nSubBlks), ip_start = mySubBlk*subblocksz, ip_end = (mySubBlk+1)*subblocksz > nip ? nip : (mySubBlk+1)*subblocksz; /* this could be wrong with very few global IPs */
  PetscScalar *elemMat = elemMats_out ? &elemMats_out[myelem*totDim*totDim] : NULL; /* my output */
  int tid = threadIdx.x + threadIdx.y*blockDim.x;
  const PetscReal *invJ = invJj ? &invJj[jpidx*dim*dim] : NULL;
  if (elemMat) for (int i = tid; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
  __syncthreads();

  landau_inner_integral_v2(myQi, jpidx, nip, Nq, Nf, Nb, dim, IPDataRaw, invJ, nu_alpha, nu_beta, invMass, Eq_m, BB, DD,
                           elemMat, d_maps, d_mat, *fieldMats, *g2, *g3, *gg2, *gg3, s_nu_alpha, s_nu_beta, s_invMass, s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
#if LANDAU_DIM==3
                           s_dfz, d_dfdz,
#endif
                           d_mass_w, shift,
                           myelem, ierr); /* compact */
}
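
/* Note: the shared-memory carve-up above must agree with the dynamic shared-memory size passed at
   launch; the host computes the same total in PetscReal units (the "ii" arithmetic in
   LandauCUDAJacobian below) and launches with <<<numGCells,dimBlock,ii*szf>>>. */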

PetscErrorCode LandauCUDAJacobian(DM plex, const PetscInt Nq, const PetscReal nu_alpha[],const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
                                  const LandauIPData *const IPData, const PetscReal invJj[], PetscReal *mass_w, PetscReal shift, const PetscLogEvent events[], Mat JacP)
{
  PetscErrorCode  ierr,*d_ierr;
  cudaError_t     cerr;
  PetscInt        ii,ej,*Nbf,Nb,nip_dim2,cStart,cEnd,Nf,dim,numGCells,totDim,nip,szf=sizeof(LandauIPReal),ipdatasz;
  PetscReal       *d_BB,*d_DD,*d_invJj=NULL,*d_nu_alpha,*d_nu_beta,*d_invMass,*d_Eq_m,*d_mass_w=NULL;
  PetscScalar     *d_elemMats=NULL;
  LandauIPReal    *d_f=NULL, *d_dfdx=NULL, *d_dfdy=NULL;
#if LANDAU_DIM==3
  PetscScalar     *d_dfdz=NULL;
#endif
  PetscTabulation *Tf;
  PetscDS         prob;
  PetscSection    section, globalSection;
  LandauIPReal    *d_IPDataRaw=NULL;
  LandauCtx       *ctx;
  PetscSplitCSRDataStructure *d_mat=NULL;
  P4estVertexMaps *h_maps, *d_maps=NULL;
  int             nnn = 256/Nq;

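  // Round the x-block size down to a power of two (by repeatedly clearing the lowest set bit) and
  // cap it at 16, so the __shfl_xor_sync() reduction in landau_inner_integral_v2() runs in one warp.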
  while (nnn & (nnn - 1)) nnn = nnn & (nnn - 1);
  if (nnn>16) nnn = 16;
  PetscLogEventBegin(events[3],0,0,0,0);
  DMGetDimension(plex, &dim);
  if (dim!=LANDAU_DIM) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM %D != dim %d",LANDAU_DIM,dim);
  DMPlexGetHeightStratum(plex,0,&cStart,&cEnd);
  numGCells = cEnd - cStart;
  nip = numGCells*Nq; /* length of inner global iteration */
  DMGetDS(plex, &prob);
  PetscDSGetNumFields(prob, &Nf);
  PetscDSGetDimensions(prob, &Nbf); Nb = Nbf[0];
  if (Nq != Nb) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Nq != Nb. %D %D",Nq,Nb);
  PetscDSGetTotalDimension(prob, &totDim);
  PetscDSGetTabulation(prob, &Tf);
  DMGetLocalSection(plex, &section);
  DMGetGlobalSection(plex, &globalSection);
  // create data
  cerr = cudaMalloc((void **)&d_BB, Nq*Nb*szf);CHKERRCUDA(cerr); // kernel input
  cerr = cudaMemcpy( d_BB, Tf[0]->T[0], Nq*Nb*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_DD, Nq*Nb*dim*szf);CHKERRCUDA(cerr); // kernel input
  cerr = cudaMemcpy( d_DD, Tf[0]->T[1], Nq*Nb*dim*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  nip_dim2 = Nq*numGCells*dim*dim;
  if (mass_w) {
    cerr = cudaMalloc((void **)&d_mass_w, nip*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMemcpy( d_mass_w, mass_w, nip*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  } else {
    ipdatasz = LandauGetIPDataSize(IPData);
    cerr = cudaMalloc((void **)&d_IPDataRaw,ipdatasz*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMemcpy(d_IPDataRaw, IPData->w, ipdatasz*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr); // assumes IPData starts with 'w'
    cerr = cudaMalloc((void **)&d_nu_alpha, Nf*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMalloc((void **)&d_nu_beta,  Nf*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMalloc((void **)&d_invMass,  Nf*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMalloc((void **)&d_Eq_m,     Nf*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMemcpy(d_nu_alpha, nu_alpha, Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_nu_beta,  nu_beta,  Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_invMass,  invMass,  Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_Eq_m,     Eq_m,     Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    // f and df
    cerr = cudaMalloc((void **)&d_f,    nip*Nf*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMalloc((void **)&d_dfdx, nip*Nf*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMalloc((void **)&d_dfdy, nip*Nf*szf);CHKERRCUDA(cerr); // kernel input
#if LANDAU_DIM==3
    cerr = cudaMalloc((void **)&d_dfdz, nip*Nf*szf);CHKERRCUDA(cerr); // kernel input
#endif
    // collect geometry
    cerr = cudaMalloc((void **)&d_invJj, nip_dim2*szf);CHKERRCUDA(cerr); // kernel input
    cerr = cudaMemcpy(d_invJj, invJj, nip_dim2*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }

  DMGetApplicationContext(plex, &ctx);
  if (!ctx) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "no context");
  if (ctx->gpu_assembly) {
    PetscContainer container;
    PetscObjectQuery((PetscObject) JacP, "assembly_maps", (PetscObject *) &container);
    if (container) { // not here first call
      PetscContainerGetPointer(container, (void **) &h_maps);
      if (h_maps->data) {
        d_maps = h_maps->data;
        if (!d_maps) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata");
      } else {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
      }
      // this does the setup the first time called
      MatCUSPARSEGetDeviceMatWrite(JacP,&d_mat);
    } else {
      cerr = cudaMalloc((void **)&d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar));CHKERRCUDA(cerr); // kernel output - first call is on CPU
    }
  } else {
    cerr = cudaMalloc((void **)&d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar));CHKERRCUDA(cerr); // kernel output - no GPU assembly
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogEventEnd(events[3],0,0,0,0);

  cerr = cudaMalloc((void **)&d_ierr, sizeof(ierr));CHKERRCUDA(cerr); // kernel input
  if (!mass_w) { // form f and df
    dim3 dimBlock(Nq,1);
    PetscLogEventBegin(events[8],0,0,0,0);
    PetscLogGpuTimeBegin();
    ii = 0;
    // PetscPrintf(PETSC_COMM_SELF, "numGCells=%d dim.x=%d Nq=%d nThreads=%d, %d kB shared mem\n",numGCells,n,Nq,Nq*n,ii*szf/1024);
    landau_form_fdf<<<numGCells,dimBlock,ii*szf>>>( nip, dim, Nf, Nb, d_invJj, d_BB, d_DD, d_IPDataRaw, d_f, d_dfdx, d_dfdy,
#if LANDAU_DIM==3
                                                    d_dfdz,
#endif
                                                    d_ierr);
    CHECK_LAUNCH_ERROR();
    PetscLogGpuFlops(nip*(PetscLogDouble)(2*Nb*(1+dim)));
    PetscLogGpuTimeEnd();
    cerr = cudaMemcpy(&ierr, d_ierr, sizeof(ierr), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);

    PetscLogEventEnd(events[8],0,0,0,0);
  }
  PetscLogEventBegin(events[4],0,0,0,0);
  {
    dim3 dimBlock(nnn,Nq);
    PetscLogGpuTimeBegin();
    PetscLogGpuFlops(nip*(PetscLogDouble)(mass_w ? (nip*(11*Nf+ 4*dim*dim) + 6*Nf*dim*dim*dim + 10*Nf*dim*dim + 4*Nf*dim + Nb*Nf*Nb*Nq*dim*dim*5) : Nb*Nf*Nb*Nq*4));
    ii  = 2*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM) + 3*LANDAU_MAX_SPECIES + (1+LANDAU_DIM)*dimBlock.x*LANDAU_MAX_SPECIES;
    ii += (LANDAU_MAX_NQ*LANDAU_MAX_NQ)*LANDAU_MAX_SPECIES;
    if (ii*szf >= 49152) {
      cerr = cudaFuncSetAttribute(landau_kernel_v2,
                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
                                  98304);CHKERRCUDA(cerr);
    }
    // PetscPrintf(PETSC_COMM_SELF, "numGCells=%d dim.x=%d Nq=%d nThreads=%d, %d kB shared mem\n",numGCells,n,Nq,Nq*n,ii*szf/1024);
    landau_kernel_v2<<<numGCells,dimBlock,ii*szf>>>(nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
                                                    d_BB, d_DD, d_IPDataRaw, d_elemMats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
#if LANDAU_DIM==3
                                                    d_dfdz,
#endif
                                                    d_mass_w, shift,
                                                    d_ierr);
    CHECK_LAUNCH_ERROR();
    PetscLogGpuTimeEnd();
    //cerr = cudaMemcpy(&ierr, d_ierr, sizeof(ierr), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    //
  }
  cerr = cudaFree(d_ierr);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogEventEnd(events[4],0,0,0,0);
  // delete device data
  PetscLogEventBegin(events[5],0,0,0,0);
  cerr = cudaFree(d_BB);CHKERRCUDA(cerr);
  cerr = cudaFree(d_DD);CHKERRCUDA(cerr);
  if (mass_w) {
    cerr = cudaFree(d_mass_w);CHKERRCUDA(cerr);
  } else {
    cerr = cudaFree(d_IPDataRaw);CHKERRCUDA(cerr);
    cerr = cudaFree(d_f);CHKERRCUDA(cerr);
    cerr = cudaFree(d_dfdx);CHKERRCUDA(cerr);
    cerr = cudaFree(d_dfdy);CHKERRCUDA(cerr);
#if LANDAU_DIM==3
    cerr = cudaFree(d_dfdz);CHKERRCUDA(cerr);
#endif
    cerr = cudaFree(d_invJj);CHKERRCUDA(cerr);
    cerr = cudaFree(d_nu_alpha);CHKERRCUDA(cerr);
    cerr = cudaFree(d_nu_beta);CHKERRCUDA(cerr);
    cerr = cudaFree(d_invMass);CHKERRCUDA(cerr);
    cerr = cudaFree(d_Eq_m);CHKERRCUDA(cerr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogEventEnd(events[5],0,0,0,0);
  // First-time assembly path (element matrices assembled on the CPU, even when GPU assembly is enabled)
  if (d_elemMats) {
    PetscScalar *elemMats=NULL,*elMat;
    PetscLogEventBegin(events[5],0,0,0,0);
    PetscMalloc1(totDim*totDim*numGCells,&elemMats);
    cerr = cudaMemcpy(elemMats, d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(d_elemMats);CHKERRCUDA(cerr);
    PetscLogEventEnd(events[5],0,0,0,0);
    PetscLogEventBegin(events[6],0,0,0,0);
    for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
      DMPlexMatSetClosure(plex, section, globalSection, JacP, ej, elMat, ADD_VALUES);
      if (ej==-1) {
        int d,f;
        PetscPrintf(PETSC_COMM_SELF,"GPU Element matrix\n");
        for (d = 0; d < totDim; ++d) {
          for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF," %12.5e", PetscRealPart(elMat[d*totDim + f]));
          PetscPrintf(PETSC_COMM_SELF,"\n");
        }
      }
    }
    PetscFree(elemMats);
    PetscLogEventEnd(events[6],0,0,0,0);
  }
  return(0);
}
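
/* Hypothetical call sites, for orientation only (the real callers live elsewhere in the Landau
   DMPlex code); the branches above use IPData/invJj only when mass_w is NULL and use mass_w/shift
   only otherwise:

     // Landau (collision) Jacobian: mass_w == NULL
     ierr = LandauCUDAJacobian(plex, Nq, nu_alpha, nu_beta, invMass, Eq_m, IPData, invJj,
                               NULL, 0.0, events, JacP);CHKERRQ(ierr);

     // Mass matrix with a shift: quadrature weights passed in mass_w
     ierr = LandauCUDAJacobian(plex, Nq, nu_alpha, nu_beta, invMass, Eq_m, IPData, invJj,
                               mass_w, shift, events, JacP);CHKERRQ(ierr);
*/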