cuda: remove typedefs for floating point type. (#4192)

espressomd · Mar 30, 2021 · bf8febb · bf8febb
2 parents 90540b1 + d0e50f2
commit bf8febb
Show file tree

Hide file tree

Showing 18 changed files with 358 additions and 420 deletions.
diff --git a/doc/sphinx/installation.rst b/doc/sphinx/installation.rst
@@ -383,8 +383,6 @@ Fluid dynamics and fluid structure interaction
 
 -  ``EK_DEBUG``
 
--  ``EK_DOUBLE_PREC``
-
 
 .. _Interaction features:
 

diff --git a/src/config/features.def b/src/config/features.def
@@ -62,7 +62,6 @@ ELECTROKINETICS                 requires CUDA
 EK_BOUNDARIES                   implies ELECTROKINETICS, LB_BOUNDARIES_GPU, EXTERNAL_FORCES, ELECTROSTATICS
 EK_BOUNDARIES                   requires CUDA
 EK_DEBUG                        requires ELECTROKINETICS
-EK_DOUBLE_PREC                  requires ELECTROKINETICS
 
 /* Interaction features */
 TABULATED

diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp
@@ -33,9 +33,6 @@
 
 #include <memory>
 
-// This needs to be done in the .cu file too
-typedef float dds_float;
-
 class DipolarBarnesHut : public Actor {
 public:
   DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) {

diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu
@@ -36,12 +36,8 @@
 
 #include <cuda.h>
 
-typedef float dds_float;
-
 #define IND (blockDim.x * blockIdx.x + threadIdx.x)
 
-using namespace std;
-
 // Method performance/accuracy parameters
 __constant__ float epssqd[1], itolsqd[1];
 // blkcntd is a factual blocks' count.
@@ -57,7 +53,7 @@ __device__ __constant__ volatile BHData bhpara[1];
 // The "half-convolution" multi-thread reduction.
 // The thread with a lower index will operate longer and
 // final result (here: the sum) is flowing down towards zero thread.
-__device__ void dds_sumReduction_BH(dds_float *input, dds_float *sum) {
+__device__ void dds_sumReduction_BH(float *input, float *sum) {
   auto tid = static_cast<int>(threadIdx.x);
   for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
     __syncthreads();
@@ -690,7 +686,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() {
 /******************************************************************************/
 
 __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel(
-    dds_float pf, float *force, float *torque) {
+    float pf, float *force, float *torque) {
   int i, j, k, l, n, depth, base, sbase, diff, t;
   float tmp;
   // dr is a distance between particles.
@@ -902,7 +898,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel(
 /******************************************************************************/
 
 __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel(
-    dds_float pf, dds_float *energySum) {
+    float pf, float *energySum) {
   // NOTE: the algorithm of this kernel is almost identical to
   // forceCalculationKernel. See comments there.
 
@@ -912,8 +908,8 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel(
   __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE],
       node[MAXDEPTH * THREADS5 / WARPSIZE];
   __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE];
-  dds_float sum = 0.0;
-  extern __shared__ dds_float res[];
+  float sum = 0.0;
+  extern __shared__ float res[];
 
   float b, d1, dd5;
 
@@ -1120,7 +1116,7 @@ void sortBH(int blocks) {
 }
 
 // Force calculation.
-int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) {
+int forceBH(BHData *bh_data, float k, float *f, float *torque) {
   int error_code = 0;
   dim3 grid(1, 1, 1);
   dim3 block(1, 1, 1);
@@ -1137,26 +1133,26 @@ int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) {
 }
 
 // Energy calculation.
-int energyBH(BHData *bh_data, dds_float k, float *E) {
+int energyBH(BHData *bh_data, float k, float *E) {
   int error_code = 0;
   dim3 grid(1, 1, 1);
   dim3 block(1, 1, 1);
 
   grid.x = bh_data->blocks * FACTOR5;
   block.x = THREADS5;
 
-  dds_float *energySum;
-  cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
+  float *energySum;
+  cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x)));
   // cleanup the memory for the energy sum
-  cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(dds_float) * grid.x)));
+  cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(float) * grid.x)));
 
   KERNELCALL_shared(energyCalculationKernel, grid, block,
-                    block.x * sizeof(dds_float), k, energySum);
+                    block.x * sizeof(float), k, energySum);
   cuda_safe_mem(cudaDeviceSynchronize());
 
   // Sum the results of all blocks
   // One energy part per block in the prev kernel
-  thrust::device_ptr<dds_float> t(energySum);
+  thrust::device_ptr<float> t(energySum);
   float x = thrust::reduce(t, t + grid.x);
   cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
 

diff --git a/src/core/actor/DipolarBarnesHut_cuda.cuh b/src/core/actor/DipolarBarnesHut_cuda.cuh
@@ -25,8 +25,6 @@
 
 #ifdef DIPOLAR_BARNES_HUT
 
-typedef float dds_float;
-
 typedef struct {
   /// CUDA blocks
   int blocks;
@@ -117,10 +115,10 @@ void summarizeBH(int blocks);
 void sortBH(int blocks);
 
 /// Barnes-Hut force calculation.
-int forceBH(BHData *bh_data, dds_float k, float *f, float *torque);
+int forceBH(BHData *bh_data, float k, float *f, float *torque);
 
 /// Barnes-Hut energy calculation.
-int energyBH(BHData *bh_data, dds_float k, float *E);
+int energyBH(BHData *bh_data, float k, float *E);
 
 #endif // DIPOLAR_BARNES_HUT
 #endif /* DIPOLARBARNESHUT_CUH_ */
diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp
@@ -32,15 +32,12 @@
 
 #include <memory>
 
-// This needs to be done in the .cu file too!!!!!
-typedef float dds_float;
-
-void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
-                                            float *dip, dds_float box_l[3],
+void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos,
+                                            float *dip, float box_l[3],
                                             int periodic[3], float *E);
-void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
+void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos,
                                            float *dip, float *f, float *torque,
-                                           dds_float box_l[3], int periodic[3]);
+                                           float box_l[3], int periodic[3]);
 
 class DipolarDirectSum : public Actor {
 public:
@@ -58,21 +55,21 @@ class DipolarDirectSum : public Actor {
       runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!";
   };
   void computeForces(SystemInterface &s) override {
-    dds_float box[3];
+    float box[3];
     int per[3];
     for (int i = 0; i < 3; i++) {
-      box[i] = static_cast<dds_float>(s.box()[i]);
+      box[i] = static_cast<float>(s.box()[i]);
       per[i] = (box_geo.periodic(i));
     }
     DipolarDirectSum_kernel_wrapper_force(
         k, static_cast<int>(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(),
         s.fGpuBegin(), s.torqueGpuBegin(), box, per);
   };
   void computeEnergy(SystemInterface &s) override {
-    dds_float box[3];
+    float box[3];
     int per[3];
     for (int i = 0; i < 3; i++) {
-      box[i] = static_cast<dds_float>(s.box()[i]);
+      box[i] = static_cast<float>(s.box()[i]);
       per[i] = (box_geo.periodic(i));
     }
     DipolarDirectSum_kernel_wrapper_energy(

diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu
@@ -32,12 +32,8 @@
 #error CU-file includes mpi.h! This should not happen!
 #endif
 
-// This needs to be done in the .cu file too!!!!!
-typedef float dds_float;
-
-__device__ inline void get_mi_vector_dds(dds_float res[3], dds_float const a[3],
-                                         dds_float const b[3],
-                                         dds_float const box_l[3],
+__device__ inline void get_mi_vector_dds(float res[3], float const a[3],
+                                         float const b[3], float const box_l[3],
                                          int const periodic[3]) {
   for (int i = 0; i < 3; i++) {
     res[i] = a[i] - b[i];
@@ -48,19 +44,19 @@ __device__ inline void get_mi_vector_dds(dds_float res[3], dds_float const a[3],
 
 #define scalar(a, b) ((a)[0] * (b)[0] + (a)[1] * (b)[1] + (a)[2] * (b)[2])
 
-__device__ void dipole_ia_force(int id, dds_float pf, float const *r1,
+__device__ void dipole_ia_force(int id, float pf, float const *r1,
                                 float const *r2, float const *dip1,
-                                float const *dip2, dds_float *f1,
-                                dds_float *torque1, dds_float *torque2,
-                                dds_float box_l[3], int periodic[3]) {
-  dds_float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv,
-      a, b, cc, d, ab;
+                                float const *dip2, float *f1, float *torque1,
+                                float *torque2, float box_l[3],
+                                int periodic[3]) {
+  float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv, a, b,
+      cc, d, ab;
 #ifdef ROTATION
-  dds_float bx, by, bz, ax, ay, az;
+  float bx, by, bz, ax, ay, az;
 #endif
 
-  dds_float dr[3];
-  dds_float _r1[3], _r2[3], _dip1[3], _dip2[3];
+  float dr[3];
+  float _r1[3], _r2[3], _dip1[3], _dip2[3];
 
   for (int i = 0; i < 3; i++) {
     _r1[i] = r1[i];
@@ -121,13 +117,13 @@ __device__ void dipole_ia_force(int id, dds_float pf, float const *r1,
 #endif
 }
 
-__device__ dds_float dipole_ia_energy(int id, dds_float pf, float const *r1,
-                                      float const *r2, float const *dip1,
-                                      float const *dip2, dds_float box_l[3],
-                                      int periodic[3]) {
-  dds_float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv;
-  dds_float dr[3];
-  dds_float _r1[3], _r2[3], _dip1[3], _dip2[3];
+__device__ float dipole_ia_energy(int id, float pf, float const *r1,
+                                  float const *r2, float const *dip1,
+                                  float const *dip2, float box_l[3],
+                                  int periodic[3]) {
+  float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv;
+  float dr[3];
+  float _r1[3], _r2[3], _dip1[3], _dip2[3];
 
   for (int i = 0; i < 3; i++) {
     _r1[i] = r1[i];
@@ -156,9 +152,9 @@ __device__ dds_float dipole_ia_energy(int id, dds_float pf, float const *r1,
   return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
 }
 
-__global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,
+__global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos,
                                               float *dip, float *f,
-                                              float *torque, dds_float box_l[3],
+                                              float *torque, float box_l[3],
                                               int periodic[3]) {
 
   auto i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
@@ -168,10 +164,10 @@ __global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,
 
   // Kahan summation based on the wikipedia article
   // Force
-  dds_float fi[3], fsum[3], tj[3];
+  float fi[3], fsum[3], tj[3];
 
   // Torque
-  dds_float ti[3], tsum[3];
+  float ti[3], tsum[3];
 
   // There is one thread per particle. Each thread computes interactions
   // with particles whose id is smaller than the thread id.
@@ -207,7 +203,7 @@ __global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,
   }
 }
 
-__device__ void dds_sumReduction(dds_float *input, dds_float *sum) {
+__device__ void dds_sumReduction(float *input, float *sum) {
   auto tid = static_cast<int>(threadIdx.x);
   for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
     __syncthreads();
@@ -222,14 +218,14 @@ __device__ void dds_sumReduction(dds_float *input, dds_float *sum) {
   }
 }
 
-__global__ void DipolarDirectSum_kernel_energy(dds_float pf, int n, float *pos,
-                                               float *dip, dds_float box_l[3],
+__global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos,
+                                               float *dip, float box_l[3],
                                                int periodic[3],
-                                               dds_float *energySum) {
+                                               float *energySum) {
 
   auto i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
-  dds_float sum = 0.0;
-  extern __shared__ dds_float res[];
+  float sum = 0.0;
+  extern __shared__ float res[];
 
   // There is one thread per particle. Each thread computes interactions
   // with particles whose id is larger than the thread id.
@@ -253,10 +249,9 @@ __global__ void DipolarDirectSum_kernel_energy(dds_float pf, int n, float *pos,
   dds_sumReduction(res, &(energySum[blockIdx.x]));
 }
 
-void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
+void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos,
                                            float *dip, float *f, float *torque,
-                                           dds_float box_l[3],
-                                           int periodic[3]) {
+                                           float box_l[3], int periodic[3]) {
 
   const int bs = 64;
   dim3 grid(1, 1, 1);
@@ -273,12 +268,12 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
     block.x = bs;
   }
 
-  dds_float *box_l_gpu;
+  float *box_l_gpu;
   int *periodic_gpu;
-  cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
+  cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float)));
   cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
-  cuda_safe_mem(cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(dds_float),
-                           cudaMemcpyHostToDevice));
+  cuda_safe_mem(
+      cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice));
   cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
                            cudaMemcpyHostToDevice));
 
@@ -288,8 +283,8 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
   cudaFree(periodic_gpu);
 }
 
-void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
-                                            float *dip, dds_float box_l[3],
+void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos,
+                                            float *dip, float box_l[3],
                                             int periodic[3], float *E) {
 
   const int bs = 512;
@@ -307,27 +302,27 @@ void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
     block.x = bs;
   }
 
-  dds_float *box_l_gpu;
+  float *box_l_gpu;
   int *periodic_gpu;
-  cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
+  cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float)));
   cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
   cuda_safe_mem(
       cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice));
   cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
                            cudaMemcpyHostToDevice));
 
-  dds_float *energySum;
-  cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
+  float *energySum;
+  cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x)));
 
   // This will sum the energies up to the block level
   KERNELCALL_shared(DipolarDirectSum_kernel_energy, grid, block,
-                    bs * sizeof(dds_float), k, n, pos, dip, box_l_gpu,
-                    periodic_gpu, energySum);
+                    bs * sizeof(float), k, n, pos, dip, box_l_gpu, periodic_gpu,
+                    energySum);
 
   // Sum the results of all blocks
   // One thread per block in the prev kernel
   // KERNELCALL(sumKernel,1,1,energySum,block.x,E);
-  thrust::device_ptr<dds_float> t(energySum);
+  thrust::device_ptr<float> t(energySum);
   float x = thrust::reduce(t, t + grid.x);
   cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));
Original file line number	Diff line number	Diff line change
Expand Up		@@ -383,8 +383,6 @@ Fluid dynamics and fluid structure interaction

		- ``EK_DEBUG``

		- ``EK_DOUBLE_PREC``


		.. _Interaction features:

Expand Down