Skip to content

Commit

Permalink
cuda: remove typedefs for floating point type. (#4192)
Browse files Browse the repository at this point in the history
  • Loading branch information
kodiakhq[bot] authored Mar 30, 2021
2 parents 90540b1 + d0e50f2 commit bf8febb
Show file tree
Hide file tree
Showing 18 changed files with 358 additions and 420 deletions.
2 changes: 0 additions & 2 deletions doc/sphinx/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,6 @@ Fluid dynamics and fluid structure interaction
- ``EK_DEBUG``
- ``EK_DOUBLE_PREC``
.. _Interaction features:
Expand Down
1 change: 0 additions & 1 deletion src/config/features.def
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ ELECTROKINETICS requires CUDA
EK_BOUNDARIES implies ELECTROKINETICS, LB_BOUNDARIES_GPU, EXTERNAL_FORCES, ELECTROSTATICS
EK_BOUNDARIES requires CUDA
EK_DEBUG requires ELECTROKINETICS
EK_DOUBLE_PREC requires ELECTROKINETICS

/* Interaction features */
TABULATED
Expand Down
3 changes: 0 additions & 3 deletions src/core/actor/DipolarBarnesHut.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@

#include <memory>

// This needs to be done in the .cu file too
typedef float dds_float;

class DipolarBarnesHut : public Actor {
public:
DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) {
Expand Down
28 changes: 12 additions & 16 deletions src/core/actor/DipolarBarnesHut_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,8 @@

#include <cuda.h>

typedef float dds_float;

#define IND (blockDim.x * blockIdx.x + threadIdx.x)

using namespace std;

// Method performance/accuracy parameters
__constant__ float epssqd[1], itolsqd[1];
// blkcntd is a factual blocks' count.
Expand All @@ -57,7 +53,7 @@ __device__ __constant__ volatile BHData bhpara[1];
// The "half-convolution" multi-thread reduction.
// The thread with a lower index will operate longer and
// final result (here: the sum) is flowing down towards zero thread.
__device__ void dds_sumReduction_BH(dds_float *input, dds_float *sum) {
__device__ void dds_sumReduction_BH(float *input, float *sum) {
auto tid = static_cast<int>(threadIdx.x);
for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
__syncthreads();
Expand Down Expand Up @@ -690,7 +686,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() {
/******************************************************************************/

__global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel(
dds_float pf, float *force, float *torque) {
float pf, float *force, float *torque) {
int i, j, k, l, n, depth, base, sbase, diff, t;
float tmp;
// dr is a distance between particles.
Expand Down Expand Up @@ -902,7 +898,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel(
/******************************************************************************/

__global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel(
dds_float pf, dds_float *energySum) {
float pf, float *energySum) {
// NOTE: the algorithm of this kernel is almost identical to
// forceCalculationKernel. See comments there.

Expand All @@ -912,8 +908,8 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel(
__shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE],
node[MAXDEPTH * THREADS5 / WARPSIZE];
__shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE];
dds_float sum = 0.0;
extern __shared__ dds_float res[];
float sum = 0.0;
extern __shared__ float res[];

float b, d1, dd5;

Expand Down Expand Up @@ -1120,7 +1116,7 @@ void sortBH(int blocks) {
}

// Force calculation.
int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) {
int forceBH(BHData *bh_data, float k, float *f, float *torque) {
int error_code = 0;
dim3 grid(1, 1, 1);
dim3 block(1, 1, 1);
Expand All @@ -1137,26 +1133,26 @@ int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) {
}

// Energy calculation.
int energyBH(BHData *bh_data, dds_float k, float *E) {
int energyBH(BHData *bh_data, float k, float *E) {
int error_code = 0;
dim3 grid(1, 1, 1);
dim3 block(1, 1, 1);

grid.x = bh_data->blocks * FACTOR5;
block.x = THREADS5;

dds_float *energySum;
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
float *energySum;
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x)));
// cleanup the memory for the energy sum
cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(dds_float) * grid.x)));
cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(float) * grid.x)));

KERNELCALL_shared(energyCalculationKernel, grid, block,
block.x * sizeof(dds_float), k, energySum);
block.x * sizeof(float), k, energySum);
cuda_safe_mem(cudaDeviceSynchronize());

// Sum the results of all blocks
// One energy part per block in the prev kernel
thrust::device_ptr<dds_float> t(energySum);
thrust::device_ptr<float> t(energySum);
float x = thrust::reduce(t, t + grid.x);
cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));

Expand Down
6 changes: 2 additions & 4 deletions src/core/actor/DipolarBarnesHut_cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@

#ifdef DIPOLAR_BARNES_HUT

typedef float dds_float;

typedef struct {
/// CUDA blocks
int blocks;
Expand Down Expand Up @@ -117,10 +115,10 @@ void summarizeBH(int blocks);
void sortBH(int blocks);

/// Barnes-Hut force calculation.
int forceBH(BHData *bh_data, dds_float k, float *f, float *torque);
int forceBH(BHData *bh_data, float k, float *f, float *torque);

/// Barnes-Hut energy calculation.
int energyBH(BHData *bh_data, dds_float k, float *E);
int energyBH(BHData *bh_data, float k, float *E);

#endif // DIPOLAR_BARNES_HUT
#endif /* DIPOLARBARNESHUT_CUH_ */
19 changes: 8 additions & 11 deletions src/core/actor/DipolarDirectSum.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,12 @@

#include <memory>

// This needs to be done in the .cu file too!!!!!
typedef float dds_float;

void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
float *dip, dds_float box_l[3],
void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos,
float *dip, float box_l[3],
int periodic[3], float *E);
void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos,
float *dip, float *f, float *torque,
dds_float box_l[3], int periodic[3]);
float box_l[3], int periodic[3]);

class DipolarDirectSum : public Actor {
public:
Expand All @@ -58,21 +55,21 @@ class DipolarDirectSum : public Actor {
runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!";
};
void computeForces(SystemInterface &s) override {
dds_float box[3];
float box[3];
int per[3];
for (int i = 0; i < 3; i++) {
box[i] = static_cast<dds_float>(s.box()[i]);
box[i] = static_cast<float>(s.box()[i]);
per[i] = (box_geo.periodic(i));
}
DipolarDirectSum_kernel_wrapper_force(
k, static_cast<int>(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(),
s.fGpuBegin(), s.torqueGpuBegin(), box, per);
};
void computeEnergy(SystemInterface &s) override {
dds_float box[3];
float box[3];
int per[3];
for (int i = 0; i < 3; i++) {
box[i] = static_cast<dds_float>(s.box()[i]);
box[i] = static_cast<float>(s.box()[i]);
per[i] = (box_geo.periodic(i));
}
DipolarDirectSum_kernel_wrapper_energy(
Expand Down
91 changes: 43 additions & 48 deletions src/core/actor/DipolarDirectSum_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,8 @@
#error CU-file includes mpi.h! This should not happen!
#endif

// This needs to be done in the .cu file too!!!!!
typedef float dds_float;

__device__ inline void get_mi_vector_dds(dds_float res[3], dds_float const a[3],
dds_float const b[3],
dds_float const box_l[3],
__device__ inline void get_mi_vector_dds(float res[3], float const a[3],
float const b[3], float const box_l[3],
int const periodic[3]) {
for (int i = 0; i < 3; i++) {
res[i] = a[i] - b[i];
Expand All @@ -48,19 +44,19 @@ __device__ inline void get_mi_vector_dds(dds_float res[3], dds_float const a[3],

#define scalar(a, b) ((a)[0] * (b)[0] + (a)[1] * (b)[1] + (a)[2] * (b)[2])

__device__ void dipole_ia_force(int id, dds_float pf, float const *r1,
__device__ void dipole_ia_force(int id, float pf, float const *r1,
float const *r2, float const *dip1,
float const *dip2, dds_float *f1,
dds_float *torque1, dds_float *torque2,
dds_float box_l[3], int periodic[3]) {
dds_float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv,
a, b, cc, d, ab;
float const *dip2, float *f1, float *torque1,
float *torque2, float box_l[3],
int periodic[3]) {
float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv, a, b,
cc, d, ab;
#ifdef ROTATION
dds_float bx, by, bz, ax, ay, az;
float bx, by, bz, ax, ay, az;
#endif

dds_float dr[3];
dds_float _r1[3], _r2[3], _dip1[3], _dip2[3];
float dr[3];
float _r1[3], _r2[3], _dip1[3], _dip2[3];

for (int i = 0; i < 3; i++) {
_r1[i] = r1[i];
Expand Down Expand Up @@ -121,13 +117,13 @@ __device__ void dipole_ia_force(int id, dds_float pf, float const *r1,
#endif
}

__device__ dds_float dipole_ia_energy(int id, dds_float pf, float const *r1,
float const *r2, float const *dip1,
float const *dip2, dds_float box_l[3],
int periodic[3]) {
dds_float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv;
dds_float dr[3];
dds_float _r1[3], _r2[3], _dip1[3], _dip2[3];
__device__ float dipole_ia_energy(int id, float pf, float const *r1,
float const *r2, float const *dip1,
float const *dip2, float box_l[3],
int periodic[3]) {
float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv;
float dr[3];
float _r1[3], _r2[3], _dip1[3], _dip2[3];

for (int i = 0; i < 3; i++) {
_r1[i] = r1[i];
Expand Down Expand Up @@ -156,9 +152,9 @@ __device__ dds_float dipole_ia_energy(int id, dds_float pf, float const *r1,
return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
}

__global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,
__global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos,
float *dip, float *f,
float *torque, dds_float box_l[3],
float *torque, float box_l[3],
int periodic[3]) {

auto i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
Expand All @@ -168,10 +164,10 @@ __global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,

// Kahan summation based on the wikipedia article
// Force
dds_float fi[3], fsum[3], tj[3];
float fi[3], fsum[3], tj[3];

// Torque
dds_float ti[3], tsum[3];
float ti[3], tsum[3];

// There is one thread per particle. Each thread computes interactions
// with particles whose id is smaller than the thread id.
Expand Down Expand Up @@ -207,7 +203,7 @@ __global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,
}
}

__device__ void dds_sumReduction(dds_float *input, dds_float *sum) {
__device__ void dds_sumReduction(float *input, float *sum) {
auto tid = static_cast<int>(threadIdx.x);
for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
__syncthreads();
Expand All @@ -222,14 +218,14 @@ __device__ void dds_sumReduction(dds_float *input, dds_float *sum) {
}
}

__global__ void DipolarDirectSum_kernel_energy(dds_float pf, int n, float *pos,
float *dip, dds_float box_l[3],
__global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos,
float *dip, float box_l[3],
int periodic[3],
dds_float *energySum) {
float *energySum) {

auto i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
dds_float sum = 0.0;
extern __shared__ dds_float res[];
float sum = 0.0;
extern __shared__ float res[];

// There is one thread per particle. Each thread computes interactions
// with particles whose id is larger than the thread id.
Expand All @@ -253,10 +249,9 @@ __global__ void DipolarDirectSum_kernel_energy(dds_float pf, int n, float *pos,
dds_sumReduction(res, &(energySum[blockIdx.x]));
}

void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos,
float *dip, float *f, float *torque,
dds_float box_l[3],
int periodic[3]) {
float box_l[3], int periodic[3]) {

const int bs = 64;
dim3 grid(1, 1, 1);
Expand All @@ -273,12 +268,12 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
block.x = bs;
}

dds_float *box_l_gpu;
float *box_l_gpu;
int *periodic_gpu;
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float)));
cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
cuda_safe_mem(cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(dds_float),
cudaMemcpyHostToDevice));
cuda_safe_mem(
cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
cudaMemcpyHostToDevice));

Expand All @@ -288,8 +283,8 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
cudaFree(periodic_gpu);
}

void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
float *dip, dds_float box_l[3],
void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos,
float *dip, float box_l[3],
int periodic[3], float *E) {

const int bs = 512;
Expand All @@ -307,27 +302,27 @@ void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
block.x = bs;
}

dds_float *box_l_gpu;
float *box_l_gpu;
int *periodic_gpu;
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float)));
cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
cuda_safe_mem(
cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
cudaMemcpyHostToDevice));

dds_float *energySum;
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
float *energySum;
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x)));

// This will sum the energies up to the block level
KERNELCALL_shared(DipolarDirectSum_kernel_energy, grid, block,
bs * sizeof(dds_float), k, n, pos, dip, box_l_gpu,
periodic_gpu, energySum);
bs * sizeof(float), k, n, pos, dip, box_l_gpu, periodic_gpu,
energySum);

// Sum the results of all blocks
// One thread per block in the prev kernel
// KERNELCALL(sumKernel,1,1,energySum,block.x,E);
thrust::device_ptr<dds_float> t(energySum);
thrust::device_ptr<float> t(energySum);
float x = thrust::reduce(t, t + grid.x);
cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));

Expand Down
Loading

0 comments on commit bf8febb

Please sign in to comment.