Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuda: remove typedefs for floating point type. #4192

Merged
merged 5 commits into from
Mar 30, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions doc/sphinx/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,6 @@ Fluid dynamics and fluid structure interaction

- ``EK_DEBUG``

- ``EK_DOUBLE_PREC``


.. _Interaction features:

Expand Down
1 change: 0 additions & 1 deletion src/config/features.def
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ ELECTROKINETICS requires CUDA
EK_BOUNDARIES implies ELECTROKINETICS, LB_BOUNDARIES_GPU, EXTERNAL_FORCES, ELECTROSTATICS
EK_BOUNDARIES requires CUDA
EK_DEBUG requires ELECTROKINETICS
EK_DOUBLE_PREC requires ELECTROKINETICS

/* Interaction features */
TABULATED
Expand Down
3 changes: 0 additions & 3 deletions src/core/actor/DipolarBarnesHut.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@

#include <memory>

// This needs to be done in the .cu file too
typedef float dds_float;

class DipolarBarnesHut : public Actor {
public:
DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) {
Expand Down
28 changes: 12 additions & 16 deletions src/core/actor/DipolarBarnesHut_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,8 @@

#include <cuda.h>

typedef float dds_float;

#define IND (blockDim.x * blockIdx.x + threadIdx.x)

using namespace std;

// Method performance/accuracy parameters
__constant__ float epssqd[1], itolsqd[1];
// blkcntd is a factual blocks' count.
Expand All @@ -57,7 +53,7 @@ __device__ __constant__ volatile BHData bhpara[1];
// The "half-convolution" multi-thread reduction.
// The thread with a lower index will operate longer and
// final result (here: the sum) is flowing down towards zero thread.
__device__ void dds_sumReduction_BH(dds_float *input, dds_float *sum) {
__device__ void dds_sumReduction_BH(float *input, float *sum) {
auto tid = static_cast<int>(threadIdx.x);
for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
__syncthreads();
Expand Down Expand Up @@ -690,7 +686,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() {
/******************************************************************************/

__global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel(
dds_float pf, float *force, float *torque) {
float pf, float *force, float *torque) {
int i, j, k, l, n, depth, base, sbase, diff, t;
float tmp;
// dr is a distance between particles.
Expand Down Expand Up @@ -902,7 +898,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel(
/******************************************************************************/

__global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel(
dds_float pf, dds_float *energySum) {
float pf, float *energySum) {
// NOTE: the algorithm of this kernel is almost identical to
// forceCalculationKernel. See comments there.

Expand All @@ -912,8 +908,8 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel(
__shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE],
node[MAXDEPTH * THREADS5 / WARPSIZE];
__shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE];
dds_float sum = 0.0;
extern __shared__ dds_float res[];
float sum = 0.0;
extern __shared__ float res[];

float b, d1, dd5;

Expand Down Expand Up @@ -1120,7 +1116,7 @@ void sortBH(int blocks) {
}

// Force calculation.
int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) {
int forceBH(BHData *bh_data, float k, float *f, float *torque) {
int error_code = 0;
dim3 grid(1, 1, 1);
dim3 block(1, 1, 1);
Expand All @@ -1137,26 +1133,26 @@ int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) {
}

// Energy calculation.
int energyBH(BHData *bh_data, dds_float k, float *E) {
int energyBH(BHData *bh_data, float k, float *E) {
int error_code = 0;
dim3 grid(1, 1, 1);
dim3 block(1, 1, 1);

grid.x = bh_data->blocks * FACTOR5;
block.x = THREADS5;

dds_float *energySum;
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
float *energySum;
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x)));
// cleanup the memory for the energy sum
cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(dds_float) * grid.x)));
cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(float) * grid.x)));

KERNELCALL_shared(energyCalculationKernel, grid, block,
block.x * sizeof(dds_float), k, energySum);
block.x * sizeof(float), k, energySum);
cuda_safe_mem(cudaDeviceSynchronize());

// Sum the results of all blocks
// One energy part per block in the prev kernel
thrust::device_ptr<dds_float> t(energySum);
thrust::device_ptr<float> t(energySum);
float x = thrust::reduce(t, t + grid.x);
cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));

Expand Down
6 changes: 2 additions & 4 deletions src/core/actor/DipolarBarnesHut_cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@

#ifdef DIPOLAR_BARNES_HUT

typedef float dds_float;

typedef struct {
/// CUDA blocks
int blocks;
Expand Down Expand Up @@ -117,10 +115,10 @@ void summarizeBH(int blocks);
void sortBH(int blocks);

/// Barnes-Hut force calculation.
int forceBH(BHData *bh_data, dds_float k, float *f, float *torque);
int forceBH(BHData *bh_data, float k, float *f, float *torque);

/// Barnes-Hut energy calculation.
int energyBH(BHData *bh_data, dds_float k, float *E);
int energyBH(BHData *bh_data, float k, float *E);

#endif // DIPOLAR_BARNES_HUT
#endif /* DIPOLARBARNESHUT_CUH_ */
19 changes: 8 additions & 11 deletions src/core/actor/DipolarDirectSum.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,12 @@

#include <memory>

// This needs to be done in the .cu file too!!!!!
typedef float dds_float;

void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
float *dip, dds_float box_l[3],
void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos,
float *dip, float box_l[3],
int periodic[3], float *E);
void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos,
float *dip, float *f, float *torque,
dds_float box_l[3], int periodic[3]);
float box_l[3], int periodic[3]);

class DipolarDirectSum : public Actor {
public:
Expand All @@ -58,21 +55,21 @@ class DipolarDirectSum : public Actor {
runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!";
};
void computeForces(SystemInterface &s) override {
dds_float box[3];
float box[3];
int per[3];
for (int i = 0; i < 3; i++) {
box[i] = static_cast<dds_float>(s.box()[i]);
box[i] = static_cast<float>(s.box()[i]);
per[i] = (box_geo.periodic(i));
}
DipolarDirectSum_kernel_wrapper_force(
k, static_cast<int>(s.npart_gpu()), s.rGpuBegin(), s.dipGpuBegin(),
s.fGpuBegin(), s.torqueGpuBegin(), box, per);
};
void computeEnergy(SystemInterface &s) override {
dds_float box[3];
float box[3];
int per[3];
for (int i = 0; i < 3; i++) {
box[i] = static_cast<dds_float>(s.box()[i]);
box[i] = static_cast<float>(s.box()[i]);
per[i] = (box_geo.periodic(i));
}
DipolarDirectSum_kernel_wrapper_energy(
Expand Down
91 changes: 43 additions & 48 deletions src/core/actor/DipolarDirectSum_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,8 @@
#error CU-file includes mpi.h! This should not happen!
#endif

// This needs to be done in the .cu file too!!!!!
typedef float dds_float;

__device__ inline void get_mi_vector_dds(dds_float res[3], dds_float const a[3],
dds_float const b[3],
dds_float const box_l[3],
__device__ inline void get_mi_vector_dds(float res[3], float const a[3],
float const b[3], float const box_l[3],
int const periodic[3]) {
for (int i = 0; i < 3; i++) {
res[i] = a[i] - b[i];
Expand All @@ -48,19 +44,19 @@ __device__ inline void get_mi_vector_dds(dds_float res[3], dds_float const a[3],

#define scalar(a, b) ((a)[0] * (b)[0] + (a)[1] * (b)[1] + (a)[2] * (b)[2])

__device__ void dipole_ia_force(int id, dds_float pf, float const *r1,
__device__ void dipole_ia_force(int id, float pf, float const *r1,
float const *r2, float const *dip1,
float const *dip2, dds_float *f1,
dds_float *torque1, dds_float *torque2,
dds_float box_l[3], int periodic[3]) {
dds_float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv,
a, b, cc, d, ab;
float const *dip2, float *f1, float *torque1,
float *torque2, float box_l[3],
int periodic[3]) {
float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv, a, b,
cc, d, ab;
#ifdef ROTATION
dds_float bx, by, bz, ax, ay, az;
float bx, by, bz, ax, ay, az;
#endif

dds_float dr[3];
dds_float _r1[3], _r2[3], _dip1[3], _dip2[3];
float dr[3];
float _r1[3], _r2[3], _dip1[3], _dip2[3];

for (int i = 0; i < 3; i++) {
_r1[i] = r1[i];
Expand Down Expand Up @@ -121,13 +117,13 @@ __device__ void dipole_ia_force(int id, dds_float pf, float const *r1,
#endif
}

__device__ dds_float dipole_ia_energy(int id, dds_float pf, float const *r1,
float const *r2, float const *dip1,
float const *dip2, dds_float box_l[3],
int periodic[3]) {
dds_float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv;
dds_float dr[3];
dds_float _r1[3], _r2[3], _dip1[3], _dip2[3];
__device__ float dipole_ia_energy(int id, float pf, float const *r1,
float const *r2, float const *dip1,
float const *dip2, float box_l[3],
int periodic[3]) {
float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv;
float dr[3];
float _r1[3], _r2[3], _dip1[3], _dip2[3];

for (int i = 0; i < 3; i++) {
_r1[i] = r1[i];
Expand Down Expand Up @@ -156,9 +152,9 @@ __device__ dds_float dipole_ia_energy(int id, dds_float pf, float const *r1,
return pf * (pe1 * r3_inv - pe4 * pe2 * pe3);
}

__global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,
__global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos,
float *dip, float *f,
float *torque, dds_float box_l[3],
float *torque, float box_l[3],
int periodic[3]) {

auto i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
Expand All @@ -168,10 +164,10 @@ __global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,

// Kahan summation based on the wikipedia article
// Force
dds_float fi[3], fsum[3], tj[3];
float fi[3], fsum[3], tj[3];

// Torque
dds_float ti[3], tsum[3];
float ti[3], tsum[3];

// There is one thread per particle. Each thread computes interactions
// with particles whose id is smaller than the thread id.
Expand Down Expand Up @@ -207,7 +203,7 @@ __global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos,
}
}

__device__ void dds_sumReduction(dds_float *input, dds_float *sum) {
__device__ void dds_sumReduction(float *input, float *sum) {
auto tid = static_cast<int>(threadIdx.x);
for (auto i = static_cast<int>(blockDim.x); i > 1; i /= 2) {
__syncthreads();
Expand All @@ -222,14 +218,14 @@ __device__ void dds_sumReduction(dds_float *input, dds_float *sum) {
}
}

__global__ void DipolarDirectSum_kernel_energy(dds_float pf, int n, float *pos,
float *dip, dds_float box_l[3],
__global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos,
float *dip, float box_l[3],
int periodic[3],
dds_float *energySum) {
float *energySum) {

auto i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
dds_float sum = 0.0;
extern __shared__ dds_float res[];
float sum = 0.0;
extern __shared__ float res[];

// There is one thread per particle. Each thread computes interactions
// with particles whose id is larger than the thread id.
Expand All @@ -253,10 +249,9 @@ __global__ void DipolarDirectSum_kernel_energy(dds_float pf, int n, float *pos,
dds_sumReduction(res, &(energySum[blockIdx.x]));
}

void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos,
float *dip, float *f, float *torque,
dds_float box_l[3],
int periodic[3]) {
float box_l[3], int periodic[3]) {

const int bs = 64;
dim3 grid(1, 1, 1);
Expand All @@ -273,12 +268,12 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
block.x = bs;
}

dds_float *box_l_gpu;
float *box_l_gpu;
int *periodic_gpu;
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float)));
cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
cuda_safe_mem(cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(dds_float),
cudaMemcpyHostToDevice));
cuda_safe_mem(
cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
cudaMemcpyHostToDevice));

Expand All @@ -288,8 +283,8 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos,
cudaFree(periodic_gpu);
}

void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
float *dip, dds_float box_l[3],
void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos,
float *dip, float box_l[3],
int periodic[3], float *E) {

const int bs = 512;
Expand All @@ -307,27 +302,27 @@ void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos,
block.x = bs;
}

dds_float *box_l_gpu;
float *box_l_gpu;
int *periodic_gpu;
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float)));
cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float)));
cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int)));
cuda_safe_mem(
cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice));
cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int),
cudaMemcpyHostToDevice));

dds_float *energySum;
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x)));
float *energySum;
cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x)));

// This will sum the energies up to the block level
KERNELCALL_shared(DipolarDirectSum_kernel_energy, grid, block,
bs * sizeof(dds_float), k, n, pos, dip, box_l_gpu,
periodic_gpu, energySum);
bs * sizeof(float), k, n, pos, dip, box_l_gpu, periodic_gpu,
energySum);

// Sum the results of all blocks
// One thread per block in the prev kernel
// KERNELCALL(sumKernel,1,1,energySum,block.x,E);
thrust::device_ptr<dds_float> t(energySum);
thrust::device_ptr<float> t(energySum);
float x = thrust::reduce(t, t + grid.x);
cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice));

Expand Down
Loading