From 6c480ec9061d385f360f6691cde5c4d8dada7f8d Mon Sep 17 00:00:00 2001 From: Kai Szuttor Date: Tue, 30 Mar 2021 15:13:02 +0200 Subject: [PATCH 1/4] cuda: remove EK_DOUBLE_PREC. --- doc/sphinx/installation.rst | 2 - src/config/features.def | 1 - .../grid_based_algorithms/electrokinetics.hpp | 18 +- .../electrokinetics_cuda.cu | 252 +++++++++--------- .../grid_based_algorithms/lb_boundaries.cpp | 2 +- src/core/grid_based_algorithms/lbgpu.hpp | 4 - src/python/espressomd/electrokinetics.pxd | 17 +- 7 files changed, 138 insertions(+), 158 deletions(-) diff --git a/doc/sphinx/installation.rst b/doc/sphinx/installation.rst index 799894bc8a0..3856908917e 100644 --- a/doc/sphinx/installation.rst +++ b/doc/sphinx/installation.rst @@ -383,8 +383,6 @@ Fluid dynamics and fluid structure interaction - ``EK_DEBUG`` -- ``EK_DOUBLE_PREC`` - .. _Interaction features: diff --git a/src/config/features.def b/src/config/features.def index 73c6cd3a7d7..7a2aa60bd6f 100644 --- a/src/config/features.def +++ b/src/config/features.def @@ -62,7 +62,6 @@ ELECTROKINETICS requires CUDA EK_BOUNDARIES implies ELECTROKINETICS, LB_BOUNDARIES_GPU, EXTERNAL_FORCES, ELECTROSTATICS EK_BOUNDARIES requires CUDA EK_DEBUG requires ELECTROKINETICS -EK_DOUBLE_PREC requires ELECTROKINETICS /* Interaction features */ TABULATED diff --git a/src/core/grid_based_algorithms/electrokinetics.hpp b/src/core/grid_based_algorithms/electrokinetics.hpp index 00110e1e9af..b0d2413ee06 100644 --- a/src/core/grid_based_algorithms/electrokinetics.hpp +++ b/src/core/grid_based_algorithms/electrokinetics.hpp @@ -28,12 +28,6 @@ // elegant than ifdeffing multiple versions of the kernel integrate. 
#ifdef CUDA -#if defined(EK_DOUBLE_PREC) -typedef double ekfloat; -#else -typedef float ekfloat; -#endif - #define MAX_NUMBER_OF_SPECIES 10 /* Data structure holding parameters and memory pointers for the link flux @@ -76,12 +70,12 @@ typedef struct { float *charge_potential_buffer; float *electric_field; float *charge_potential; - ekfloat *j; + float *j; float *lb_force_density_previous; #ifdef EK_DEBUG - ekfloat *j_fluc; + float *j_fluc; #endif - ekfloat *rho[MAX_NUMBER_OF_SPECIES]; + float *rho[MAX_NUMBER_OF_SPECIES]; int species_index[MAX_NUMBER_OF_SPECIES]; float density[MAX_NUMBER_OF_SPECIES]; float D[MAX_NUMBER_OF_SPECIES]; @@ -181,15 +175,15 @@ int ek_node_print_density(int species, int x, int y, int z, double *density); int ek_node_print_flux(int species, int x, int y, int z, double *flux); int ek_node_print_potential(int x, int y, int z, double *potential); int ek_node_set_density(int species, int x, int y, int z, double density); -ekfloat ek_calculate_net_charge(); +float ek_calculate_net_charge(); int ek_neutralize_system(int species); int ek_save_checkpoint(char *filename, char *lb_filename); int ek_load_checkpoint(char *filename); #ifdef EK_BOUNDARIES -void ek_gather_wallcharge_species_density(ekfloat *wallcharge_species_density, +void ek_gather_wallcharge_species_density(float *wallcharge_species_density, int wallcharge_species); -void ek_init_species_density_wallcharge(ekfloat *wallcharge_species_density, +void ek_init_species_density_wallcharge(float *wallcharge_species_density, int wallcharge_species); #endif diff --git a/src/core/grid_based_algorithms/electrokinetics_cuda.cu b/src/core/grid_based_algorithms/electrokinetics_cuda.cu index 119d828d0f4..49169fadb19 100644 --- a/src/core/grid_based_algorithms/electrokinetics_cuda.cu +++ b/src/core/grid_based_algorithms/electrokinetics_cuda.cu @@ -174,7 +174,7 @@ EK_parameters ek_parameters = { }; __device__ __constant__ EK_parameters ek_parameters_gpu[1]; -ekfloat *charge_gpu; +float *charge_gpu; 
LB_parameters_gpu *ek_lbparameters_gpu; CUDA_particle_data *particle_data_gpu; float *ek_lb_boundary_force; @@ -309,7 +309,7 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( unsigned int const *neighborindex, unsigned int const *neighborindex_padded, unsigned int species_index, LB_node_force_density_gpu node_f, LB_nodes_gpu lb_node) { - ekfloat flux, force; + float flux, force; float agrid_inv = 1.0f / ek_parameters_gpu->agrid; float sqrt2agrid_inv = 1.0f / (sqrtf(2.0f) * ek_parameters_gpu->agrid); @@ -339,8 +339,8 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_U00]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_U00]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_U00)], flux * ek_parameters_gpu->time_step); @@ -388,8 +388,8 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_0U0]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_0U0]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_0U0)], flux * ek_parameters_gpu->time_step); @@ -442,8 +442,8 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_00U]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_00U]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_00U)], flux * ek_parameters_gpu->time_step); @@ -498,8 +498,8 @@ __device__ void 
ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_UU0]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_UU0]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_UU0)], flux * ek_parameters_gpu->time_step); @@ -537,8 +537,8 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_UD0]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_UD0]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_UD0)], flux * ek_parameters_gpu->time_step); @@ -577,8 +577,8 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_U0U]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_U0U]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_U0U)], flux * ek_parameters_gpu->time_step); @@ -616,8 +616,8 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_U0D]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_U0D]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_U0D)], flux * ek_parameters_gpu->time_step); @@ -656,8 +656,8 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= 
static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_0UU]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_0UU]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_0UU)], flux * ek_parameters_gpu->time_step); @@ -695,8 +695,8 @@ __device__ void ek_diffusion_migration_lbforce_linkcentered_stencil( flux *= ek_parameters_gpu->d[species_index] * agrid_inv; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_0UD]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_0UD]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_0UD)], flux * ek_parameters_gpu->time_step); @@ -719,7 +719,7 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( unsigned int const *neighborindex, unsigned int const *neighborindex_padded, unsigned int species_index, LB_node_force_density_gpu node_f, LB_nodes_gpu lb_node) { - ekfloat flux, force; + float flux, force; // face in x flux = (ek_parameters_gpu->rho[species_index][index] - @@ -736,16 +736,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - (static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_U00]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_U00]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_U00]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_U00)], flux * ek_parameters_gpu->time_step); @@ -774,16 +774,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - 
(static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_0U0]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_0U0]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_0U0]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_0U0)], flux * ek_parameters_gpu->time_step); @@ -812,16 +812,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - (static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_00U]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_00U]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_00U]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_00U)], flux * ek_parameters_gpu->time_step); @@ -852,16 +852,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - (static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_UU0]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_UU0]])); + flux *= static_cast(!(lb_node.boundary[index] || + 
lb_node.boundary[neighborindex[EK_LINK_UU0]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_UU0)], flux * ek_parameters_gpu->time_step); @@ -893,16 +893,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - (static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_UD0]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_UD0]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_UD0]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_UD0)], flux * ek_parameters_gpu->time_step); @@ -936,16 +936,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - (static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_U0U]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_U0U]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_U0U]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_U0U)], flux * ek_parameters_gpu->time_step); @@ -977,16 +977,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - (static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * 
ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_U0D]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_U0D]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_U0D]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_U0D)], flux * ek_parameters_gpu->time_step); @@ -1020,16 +1020,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - (static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_0UU]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_0UU]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_0UU]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_0UU)], flux * ek_parameters_gpu->time_step); @@ -1061,16 +1061,16 @@ __device__ void ek_diffusion_migration_lbforce_nodecentered_stencil( flux += force * - (static_cast(force >= 0.0f) * + (static_cast(force >= 0.0f) * ek_parameters_gpu->rho[species_index][index] + - static_cast(force < 0.0f) * + static_cast(force < 0.0f) * ek_parameters_gpu->rho[species_index][neighborindex[EK_LINK_0UD]]) / ek_parameters_gpu->T; flux *= ek_parameters_gpu->d[species_index] / ek_parameters_gpu->agrid; - flux *= static_cast(!(lb_node.boundary[index] || - lb_node.boundary[neighborindex[EK_LINK_0UD]])); + flux *= static_cast(!(lb_node.boundary[index] || + lb_node.boundary[neighborindex[EK_LINK_0UD]])); atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(index, EK_LINK_0UD)], flux * ek_parameters_gpu->time_step); @@ 
-1127,10 +1127,10 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex, (lb_node.boundary[index] || lb_node.boundary[target_node_index]) == 0; atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(node, EK_LINK_U00)], - (2 * static_cast(di[0]) - 1) * + (2 * static_cast(di[0]) - 1) * ek_parameters_gpu->rho[species_index][index] * dx[0] * (1.0f - dx[1]) * (1.0f - dx[2]) * - static_cast(not_boundary)); + static_cast(not_boundary)); // face in y node = rhoindex_cartesian2linear( @@ -1149,9 +1149,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex, (lb_node.boundary[index] || lb_node.boundary[target_node_index]) == 0; atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(node, EK_LINK_0U0)], - (2 * static_cast(di[1]) - 1) * + (2 * static_cast(di[1]) - 1) * ek_parameters_gpu->rho[species_index][index] * (1.0f - dx[0]) * - dx[1] * (1.0f - dx[2]) * static_cast(not_boundary)); + dx[1] * (1.0f - dx[2]) * static_cast(not_boundary)); // face in z node = rhoindex_cartesian2linear( @@ -1169,9 +1169,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex, (lb_node.boundary[index] || lb_node.boundary[target_node_index]) == 0; atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear(node, EK_LINK_00U)], - (2 * static_cast(di[2]) - 1) * + (2 * static_cast(di[2]) - 1) * ek_parameters_gpu->rho[species_index][index] * (1.0f - dx[0]) * - (1.0f - dx[1]) * dx[2] * static_cast(not_boundary)); + (1.0f - dx[1]) * dx[2] * static_cast(not_boundary)); // edge in x node = rhoindex_cartesian2linear( @@ -1194,9 +1194,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex, atomicAdd( &ek_parameters_gpu ->j[jindex_getByRhoLinear(node, EK_LINK_0UU + (di[1] + di[2] == 1))], - (2 * static_cast(di[1]) - 1) * + (2 * static_cast(di[1]) - 1) * ek_parameters_gpu->rho[species_index][index] * (1.0f - dx[0]) * - dx[1] * dx[2] * static_cast(not_boundary)); + dx[1] * dx[2] * static_cast(not_boundary)); // edge in y node = 
rhoindex_cartesian2linear( @@ -1219,9 +1219,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex, atomicAdd( &ek_parameters_gpu ->j[jindex_getByRhoLinear(node, EK_LINK_U0U + (di[0] + di[2] == 1))], - (2 * static_cast(di[0]) - 1) * + (2 * static_cast(di[0]) - 1) * ek_parameters_gpu->rho[species_index][index] * dx[0] * - (1.0f - dx[1]) * dx[2] * static_cast(not_boundary)); + (1.0f - dx[1]) * dx[2] * static_cast(not_boundary)); // edge in z node = rhoindex_cartesian2linear( @@ -1244,9 +1244,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex, atomicAdd( &ek_parameters_gpu ->j[jindex_getByRhoLinear(node, EK_LINK_UU0 + (di[0] + di[1] == 1))], - (2 * static_cast(di[0]) - 1) * + (2 * static_cast(di[0]) - 1) * ek_parameters_gpu->rho[species_index][index] * dx[0] * dx[1] * - (1.0f - dx[2]) * static_cast(not_boundary)); + (1.0f - dx[2]) * static_cast(not_boundary)); // corner node = rhoindex_cartesian2linear( @@ -1271,9 +1271,9 @@ ek_add_advection_to_flux(unsigned int index, unsigned int *neighborindex, atomicAdd(&ek_parameters_gpu->j[jindex_getByRhoLinear( node, (1 - di[0]) * (EK_LINK_UUU + 2 * di[1] + di[2]) + di[0] * (EK_LINK_UDD - 2 * di[1] - di[2]))], - (2 * static_cast(di[0]) - 1) * + (2 * static_cast(di[0]) - 1) * ek_parameters_gpu->rho[species_index][index] * dx[0] * dx[1] * - dx[2] * static_cast(not_boundary)); + dx[2] * static_cast(not_boundary)); } __device__ float4 ek_random_wrapper_philox(unsigned int index, @@ -2041,7 +2041,7 @@ __global__ void ek_clear_boundary_densities(LB_nodes_gpu lbnode) { } } -__global__ void ek_calculate_system_charge(ekfloat *charge_gpu) { +__global__ void ek_calculate_system_charge(float *charge_gpu) { unsigned int index = ek_getThreadIndex(); @@ -2153,16 +2153,15 @@ void ek_integrate() { } #ifdef EK_BOUNDARIES -void ek_gather_wallcharge_species_density(ekfloat *wallcharge_species_density, +void ek_gather_wallcharge_species_density(float *wallcharge_species_density, int 
wallcharge_species) { if (wallcharge_species != -1) { - cuda_safe_mem(cudaMemcpy(wallcharge_species_density, - ek_parameters.rho[wallcharge_species], - ek_parameters.number_of_nodes * sizeof(ekfloat), - cudaMemcpyDeviceToHost)); + cuda_safe_mem(cudaMemcpy( + wallcharge_species_density, ek_parameters.rho[wallcharge_species], + ek_parameters.number_of_nodes * sizeof(float), cudaMemcpyDeviceToHost)); } } -void ek_init_species_density_wallcharge(ekfloat *wallcharge_species_density, +void ek_init_species_density_wallcharge(float *wallcharge_species_density, int wallcharge_species) { dim3 dim_grid = calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block); @@ -2171,10 +2170,9 @@ void ek_init_species_density_wallcharge(ekfloat *wallcharge_species_density, *current_nodes); if (wallcharge_species != -1) { - cuda_safe_mem(cudaMemcpy(ek_parameters.rho[wallcharge_species], - wallcharge_species_density, - ek_parameters.number_of_nodes * sizeof(ekfloat), - cudaMemcpyHostToDevice)); + cuda_safe_mem(cudaMemcpy( + ek_parameters.rho[wallcharge_species], wallcharge_species_density, + ek_parameters.number_of_nodes * sizeof(float), cudaMemcpyHostToDevice)); } } #endif @@ -2191,7 +2189,7 @@ void ek_init_species(int species) { cuda_safe_mem(cudaMalloc( (void **)&ek_parameters.rho[ek_parameters.species_index[species]], - ek_parameters.number_of_nodes * sizeof(ekfloat))); + ek_parameters.number_of_nodes * sizeof(float))); ek_parameters.density[ek_parameters.species_index[species]] = 0.0; ek_parameters.D[ek_parameters.species_index[species]] = 0.0; @@ -2291,11 +2289,11 @@ int ek_init() { cuda_safe_mem( cudaMalloc((void **)&ek_parameters.j, - ek_parameters.number_of_nodes * 13 * sizeof(ekfloat))); + ek_parameters.number_of_nodes * 13 * sizeof(float))); #ifdef EK_DEBUG cuda_safe_mem( cudaMalloc((void **)&ek_parameters.j_fluc, - ek_parameters.number_of_nodes * 13 * sizeof(ekfloat))); + ek_parameters.number_of_nodes * 13 * sizeof(float))); #endif 
cuda_safe_mem(cudaMemcpyToSymbol(ek_parameters_gpu, &ek_parameters, @@ -2317,7 +2315,7 @@ int ek_init() { ek_parameters.number_of_nodes * 3 * sizeof(float))); } - cuda_safe_mem(cudaMalloc((void **)&charge_gpu, sizeof(ekfloat))); + cuda_safe_mem(cudaMalloc((void **)&charge_gpu, sizeof(float))); lb_get_device_values_pointer(&ek_lb_device_values); @@ -2552,11 +2550,11 @@ int ek_print_vtk_density(int species, char *filename) { return 1; } - std::vector densities(ek_parameters.number_of_nodes); + std::vector densities(ek_parameters.number_of_nodes); cuda_safe_mem(cudaMemcpy( densities.data(), ek_parameters.rho[ek_parameters.species_index[species]], - densities.size() * sizeof(ekfloat), cudaMemcpyDeviceToHost)); + densities.size() * sizeof(float), cudaMemcpyDeviceToHost)); fprintf(fp, "\ # vtk DataFile Version 2.0\n\ @@ -2592,11 +2590,11 @@ int ek_node_print_density(int species, int x, int y, int z, double *density) { return 1; } - std::vector densities(ek_parameters.number_of_nodes); + std::vector densities(ek_parameters.number_of_nodes); cuda_safe_mem(cudaMemcpy( densities.data(), ek_parameters.rho[ek_parameters.species_index[species]], - densities.size() * sizeof(ekfloat), cudaMemcpyDeviceToHost)); + densities.size() * sizeof(float), cudaMemcpyDeviceToHost)); auto const index = static_cast(z) * ek_parameters.dim_y * ek_parameters.dim_x + @@ -2612,15 +2610,15 @@ int ek_node_print_flux(int species, int x, int y, int z, double *flux) { return 1; } - ekfloat flux_local_cartesian[3]; // temporary variable for converting fluxes - // into Cartesian coordinates for output + float flux_local_cartesian[3]; // temporary variable for converting fluxes + // into Cartesian coordinates for output unsigned int coord[3]; coord[0] = static_cast(x); coord[1] = static_cast(y); coord[2] = static_cast(z); - std::vector fluxes(ek_parameters.number_of_nodes * 13); + std::vector fluxes(ek_parameters.number_of_nodes * 13); dim3 dim_grid = calculate_dim_grid(ek_parameters.number_of_nodes, 4, 
threads_per_block); @@ -2639,7 +2637,7 @@ int ek_node_print_flux(int species, int x, int y, int z, double *flux) { #endif cuda_safe_mem(cudaMemcpy(fluxes.data(), ek_parameters.j, - fluxes.size() * sizeof(ekfloat), + fluxes.size() * sizeof(float), cudaMemcpyDeviceToHost)); auto const i = rhoindex_cartesian2linear_host(coord[0], coord[1], coord[2]); @@ -2816,12 +2814,12 @@ int ek_node_set_density(int species, int x, int y, int z, double density) { auto const index = static_cast(z) * ek_parameters.dim_y * ek_parameters.dim_x + static_cast(y) * ek_parameters.dim_x + static_cast(x); - ekfloat num_particles = - static_cast(density) * Utils::int_pow<3>(ek_parameters.agrid); + float num_particles = + static_cast(density) * Utils::int_pow<3>(ek_parameters.agrid); cuda_safe_mem(cudaMemcpy( &ek_parameters.rho[ek_parameters.species_index[species]][index], - &num_particles, sizeof(ekfloat), cudaMemcpyHostToDevice)); + &num_particles, sizeof(float), cudaMemcpyHostToDevice)); return 0; } @@ -2838,12 +2836,12 @@ int ek_print_vtk_flux(int species, char *filename) { return 1; } - ekfloat flux_local_cartesian[3]; // temporary variable for converting fluxes - // into Cartesian coordinates for output + float flux_local_cartesian[3]; // temporary variable for converting fluxes + // into Cartesian coordinates for output unsigned int coord[3]; - std::vector fluxes(ek_parameters.number_of_nodes * 13); + std::vector fluxes(ek_parameters.number_of_nodes * 13); dim3 dim_grid = calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block); @@ -2862,7 +2860,7 @@ int ek_print_vtk_flux(int species, char *filename) { #endif cuda_safe_mem(cudaMemcpy(fluxes.data(), ek_parameters.j, - fluxes.size() * sizeof(ekfloat), + fluxes.size() * sizeof(float), cudaMemcpyDeviceToHost)); fprintf(fp, "\ @@ -3063,8 +3061,8 @@ int ek_print_vtk_flux_fluc(int species, char *filename) { } FILE *fp = fopen(filename, "w"); - ekfloat flux_local_cartesian[3]; // temporary variable for converting fluxes - // into 
cartesian coordinates for output + float flux_local_cartesian[3]; // temporary variable for converting fluxes + // into cartesian coordinates for output unsigned int coord[3]; @@ -3072,7 +3070,7 @@ int ek_print_vtk_flux_fluc(int species, char *filename) { return 1; } - std::vector fluxes(ek_parameters.number_of_nodes * 13); + std::vector fluxes(ek_parameters.number_of_nodes * 13); dim3 dim_grid = calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block); @@ -3090,7 +3088,7 @@ int ek_print_vtk_flux_fluc(int species, char *filename) { #endif cuda_safe_mem(cudaMemcpy(fluxes.data(), ek_parameters.j_fluc, - fluxes.size() * sizeof(ekfloat), + fluxes.size() * sizeof(float), cudaMemcpyDeviceToHost)); fprintf(fp, "\ @@ -3302,7 +3300,7 @@ int ek_print_vtk_flux_link(int species, char *filename) { unsigned int coord[3]; - std::vector fluxes(ek_parameters.number_of_nodes * 13); + std::vector fluxes(ek_parameters.number_of_nodes * 13); dim3 dim_grid = calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block); @@ -3321,7 +3319,7 @@ int ek_print_vtk_flux_link(int species, char *filename) { #endif cuda_safe_mem(cudaMemcpy(fluxes.data(), ek_parameters.j, - fluxes.size() * sizeof(ekfloat), + fluxes.size() * sizeof(float), cudaMemcpyDeviceToHost)); fprintf(fp, "\ @@ -3808,22 +3806,22 @@ int ek_set_ext_force_density(int species, float ext_force_density_x, } struct ek_charge_of_particle { - __host__ __device__ ekfloat operator()(CUDA_particle_data particle) { + __host__ __device__ float operator()(CUDA_particle_data particle) { return particle.q; }; }; -ekfloat ek_get_particle_charge() { +float ek_get_particle_charge() { auto device_particles = gpu_get_particle_pointer(); - ekfloat particle_charge = thrust::transform_reduce( + float particle_charge = thrust::transform_reduce( thrust::device_ptr(device_particles.begin()), thrust::device_ptr(device_particles.end()), - ek_charge_of_particle(), 0.0f, thrust::plus()); + ek_charge_of_particle(), 0.0f, 
thrust::plus()); return particle_charge; } -ekfloat ek_calculate_net_charge() { - cuda_safe_mem(cudaMemset(charge_gpu, 0, sizeof(ekfloat))); +float ek_calculate_net_charge() { + cuda_safe_mem(cudaMemset(charge_gpu, 0, sizeof(float))); dim3 dim_grid = calculate_dim_grid(ek_parameters.number_of_nodes, 4, threads_per_block); @@ -3831,9 +3829,9 @@ ekfloat ek_calculate_net_charge() { KERNELCALL(ek_calculate_system_charge, dim_grid, threads_per_block, charge_gpu); - ekfloat charge; + float charge; cuda_safe_mem( - cudaMemcpy(&charge, charge_gpu, sizeof(ekfloat), cudaMemcpyDeviceToHost)); + cudaMemcpy(&charge, charge_gpu, sizeof(float), cudaMemcpyDeviceToHost)); if (ek_parameters.es_coupling) charge += ek_get_particle_charge(); @@ -3850,7 +3848,7 @@ int ek_neutralize_system(int species) { if (ek_parameters.valency[species_index] == 0.0f) return 2; - ekfloat compensating_species_density = 0.0f; + float compensating_species_density = 0.0f; #ifndef EK_BOUNDARIES for (unsigned i = 0; i < ek_parameters.number_of_species; i++) @@ -3862,22 +3860,22 @@ int ek_neutralize_system(int species) { compensating_species_density / ek_parameters.valency[species_index]; if (ek_parameters.es_coupling) { - ekfloat particle_charge = ek_get_particle_charge(); + float particle_charge = ek_get_particle_charge(); compensating_species_density -= particle_charge / ek_parameters.valency[species_index] / Utils::int_pow<3>(ek_parameters.agrid) / - ekfloat(ek_parameters.number_of_nodes); + float(ek_parameters.number_of_nodes); } #else - ekfloat charge = ek_calculate_net_charge(); + float charge = ek_calculate_net_charge(); compensating_species_density = ek_parameters.density[species_index] - (charge / ek_parameters.valency[species_index]) / (Utils::int_pow<3>(ek_parameters.agrid) * - ekfloat(static_cast(ek_parameters.number_of_nodes) - - ek_parameters.number_of_boundary_nodes)); + float(static_cast(ek_parameters.number_of_nodes) - + ek_parameters.number_of_boundary_nodes)); #endif // EK_BOUNDARIES if 
(compensating_species_density < 0.0f) @@ -3890,13 +3888,13 @@ int ek_neutralize_system(int species) { int ek_save_checkpoint(char *filename, char *lb_filename) { std::ofstream fout(filename, std::ofstream::binary); - std::vector densities(ek_parameters.number_of_nodes); + std::vector densities(ek_parameters.number_of_nodes); auto const nchars = - static_cast(densities.size() * sizeof(ekfloat)); + static_cast(densities.size() * sizeof(float)); for (unsigned i = 0; i < ek_parameters.number_of_species; i++) { cuda_safe_mem(cudaMemcpy(densities.data(), ek_parameters.rho[i], - densities.size() * sizeof(ekfloat), + densities.size() * sizeof(float), cudaMemcpyDeviceToHost)); if (!fout.write(reinterpret_cast(densities.data()), nchars)) { @@ -3915,9 +3913,9 @@ int ek_load_checkpoint(char *filename) { std::string fname(filename); std::ifstream fin((const char *)(fname + ".ek").c_str(), std::ifstream::binary); - std::vector densities(ek_parameters.number_of_nodes); + std::vector densities(ek_parameters.number_of_nodes); auto const nchars = - static_cast(densities.size() * sizeof(ekfloat)); + static_cast(densities.size() * sizeof(float)); for (unsigned i = 0; i < ek_parameters.number_of_species; i++) { if (!fin.read(reinterpret_cast(densities.data()), nchars)) { @@ -3926,7 +3924,7 @@ int ek_load_checkpoint(char *filename) { } cuda_safe_mem(cudaMemcpy(ek_parameters.rho[i], densities.data(), - densities.size() * sizeof(ekfloat), + densities.size() * sizeof(float), cudaMemcpyHostToDevice)); } diff --git a/src/core/grid_based_algorithms/lb_boundaries.cpp b/src/core/grid_based_algorithms/lb_boundaries.cpp index 3f7338ef2b3..5ddae81cf10 100644 --- a/src/core/grid_based_algorithms/lb_boundaries.cpp +++ b/src/core/grid_based_algorithms/lb_boundaries.cpp @@ -75,7 +75,7 @@ void ek_init_boundaries() { #if defined(CUDA) && defined(EK_BOUNDARIES) int number_of_boundnodes = 0; - std::vector host_wallcharge_species_density; + std::vector host_wallcharge_species_density; float node_wallcharge 
= 0.0f; int wallcharge_species = -1, charged_boundaries = 0; bool node_charged = false; diff --git a/src/core/grid_based_algorithms/lbgpu.hpp b/src/core/grid_based_algorithms/lbgpu.hpp index 3daf39e0bec..3c02b4f12ae 100644 --- a/src/core/grid_based_algorithms/lbgpu.hpp +++ b/src/core/grid_based_algorithms/lbgpu.hpp @@ -43,11 +43,7 @@ * thus making the code more efficient. */ #define LBQ 19 -#if defined(LB_DOUBLE_PREC) || defined(EK_DOUBLE_PREC) -typedef double lbForceFloat; -#else typedef float lbForceFloat; -#endif /** Parameters for the lattice Boltzmann system for GPU. */ struct LB_parameters_gpu { diff --git a/src/python/espressomd/electrokinetics.pxd b/src/python/espressomd/electrokinetics.pxd index 7be18e61e6f..4c6fa8bad7e 100644 --- a/src/python/espressomd/electrokinetics.pxd +++ b/src/python/espressomd/electrokinetics.pxd @@ -20,11 +20,6 @@ from libcpp cimport bool IF ELECTROKINETICS and CUDA: cdef extern from "grid_based_algorithms/electrokinetics.hpp": - IF EK_DOUBLE_PREC: - ctypedef double ekfloat - ELSE: - ctypedef float ekfloat - DEF MAX_NUMBER_OF_SPECIES = 10 # EK data struct @@ -63,10 +58,10 @@ IF ELECTROKINETICS and CUDA: bool advection bool fluidcoupling_ideal_contribution float * charge_potential - ekfloat * j + float * j float * lb_force_density_previous - ekfloat * j_fluc - ekfloat * rho[MAX_NUMBER_OF_SPECIES] + float * j_fluc + float * rho[MAX_NUMBER_OF_SPECIES] int species_index[MAX_NUMBER_OF_SPECIES] float density[MAX_NUMBER_OF_SPECIES] float D[MAX_NUMBER_OF_SPECIES] @@ -112,9 +107,9 @@ IF ELECTROKINETICS and CUDA: bool advection bool fluidcoupling_ideal_contribution float * charge_potential - ekfloat * j + float * j float * lb_force_density_previous - ekfloat * rho[MAX_NUMBER_OF_SPECIES] + float * rho[MAX_NUMBER_OF_SPECIES] int species_index[MAX_NUMBER_OF_SPECIES] float density[MAX_NUMBER_OF_SPECIES] float D[MAX_NUMBER_OF_SPECIES] @@ -164,7 +159,7 @@ IF ELECTROKINETICS and CUDA: int ek_node_print_flux(int species, int x, int y, int z, double 
* flux) int ek_node_print_potential(int x, int y, int z, double * potential) int ek_node_set_density(int species, int x, int y, int z, double density) - ekfloat ek_calculate_net_charge() + float ek_calculate_net_charge() int ek_neutralize_system(int species) int ek_save_checkpoint(char * filename, char * lb_filename) int ek_load_checkpoint(char * filename) From 955ad2a71f9583af644fb35815d8015708ab1a0e Mon Sep 17 00:00:00 2001 From: Kai Szuttor Date: Tue, 30 Mar 2021 15:20:21 +0200 Subject: [PATCH 2/4] cuda: remove lbForceFloat. --- .../electrokinetics_cuda.cu | 9 ++++---- src/core/grid_based_algorithms/lbgpu.hpp | 2 -- src/core/grid_based_algorithms/lbgpu_cuda.cu | 23 ++++++++----------- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/core/grid_based_algorithms/electrokinetics_cuda.cu b/src/core/grid_based_algorithms/electrokinetics_cuda.cu index 49169fadb19..d74cce80544 100644 --- a/src/core/grid_based_algorithms/electrokinetics_cuda.cu +++ b/src/core/grid_based_algorithms/electrokinetics_cuda.cu @@ -3478,12 +3478,11 @@ int ek_print_vtk_lbforce_density(char *filename) { return 1; } - std::vector lbforce_density(ek_parameters.number_of_nodes * 3); + std::vector lbforce_density(ek_parameters.number_of_nodes * 3); - cuda_safe_mem( - cudaMemcpy(lbforce_density.data(), node_f.force_density_buf, - ek_parameters.number_of_nodes * 3 * sizeof(lbForceFloat), - cudaMemcpyDeviceToHost)); + cuda_safe_mem(cudaMemcpy(lbforce_density.data(), node_f.force_density_buf, + ek_parameters.number_of_nodes * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); fprintf(fp, "\ # vtk DataFile Version 2.0\n\ diff --git a/src/core/grid_based_algorithms/lbgpu.hpp b/src/core/grid_based_algorithms/lbgpu.hpp index 3c02b4f12ae..759a08e410f 100644 --- a/src/core/grid_based_algorithms/lbgpu.hpp +++ b/src/core/grid_based_algorithms/lbgpu.hpp @@ -43,8 +43,6 @@ * thus making the code more efficient. 
*/ #define LBQ 19 -typedef float lbForceFloat; - /** Parameters for the lattice Boltzmann system for GPU. */ struct LB_parameters_gpu { /** number density (LB units) */ diff --git a/src/core/grid_based_algorithms/lbgpu_cuda.cu b/src/core/grid_based_algorithms/lbgpu_cuda.cu index 230ce067f55..897dff5d7cf 100644 --- a/src/core/grid_based_algorithms/lbgpu_cuda.cu +++ b/src/core/grid_based_algorithms/lbgpu_cuda.cu @@ -108,11 +108,6 @@ static bool *device_gpu_lb_initialized = nullptr; */ static bool intflag = true; LB_nodes_gpu *current_nodes = nullptr; -/** @name defining size values for allocating global memory */ -/**@{*/ -static size_t size_of_rho_v; -static size_t size_of_rho_v_pi; -/**@}*/ /** Parameters residing in constant memory */ __device__ __constant__ LB_parameters_gpu para[1]; @@ -2077,15 +2072,14 @@ void lb_init_GPU(const LB_parameters_gpu &lbpar_gpu) { cudaMemset(var, 0, size); \ } - size_of_rho_v = lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_gpu); - size_of_rho_v_pi = lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_pi_gpu); - /* Allocate structs in device memory*/ - free_realloc_and_clear(device_rho_v, size_of_rho_v); + free_realloc_and_clear(device_rho_v, + lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_gpu)); /* TODO: this is almost a copy of device_rho_v; think about eliminating * it, and maybe pi can be added to device_rho_v in this case */ - free_realloc_and_clear(print_rho_v_pi, size_of_rho_v_pi); + free_realloc_and_clear(print_rho_v_pi, + lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_pi_gpu)); free_realloc_and_clear(nodes_a.populations, lbpar_gpu.number_of_nodes * sizeof(Utils::Array)); @@ -2093,10 +2087,12 @@ void lb_init_GPU(const LB_parameters_gpu &lbpar_gpu) { lbpar_gpu.number_of_nodes * sizeof(Utils::Array)); free_realloc_and_clear(node_f.force_density, - lbpar_gpu.number_of_nodes * 3 * sizeof(lbForceFloat)); + lbpar_gpu.number_of_nodes * + sizeof(Utils::Array)); #if defined(VIRTUAL_SITES_INERTIALESS_TRACERS) || defined(EK_DEBUG) 
free_realloc_and_clear(node_f.force_density_buf, - lbpar_gpu.number_of_nodes * 3 * sizeof(lbForceFloat)); + lbpar_gpu.number_of_nodes * + sizeof(Utils::Array)); #endif free_realloc_and_clear(boundaries.index, lbpar_gpu.number_of_nodes * sizeof(unsigned int)); @@ -2276,7 +2272,8 @@ void lb_get_values_GPU(LB_rho_v_pi_gpu *host_values) { KERNELCALL(get_mesoscopic_values_in_LB_units, dim_grid, threads_per_block, *current_nodes, print_rho_v_pi, device_rho_v, node_f); - cuda_safe_mem(cudaMemcpy(host_values, print_rho_v_pi, size_of_rho_v_pi, + cuda_safe_mem(cudaMemcpy(host_values, print_rho_v_pi, + lbpar_gpu.number_of_nodes * sizeof(LB_rho_v_pi_gpu), cudaMemcpyDeviceToHost)); } From 41378353ad168979d75eb89ee04ece724dbcf9fc Mon Sep 17 00:00:00 2001 From: Kai Szuttor Date: Tue, 30 Mar 2021 15:27:54 +0200 Subject: [PATCH 3/4] cuda: remove dds_float. --- src/core/actor/DipolarBarnesHut.hpp | 3 - src/core/actor/DipolarBarnesHut_cuda.cu | 28 ++++---- src/core/actor/DipolarBarnesHut_cuda.cuh | 6 +- src/core/actor/DipolarDirectSum.hpp | 19 +++-- src/core/actor/DipolarDirectSum_cuda.cu | 91 +++++++++++------------- 5 files changed, 65 insertions(+), 82 deletions(-) diff --git a/src/core/actor/DipolarBarnesHut.hpp b/src/core/actor/DipolarBarnesHut.hpp index b010ebe3804..f4019c34d29 100644 --- a/src/core/actor/DipolarBarnesHut.hpp +++ b/src/core/actor/DipolarBarnesHut.hpp @@ -33,9 +33,6 @@ #include -// This needs to be done in the .cu file too -typedef float dds_float; - class DipolarBarnesHut : public Actor { public: DipolarBarnesHut(SystemInterface &s, float epssq, float itolsq) { diff --git a/src/core/actor/DipolarBarnesHut_cuda.cu b/src/core/actor/DipolarBarnesHut_cuda.cu index 1b6b72c4c61..f2e2185b896 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cu +++ b/src/core/actor/DipolarBarnesHut_cuda.cu @@ -36,12 +36,8 @@ #include -typedef float dds_float; - #define IND (blockDim.x * blockIdx.x + threadIdx.x) -using namespace std; - // Method performance/accuracy parameters 
__constant__ float epssqd[1], itolsqd[1]; // blkcntd is a factual blocks' count. @@ -57,7 +53,7 @@ __device__ __constant__ volatile BHData bhpara[1]; // The "half-convolution" multi-thread reduction. // The thread with a lower index will operate longer and // final result (here: the sum) is flowing down towards zero thread. -__device__ void dds_sumReduction_BH(dds_float *input, dds_float *sum) { +__device__ void dds_sumReduction_BH(float *input, float *sum) { auto tid = static_cast(threadIdx.x); for (auto i = static_cast(blockDim.x); i > 1; i /= 2) { __syncthreads(); @@ -690,7 +686,7 @@ __global__ __launch_bounds__(THREADS4, FACTOR4) void sortKernel() { /******************************************************************************/ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( - dds_float pf, float *force, float *torque) { + float pf, float *force, float *torque) { int i, j, k, l, n, depth, base, sbase, diff, t; float tmp; // dr is a distance between particles. @@ -902,7 +898,7 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void forceCalculationKernel( /******************************************************************************/ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( - dds_float pf, dds_float *energySum) { + float pf, float *energySum) { // NOTE: the algorithm of this kernel is almost identical to // forceCalculationKernel. See comments there. @@ -912,8 +908,8 @@ __global__ __launch_bounds__(THREADS5, FACTOR5) void energyCalculationKernel( __shared__ int pos[MAXDEPTH * THREADS5 / WARPSIZE], node[MAXDEPTH * THREADS5 / WARPSIZE]; __shared__ float dq[MAXDEPTH * THREADS5 / WARPSIZE]; - dds_float sum = 0.0; - extern __shared__ dds_float res[]; + float sum = 0.0; + extern __shared__ float res[]; float b, d1, dd5; @@ -1120,7 +1116,7 @@ void sortBH(int blocks) { } // Force calculation. 
-int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) { +int forceBH(BHData *bh_data, float k, float *f, float *torque) { int error_code = 0; dim3 grid(1, 1, 1); dim3 block(1, 1, 1); @@ -1137,7 +1133,7 @@ int forceBH(BHData *bh_data, dds_float k, float *f, float *torque) { } // Energy calculation. -int energyBH(BHData *bh_data, dds_float k, float *E) { +int energyBH(BHData *bh_data, float k, float *E) { int error_code = 0; dim3 grid(1, 1, 1); dim3 block(1, 1, 1); @@ -1145,18 +1141,18 @@ int energyBH(BHData *bh_data, dds_float k, float *E) { grid.x = bh_data->blocks * FACTOR5; block.x = THREADS5; - dds_float *energySum; - cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x))); + float *energySum; + cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x))); // cleanup the memory for the energy sum - cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(dds_float) * grid.x))); + cuda_safe_mem(cudaMemset(energySum, 0, (int)(sizeof(float) * grid.x))); KERNELCALL_shared(energyCalculationKernel, grid, block, - block.x * sizeof(dds_float), k, energySum); + block.x * sizeof(float), k, energySum); cuda_safe_mem(cudaDeviceSynchronize()); // Sum the results of all blocks // One energy part per block in the prev kernel - thrust::device_ptr t(energySum); + thrust::device_ptr t(energySum); float x = thrust::reduce(t, t + grid.x); cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice)); diff --git a/src/core/actor/DipolarBarnesHut_cuda.cuh b/src/core/actor/DipolarBarnesHut_cuda.cuh index 95079830c2e..19b542c363b 100644 --- a/src/core/actor/DipolarBarnesHut_cuda.cuh +++ b/src/core/actor/DipolarBarnesHut_cuda.cuh @@ -25,8 +25,6 @@ #ifdef DIPOLAR_BARNES_HUT -typedef float dds_float; - typedef struct { /// CUDA blocks int blocks; @@ -117,10 +115,10 @@ void summarizeBH(int blocks); void sortBH(int blocks); /// Barnes-Hut force calculation. 
-int forceBH(BHData *bh_data, dds_float k, float *f, float *torque); +int forceBH(BHData *bh_data, float k, float *f, float *torque); /// Barnes-Hut energy calculation. -int energyBH(BHData *bh_data, dds_float k, float *E); +int energyBH(BHData *bh_data, float k, float *E); #endif // DIPOLAR_BARNES_HUT #endif /* DIPOLARBARNESHUT_CUH_ */ diff --git a/src/core/actor/DipolarDirectSum.hpp b/src/core/actor/DipolarDirectSum.hpp index 337fe50eab6..809bec5b828 100644 --- a/src/core/actor/DipolarDirectSum.hpp +++ b/src/core/actor/DipolarDirectSum.hpp @@ -32,15 +32,12 @@ #include -// This needs to be done in the .cu file too!!!!! -typedef float dds_float; - -void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos, - float *dip, dds_float box_l[3], +void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos, + float *dip, float box_l[3], int periodic[3], float *E); -void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos, +void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, float *dip, float *f, float *torque, - dds_float box_l[3], int periodic[3]); + float box_l[3], int periodic[3]); class DipolarDirectSum : public Actor { public: @@ -58,10 +55,10 @@ class DipolarDirectSum : public Actor { runtimeErrorMsg() << "DipolarDirectSum needs access to dipoles on GPU!"; }; void computeForces(SystemInterface &s) override { - dds_float box[3]; + float box[3]; int per[3]; for (int i = 0; i < 3; i++) { - box[i] = static_cast(s.box()[i]); + box[i] = static_cast(s.box()[i]); per[i] = (box_geo.periodic(i)); } DipolarDirectSum_kernel_wrapper_force( @@ -69,10 +66,10 @@ class DipolarDirectSum : public Actor { s.fGpuBegin(), s.torqueGpuBegin(), box, per); }; void computeEnergy(SystemInterface &s) override { - dds_float box[3]; + float box[3]; int per[3]; for (int i = 0; i < 3; i++) { - box[i] = static_cast(s.box()[i]); + box[i] = static_cast(s.box()[i]); per[i] = (box_geo.periodic(i)); } DipolarDirectSum_kernel_wrapper_energy( 
diff --git a/src/core/actor/DipolarDirectSum_cuda.cu b/src/core/actor/DipolarDirectSum_cuda.cu index 278bf4e9eb4..02df0d4e9e7 100644 --- a/src/core/actor/DipolarDirectSum_cuda.cu +++ b/src/core/actor/DipolarDirectSum_cuda.cu @@ -32,12 +32,8 @@ #error CU-file includes mpi.h! This should not happen! #endif -// This needs to be done in the .cu file too!!!!! -typedef float dds_float; - -__device__ inline void get_mi_vector_dds(dds_float res[3], dds_float const a[3], - dds_float const b[3], - dds_float const box_l[3], +__device__ inline void get_mi_vector_dds(float res[3], float const a[3], + float const b[3], float const box_l[3], int const periodic[3]) { for (int i = 0; i < 3; i++) { res[i] = a[i] - b[i]; @@ -48,19 +44,19 @@ __device__ inline void get_mi_vector_dds(dds_float res[3], dds_float const a[3], #define scalar(a, b) ((a)[0] * (b)[0] + (a)[1] * (b)[1] + (a)[2] * (b)[2]) -__device__ void dipole_ia_force(int id, dds_float pf, float const *r1, +__device__ void dipole_ia_force(int id, float pf, float const *r1, float const *r2, float const *dip1, - float const *dip2, dds_float *f1, - dds_float *torque1, dds_float *torque2, - dds_float box_l[3], int periodic[3]) { - dds_float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv, - a, b, cc, d, ab; + float const *dip2, float *f1, float *torque1, + float *torque2, float box_l[3], + int periodic[3]) { + float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv, r7_inv, a, b, + cc, d, ab; #ifdef ROTATION - dds_float bx, by, bz, ax, ay, az; + float bx, by, bz, ax, ay, az; #endif - dds_float dr[3]; - dds_float _r1[3], _r2[3], _dip1[3], _dip2[3]; + float dr[3]; + float _r1[3], _r2[3], _dip1[3], _dip2[3]; for (int i = 0; i < 3; i++) { _r1[i] = r1[i]; @@ -121,13 +117,13 @@ __device__ void dipole_ia_force(int id, dds_float pf, float const *r1, #endif } -__device__ dds_float dipole_ia_energy(int id, dds_float pf, float const *r1, - float const *r2, float const *dip1, - float const *dip2, dds_float box_l[3], 
- int periodic[3]) { - dds_float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv; - dds_float dr[3]; - dds_float _r1[3], _r2[3], _dip1[3], _dip2[3]; +__device__ float dipole_ia_energy(int id, float pf, float const *r1, + float const *r2, float const *dip1, + float const *dip2, float box_l[3], + int periodic[3]) { + float r_inv, pe1, pe2, pe3, pe4, r_sq, r3_inv, r5_inv, r_sq_inv; + float dr[3]; + float _r1[3], _r2[3], _dip1[3], _dip2[3]; for (int i = 0; i < 3; i++) { _r1[i] = r1[i]; @@ -156,9 +152,9 @@ __device__ dds_float dipole_ia_energy(int id, dds_float pf, float const *r1, return pf * (pe1 * r3_inv - pe4 * pe2 * pe3); } -__global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos, +__global__ void DipolarDirectSum_kernel_force(float pf, int n, float *pos, float *dip, float *f, - float *torque, dds_float box_l[3], + float *torque, float box_l[3], int periodic[3]) { auto i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); @@ -168,10 +164,10 @@ __global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos, // Kahan summation based on the wikipedia article // Force - dds_float fi[3], fsum[3], tj[3]; + float fi[3], fsum[3], tj[3]; // Torque - dds_float ti[3], tsum[3]; + float ti[3], tsum[3]; // There is one thread per particle. Each thread computes interactions // with particles whose id is smaller than the thread id. 
@@ -207,7 +203,7 @@ __global__ void DipolarDirectSum_kernel_force(dds_float pf, int n, float *pos, } } -__device__ void dds_sumReduction(dds_float *input, dds_float *sum) { +__device__ void dds_sumReduction(float *input, float *sum) { auto tid = static_cast(threadIdx.x); for (auto i = static_cast(blockDim.x); i > 1; i /= 2) { __syncthreads(); @@ -222,14 +218,14 @@ __device__ void dds_sumReduction(dds_float *input, dds_float *sum) { } } -__global__ void DipolarDirectSum_kernel_energy(dds_float pf, int n, float *pos, - float *dip, dds_float box_l[3], +__global__ void DipolarDirectSum_kernel_energy(float pf, int n, float *pos, + float *dip, float box_l[3], int periodic[3], - dds_float *energySum) { + float *energySum) { auto i = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - dds_float sum = 0.0; - extern __shared__ dds_float res[]; + float sum = 0.0; + extern __shared__ float res[]; // There is one thread per particle. Each thread computes interactions // with particles whose id is larger than the thread id. 
@@ -253,10 +249,9 @@ __global__ void DipolarDirectSum_kernel_energy(dds_float pf, int n, float *pos, dds_sumReduction(res, &(energySum[blockIdx.x])); } -void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos, +void DipolarDirectSum_kernel_wrapper_force(float k, int n, float *pos, float *dip, float *f, float *torque, - dds_float box_l[3], - int periodic[3]) { + float box_l[3], int periodic[3]) { const int bs = 64; dim3 grid(1, 1, 1); @@ -273,12 +268,12 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos, block.x = bs; } - dds_float *box_l_gpu; + float *box_l_gpu; int *periodic_gpu; - cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float))); + cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float))); cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int))); - cuda_safe_mem(cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(dds_float), - cudaMemcpyHostToDevice)); + cuda_safe_mem( + cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice)); cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int), cudaMemcpyHostToDevice)); @@ -288,8 +283,8 @@ void DipolarDirectSum_kernel_wrapper_force(dds_float k, int n, float *pos, cudaFree(periodic_gpu); } -void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos, - float *dip, dds_float box_l[3], +void DipolarDirectSum_kernel_wrapper_energy(float k, int n, float *pos, + float *dip, float box_l[3], int periodic[3], float *E) { const int bs = 512; @@ -307,27 +302,27 @@ void DipolarDirectSum_kernel_wrapper_energy(dds_float k, int n, float *pos, block.x = bs; } - dds_float *box_l_gpu; + float *box_l_gpu; int *periodic_gpu; - cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(dds_float))); + cuda_safe_mem(cudaMalloc((void **)&box_l_gpu, 3 * sizeof(float))); cuda_safe_mem(cudaMalloc((void **)&periodic_gpu, 3 * sizeof(int))); cuda_safe_mem( cudaMemcpy(box_l_gpu, box_l, 3 * sizeof(float), cudaMemcpyHostToDevice)); 
cuda_safe_mem(cudaMemcpy(periodic_gpu, periodic, 3 * sizeof(int), cudaMemcpyHostToDevice)); - dds_float *energySum; - cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(dds_float) * grid.x))); + float *energySum; + cuda_safe_mem(cudaMalloc(&energySum, (int)(sizeof(float) * grid.x))); // This will sum the energies up to the block level KERNELCALL_shared(DipolarDirectSum_kernel_energy, grid, block, - bs * sizeof(dds_float), k, n, pos, dip, box_l_gpu, - periodic_gpu, energySum); + bs * sizeof(float), k, n, pos, dip, box_l_gpu, periodic_gpu, + energySum); // Sum the results of all blocks // One thread per block in the prev kernel // KERNELCALL(sumKernel,1,1,energySum,block.x,E); - thrust::device_ptr t(energySum); + thrust::device_ptr t(energySum); float x = thrust::reduce(t, t + grid.x); cuda_safe_mem(cudaMemcpy(E, &x, sizeof(float), cudaMemcpyHostToDevice)); From 3016f392168b1c645bff1003b65ab8b4adf7db02 Mon Sep 17 00:00:00 2001 From: Kai Szuttor Date: Tue, 30 Mar 2021 16:12:27 +0200 Subject: [PATCH 4/4] cuda: remove mmm1dgpu_real. 
--- src/core/actor/Mmm1dgpuForce.hpp | 23 ++- src/core/actor/Mmm1dgpuForce_cuda.cu | 205 +++++++++++------------ src/core/actor/specfunc_cuda.hpp | 50 +++--- src/python/espressomd/electrostatics.pxd | 21 ++- src/python/espressomd/electrostatics.pyx | 2 +- 5 files changed, 141 insertions(+), 160 deletions(-) diff --git a/src/core/actor/Mmm1dgpuForce.hpp b/src/core/actor/Mmm1dgpuForce.hpp index 5616e8a51ad..24a8c2fc9d8 100644 --- a/src/core/actor/Mmm1dgpuForce.hpp +++ b/src/core/actor/Mmm1dgpuForce.hpp @@ -26,25 +26,22 @@ #include "Actor.hpp" #include "SystemInterface.hpp" -typedef float mmm1dgpu_real; - class Mmm1dgpuForce : public Actor { public: // constructor - Mmm1dgpuForce(SystemInterface &s, mmm1dgpu_real coulomb_prefactor, - mmm1dgpu_real maxPWerror, mmm1dgpu_real far_switch_radius = -1, - int bessel_cutoff = -1); + Mmm1dgpuForce(SystemInterface &s, float coulomb_prefactor, float maxPWerror, + float far_switch_radius = -1, int bessel_cutoff = -1); ~Mmm1dgpuForce() override; // interface methods void computeForces(SystemInterface &s) override; void computeEnergy(SystemInterface &s) override; // configuration methods void setup(SystemInterface &s); - void tune(SystemInterface &s, mmm1dgpu_real _maxPWerror, - mmm1dgpu_real _far_switch_radius, int _bessel_cutoff); - void set_params(mmm1dgpu_real _boxz, mmm1dgpu_real _coulomb_prefactor, - mmm1dgpu_real _maxPWerror, mmm1dgpu_real _far_switch_radius, - int _bessel_cutoff, bool manual = false); + void tune(SystemInterface &s, float _maxPWerror, float _far_switch_radius, + int _bessel_cutoff); + void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, + float _far_switch_radius, int _bessel_cutoff, + bool manual = false); void activate(); void deactivate(); @@ -55,7 +52,7 @@ class Mmm1dgpuForce : public Actor { // the box length currently set on the GPU // Needed to make sure it hasn't been modified after inter coulomb was used. 
- mmm1dgpu_real host_boxz; + float host_boxz; // the number of particles we had during the last run. Needed to check if we // have to realloc dev_forcePairs int host_npart; @@ -66,10 +63,10 @@ class Mmm1dgpuForce : public Actor { // pairs==2: return forces using a global memory reduction int pairs; // variables for forces and energies calculated pre-reduction - mmm1dgpu_real *dev_forcePairs, *dev_energyBlocks; + float *dev_forcePairs, *dev_energyBlocks; // MMM1D parameters - mmm1dgpu_real coulomb_prefactor, maxPWerror, far_switch_radius; + float coulomb_prefactor, maxPWerror, far_switch_radius; int bessel_cutoff; // run a single force calculation and return the time it takes using diff --git a/src/core/actor/Mmm1dgpuForce_cuda.cu b/src/core/actor/Mmm1dgpuForce_cuda.cu index e217e652e5e..f2b138c2af1 100644 --- a/src/core/actor/Mmm1dgpuForce_cuda.cu +++ b/src/core/actor/Mmm1dgpuForce_cuda.cu @@ -50,12 +50,12 @@ const int deviceCount = 1; #undef cudaSetDevice #define cudaSetDevice(d) -__constant__ mmm1dgpu_real far_switch_radius_2[1] = {0.05f * 0.05f}; -__constant__ mmm1dgpu_real boxz[1]; -__constant__ mmm1dgpu_real uz[1]; -__constant__ mmm1dgpu_real coulomb_prefactor[1] = {1.0f}; +__constant__ float far_switch_radius_2[1] = {0.05f * 0.05f}; +__constant__ float boxz[1]; +__constant__ float uz[1]; +__constant__ float coulomb_prefactor[1] = {1.0f}; __constant__ int bessel_cutoff[1] = {5}; -__constant__ mmm1dgpu_real maxPWerror[1] = {1e-5f}; +__constant__ float maxPWerror[1] = {1e-5f}; // order hardcoded. mmm1d_recalcTables() typically does order less than 30. 
// As the coefficients are stored in __constant__ memory, the array needs to be @@ -68,15 +68,15 @@ const int modpsi_constant_size = modpsi_order * modpsi_order * 2; __constant__ int device_n_modPsi[1] = {0}; __constant__ int device_linModPsi_offsets[2 * modpsi_order], device_linModPsi_lengths[2 * modpsi_order]; -__constant__ mmm1dgpu_real device_linModPsi[modpsi_constant_size]; +__constant__ float device_linModPsi[modpsi_constant_size]; -__device__ mmm1dgpu_real dev_mod_psi_even(int n, mmm1dgpu_real x) { +__device__ float dev_mod_psi_even(int n, float x) { return evaluateAsTaylorSeriesAt( &device_linModPsi[device_linModPsi_offsets[2 * n]], device_linModPsi_lengths[2 * n], x * x); } -__device__ mmm1dgpu_real dev_mod_psi_odd(int n, mmm1dgpu_real x) { +__device__ float dev_mod_psi_odd(int n, float x) { return x * evaluateAsTaylorSeriesAt( &device_linModPsi[device_linModPsi_offsets[2 * n + 1]], device_linModPsi_lengths[2 * n + 1], x * x); @@ -96,12 +96,11 @@ int modpsi_init() { } // linearize the coefficients array - std::vector linModPsi(linModPsi_offsets[modPsi.size() - 1] + - linModPsi_lengths[modPsi.size() - 1]); + std::vector linModPsi(linModPsi_offsets[modPsi.size() - 1] + + linModPsi_lengths[modPsi.size() - 1]); for (size_t i = 0; i < modPsi.size(); i++) { for (size_t j = 0; j < modPsi[i].size(); j++) { - linModPsi[linModPsi_offsets[i] + j] = - static_cast(modPsi[i][j]); + linModPsi[linModPsi_offsets[i] + j] = static_cast(modPsi[i][j]); } } @@ -122,7 +121,7 @@ int modpsi_init() { linModPsi_lengths.data(), modPsi.size() * sizeof(int))); cuda_safe_mem(cudaMemcpyToSymbol(device_linModPsi, linModPsi.data(), - linModPsiSize * sizeof(mmm1dgpu_real))); + linModPsiSize * sizeof(float))); auto const n_modPsi = static_cast(modPsi.size() >> 1); cuda_safe_mem(cudaMemcpyToSymbol(device_n_modPsi, &n_modPsi, sizeof(int))); } @@ -130,10 +129,8 @@ int modpsi_init() { return 0; } -Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, - mmm1dgpu_real _coulomb_prefactor, - 
mmm1dgpu_real _maxPWerror, - mmm1dgpu_real _far_switch_radius, +Mmm1dgpuForce::Mmm1dgpuForce(SystemInterface &s, float _coulomb_prefactor, + float _maxPWerror, float _far_switch_radius, int _bessel_cutoff) : numThreads(64), host_boxz(0), host_npart(0), need_tune(true), pairs(-1), dev_forcePairs(nullptr), dev_energyBlocks(nullptr), @@ -161,13 +158,13 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { "Error: Please set box length before initializing MMM1D!"); } if (need_tune && s.npart_gpu() > 0) { - set_params(static_cast(s.box()[2]), - static_cast(coulomb.prefactor), maxPWerror, + set_params(static_cast(s.box()[2]), + static_cast(coulomb.prefactor), maxPWerror, far_switch_radius, bessel_cutoff); tune(s, maxPWerror, far_switch_radius, bessel_cutoff); } if (s.box()[2] != host_boxz) { - set_params(static_cast(s.box()[2]), 0, -1, -1, -1); + set_params(static_cast(s.box()[2]), 0, -1, -1, -1); } if (s.npart_gpu() == host_npart) // unchanged { @@ -185,8 +182,7 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { cudaMemGetInfo(&freeMem, &totalMem); if (freeMem / 2 < 3 * s.npart_gpu() * s.npart_gpu() * - sizeof( - mmm1dgpu_real)) // don't use more than half the device's memory + sizeof(float)) // don't use more than half the device's memory { std::cerr << "Switching to atomicAdd due to memory constraints." 
<< std::endl; @@ -200,12 +196,12 @@ void Mmm1dgpuForce::setup(SystemInterface &s) { { cuda_safe_mem( cudaMalloc((void **)&dev_forcePairs, - 3 * s.npart_gpu() * s.npart_gpu() * sizeof(mmm1dgpu_real))); + 3 * s.npart_gpu() * s.npart_gpu() * sizeof(float))); } if (dev_energyBlocks) cudaFree(dev_energyBlocks); - cuda_safe_mem(cudaMalloc((void **)&dev_energyBlocks, - numBlocks(s) * sizeof(mmm1dgpu_real))); + cuda_safe_mem( + cudaMalloc((void **)&dev_energyBlocks, numBlocks(s) * sizeof(float))); host_npart = static_cast(s.npart_gpu()); } @@ -218,14 +214,10 @@ unsigned int Mmm1dgpuForce::numBlocks(SystemInterface &s) { Mmm1dgpuForce::~Mmm1dgpuForce() { cudaFree(dev_forcePairs); } -__forceinline__ __device__ mmm1dgpu_real sqpow(mmm1dgpu_real x) { - return x * x; -} -__forceinline__ __device__ mmm1dgpu_real cbpow(mmm1dgpu_real x) { - return x * x * x; -} +__forceinline__ __device__ float sqpow(float x) { return x * x; } +__forceinline__ __device__ float cbpow(float x) { return x * x * x; } -__device__ void sumReduction(mmm1dgpu_real *input, mmm1dgpu_real *sum) { +__device__ void sumReduction(float *input, float *sum) { auto tid = static_cast(threadIdx.x); for (auto i = static_cast(blockDim.x) / 2; i > 0; i /= 2) { __syncthreads(); @@ -237,12 +229,12 @@ __device__ void sumReduction(mmm1dgpu_real *input, mmm1dgpu_real *sum) { sum[0] = input[0]; } -__global__ void sumKernel(mmm1dgpu_real *data, int N) { - extern __shared__ mmm1dgpu_real partialsums[]; +__global__ void sumKernel(float *data, int N) { + extern __shared__ float partialsums[]; if (blockIdx.x != 0) return; auto tid = static_cast(threadIdx.x); - mmm1dgpu_real result = 0; + float result = 0; for (int i = 0; i < N; i += static_cast(blockDim.x)) { if (i + tid >= N) @@ -259,16 +251,16 @@ __global__ void sumKernel(mmm1dgpu_real *data, int N) { } } -__global__ void besselTuneKernel(int *result, mmm1dgpu_real far_switch_radius, +__global__ void besselTuneKernel(int *result, float far_switch_radius, int maxCut) { - const 
mmm1dgpu_real c_2pif = 2 * Utils::pi(); - mmm1dgpu_real arg = c_2pif * *uz * far_switch_radius; - mmm1dgpu_real pref = 4 * *uz * max(1.0f, c_2pif * *uz); - mmm1dgpu_real err; + const float c_2pif = 2 * Utils::pi(); + float arg = c_2pif * *uz * far_switch_radius; + float pref = 4 * *uz * max(1.0f, c_2pif * *uz); + float err; int P = 1; do { - err = pref * dev_K1(arg * static_cast(P)) * exp(arg) / arg * - (static_cast(P) - 1 + 1 / arg); + err = pref * dev_K1(arg * static_cast(P)) * exp(arg) / arg * + (static_cast(P) - 1 + 1 / arg); P++; } while (err > *maxPWerror && P <= maxCut); P--; @@ -276,16 +268,16 @@ __global__ void besselTuneKernel(int *result, mmm1dgpu_real far_switch_radius, result[0] = P; } -void Mmm1dgpuForce::tune(SystemInterface &s, mmm1dgpu_real _maxPWerror, - mmm1dgpu_real _far_switch_radius, int _bessel_cutoff) { - mmm1dgpu_real far_switch_radius = _far_switch_radius; +void Mmm1dgpuForce::tune(SystemInterface &s, float _maxPWerror, + float _far_switch_radius, int _bessel_cutoff) { + float far_switch_radius = _far_switch_radius; int bessel_cutoff = _bessel_cutoff; - mmm1dgpu_real maxrad = host_boxz; + float maxrad = host_boxz; if (_far_switch_radius < 0 && _bessel_cutoff < 0) // autodetermine switching radius and Bessel cutoff { - mmm1dgpu_real bestrad = 0, besttime = INFINITY; + float bestrad = 0, besttime = INFINITY; // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter) for (far_switch_radius = 0.05f * maxrad; far_switch_radius < maxrad; @@ -328,17 +320,15 @@ void Mmm1dgpuForce::tune(SystemInterface &s, mmm1dgpu_real _maxPWerror, } } -void Mmm1dgpuForce::set_params(mmm1dgpu_real _boxz, - mmm1dgpu_real _coulomb_prefactor, - mmm1dgpu_real _maxPWerror, - mmm1dgpu_real _far_switch_radius, +void Mmm1dgpuForce::set_params(float _boxz, float _coulomb_prefactor, + float _maxPWerror, float _far_switch_radius, int _bessel_cutoff, bool manual) { if (_boxz > 0 && _far_switch_radius > _boxz) { throw std::runtime_error( "switching radius must not be larger 
than box length"); } - mmm1dgpu_real _far_switch_radius_2 = _far_switch_radius * _far_switch_radius; - mmm1dgpu_real _uz = 1.0f / _boxz; + float _far_switch_radius_2 = _far_switch_radius * _far_switch_radius; + float _uz = 1.0f / _boxz; for (int d = 0; d < deviceCount; d++) { // double colons are needed to access the constant memory variables because // they are file globals and we have identically named class variables @@ -351,18 +341,18 @@ void Mmm1dgpuForce::set_params(mmm1dgpu_real _boxz, if (_far_switch_radius >= 0) { mmm1d_params.far_switch_radius_2 = _far_switch_radius * _far_switch_radius; - cuda_safe_mem(cudaMemcpyToSymbol( - ::far_switch_radius_2, &_far_switch_radius_2, sizeof(mmm1dgpu_real))); + cuda_safe_mem(cudaMemcpyToSymbol(::far_switch_radius_2, + &_far_switch_radius_2, sizeof(float))); far_switch_radius = _far_switch_radius; } if (_boxz > 0) { host_boxz = _boxz; - cuda_safe_mem(cudaMemcpyToSymbol(::boxz, &_boxz, sizeof(mmm1dgpu_real))); - cuda_safe_mem(cudaMemcpyToSymbol(::uz, &_uz, sizeof(mmm1dgpu_real))); + cuda_safe_mem(cudaMemcpyToSymbol(::boxz, &_boxz, sizeof(float))); + cuda_safe_mem(cudaMemcpyToSymbol(::uz, &_uz, sizeof(float))); } if (_coulomb_prefactor != 0) { cuda_safe_mem(cudaMemcpyToSymbol(::coulomb_prefactor, &_coulomb_prefactor, - sizeof(mmm1dgpu_real))); + sizeof(float))); coulomb_prefactor = _coulomb_prefactor; } if (_bessel_cutoff > 0) { @@ -373,8 +363,8 @@ void Mmm1dgpuForce::set_params(mmm1dgpu_real _boxz, } if (_maxPWerror > 0) { mmm1d_params.maxPWerror = _maxPWerror; - cuda_safe_mem(cudaMemcpyToSymbol(::maxPWerror, &_maxPWerror, - sizeof(mmm1dgpu_real))); + cuda_safe_mem( + cudaMemcpyToSymbol(::maxPWerror, &_maxPWerror, sizeof(float))); maxPWerror = _maxPWerror; } } @@ -388,24 +378,24 @@ void Mmm1dgpuForce::set_params(mmm1dgpu_real _boxz, // the MPI loop on the slave nodes is waiting for broadcasts. 
} -__global__ void forcesKernel(const mmm1dgpu_real *__restrict__ r, - const mmm1dgpu_real *__restrict__ q, - mmm1dgpu_real *__restrict__ force, int N, - int pairs, int tStart, int tStop) { +__global__ void forcesKernel(const float *__restrict__ r, + const float *__restrict__ q, + float *__restrict__ force, int N, int pairs, + int tStart, int tStop) { if (tStop < 0) tStop = N * N; - const mmm1dgpu_real c_2pif = 2 * Utils::pi(); + const float c_2pif = 2 * Utils::pi(); for (int tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { int p1 = tid % N, p2 = tid / N; - mmm1dgpu_real x = r[3 * p2] - r[3 * p1], y = r[3 * p2 + 1] - r[3 * p1 + 1], - z = r[3 * p2 + 2] - r[3 * p1 + 2]; - mmm1dgpu_real rxy2 = sqpow(x) + sqpow(y); - mmm1dgpu_real rxy = sqrt(rxy2); - mmm1dgpu_real sum_r = 0, sum_z = 0; + float x = r[3 * p2] - r[3 * p1], y = r[3 * p2 + 1] - r[3 * p1 + 1], + z = r[3 * p2 + 2] - r[3 * p1 + 2]; + float rxy2 = sqpow(x) + sqpow(y); + float rxy = sqrt(rxy2); + float sum_r = 0, sum_z = 0; // if (*boxz <= 0.0) return; // in case we are not initialized yet @@ -418,16 +408,16 @@ __global__ void forcesKernel(const mmm1dgpu_real *__restrict__ r, // anyway) } else if (rxy2 <= *far_switch_radius_2) // near formula { - mmm1dgpu_real uzz = *uz * z; - mmm1dgpu_real uzr = *uz * rxy; + float uzz = *uz * z; + float uzr = *uz * rxy; sum_z = dev_mod_psi_odd(0, uzz); - mmm1dgpu_real uzrpow = uzr; + float uzrpow = uzr; for (int n = 1; n < *device_n_modPsi; n++) { - mmm1dgpu_real sum_r_old = sum_r; - mmm1dgpu_real mpe = dev_mod_psi_even(n, uzz); - mmm1dgpu_real mpo = dev_mod_psi_odd(n, uzz); + float sum_r_old = sum_r; + float mpe = dev_mod_psi_even(n, uzz); + float mpo = dev_mod_psi_odd(n, uzz); - sum_r += 2 * static_cast(n) * mpe * uzrpow; + sum_r += 2 * static_cast(n) * mpe * uzrpow; uzrpow *= uzr; sum_z += mpo * uzrpow; uzrpow *= uzr; @@ -456,18 +446,16 @@ __global__ void forcesKernel(const mmm1dgpu_real 
*__restrict__ r, } else // far formula { for (int p = 1; p < *bessel_cutoff; p++) { - mmm1dgpu_real arg = c_2pif * *uz * static_cast(p); - sum_r += - static_cast(p) * dev_K1(arg * rxy) * cos(arg * z); - sum_z += - static_cast(p) * dev_K0(arg * rxy) * sin(arg * z); + float arg = c_2pif * *uz * static_cast(p); + sum_r += static_cast(p) * dev_K1(arg * rxy) * cos(arg * z); + sum_z += static_cast(p) * dev_K0(arg * rxy) * sin(arg * z); } sum_r *= sqpow(*uz) * 4 * c_2pif; sum_z *= sqpow(*uz) * 4 * c_2pif; sum_r += 2 * *uz / rxy; } - mmm1dgpu_real pref = *coulomb_prefactor * q[p1] * q[p2]; + float pref = *coulomb_prefactor * q[p1] * q[p2]; if (pairs) { force[3 * (p1 + p2 * N - tStart)] = pref * sum_r / rxy * x; force[3 * (p1 + p2 * N - tStart) + 1] = pref * sum_r / rxy * y; @@ -480,17 +468,17 @@ __global__ void forcesKernel(const mmm1dgpu_real *__restrict__ r, } } -__global__ void energiesKernel(const mmm1dgpu_real *__restrict__ r, - const mmm1dgpu_real *__restrict__ q, - mmm1dgpu_real *__restrict__ energy, int N, - int pairs, int tStart, int tStop) { +__global__ void energiesKernel(const float *__restrict__ r, + const float *__restrict__ q, + float *__restrict__ energy, int N, int pairs, + int tStart, int tStop) { if (tStop < 0) tStop = N * N; - auto const c_2pif = 2 * Utils::pi(); - auto const c_gammaf = Utils::gamma(); + auto const c_2pif = 2 * Utils::pi(); + auto const c_gammaf = Utils::gamma(); - extern __shared__ mmm1dgpu_real partialsums[]; + extern __shared__ float partialsums[]; if (!pairs) { partialsums[threadIdx.x] = 0; __syncthreads(); @@ -499,11 +487,11 @@ __global__ void energiesKernel(const mmm1dgpu_real *__restrict__ r, static_cast(threadIdx.x + blockIdx.x * blockDim.x) + tStart; tid < tStop; tid += static_cast(blockDim.x * gridDim.x)) { int p1 = tid % N, p2 = tid / N; - mmm1dgpu_real z = r[3 * p2 + 2] - r[3 * p1 + 2]; - mmm1dgpu_real rxy2 = + float z = r[3 * p2 + 2] - r[3 * p1 + 2]; + float rxy2 = sqpow(r[3 * p2] - r[3 * p1]) + sqpow(r[3 * p2 + 1] - r[3 * 
p1 + 1]); - mmm1dgpu_real rxy = sqrt(rxy2); - mmm1dgpu_real sum_e = 0; + float rxy = sqrt(rxy2); + float sum_e = 0; // if (*boxz <= 0.0) return; // in case we are not initialized yet @@ -514,13 +502,13 @@ __global__ void energiesKernel(const mmm1dgpu_real *__restrict__ r, { } else if (rxy2 <= *far_switch_radius_2) // near formula { - mmm1dgpu_real uzz = *uz * z; - mmm1dgpu_real uzr2 = sqpow(*uz * rxy); - mmm1dgpu_real uzrpow = uzr2; + float uzz = *uz * z; + float uzr2 = sqpow(*uz * rxy); + float uzrpow = uzr2; sum_e = dev_mod_psi_even(0, uzz); for (int n = 1; n < *device_n_modPsi; n++) { - mmm1dgpu_real sum_e_old = sum_e; - mmm1dgpu_real mpe = dev_mod_psi_even(n, uzz); + float sum_e_old = sum_e; + float mpe = dev_mod_psi_even(n, uzz); sum_e += mpe * uzrpow; uzrpow *= uzr2; @@ -537,7 +525,7 @@ __global__ void energiesKernel(const mmm1dgpu_real *__restrict__ r, { sum_e = -(log(rxy * *uz / 2) + c_gammaf) / 2; for (int p = 1; p < *bessel_cutoff; p++) { - mmm1dgpu_real arg = c_2pif * *uz * static_cast(p); + float arg = c_2pif * *uz * static_cast(p); sum_e += dev_K0(arg * rxy) * cos(arg * z); } sum_e *= *uz * 4; @@ -554,9 +542,8 @@ __global__ void energiesKernel(const mmm1dgpu_real *__restrict__ r, } } -__global__ void vectorReductionKernel(mmm1dgpu_real const *src, - mmm1dgpu_real *dst, int N, int tStart, - int tStop) { +__global__ void vectorReductionKernel(float const *src, float *dst, int N, + int tStart, int tStop) { if (tStop < 0) tStop = N * N; @@ -598,8 +585,8 @@ void Mmm1dgpuForce::computeForces(SystemInterface &s) { } } -__global__ void scaleAndAddKernel(mmm1dgpu_real *dst, mmm1dgpu_real const *src, - int N, mmm1dgpu_real factor) { +__global__ void scaleAndAddKernel(float *dst, float const *src, int N, + float factor) { for (auto tid = static_cast(threadIdx.x + blockIdx.x * blockDim.x); tid < N; tid += static_cast(blockDim.x * gridDim.x)) { dst[tid] += src[tid] * factor; @@ -617,7 +604,7 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { if (pairs < 0) 
{ throw std::runtime_error("MMM1D was not initialized correctly"); } - auto shared = static_cast(numThreads * sizeof(mmm1dgpu_real)); + auto shared = static_cast(numThreads * sizeof(float)); KERNELCALL_shared(energiesKernel, numBlocks(s), numThreads, shared, s.rGpuBegin(), s.qGpuBegin(), dev_energyBlocks, @@ -633,10 +620,10 @@ void Mmm1dgpuForce::computeEnergy(SystemInterface &s) { float Mmm1dgpuForce::force_benchmark(SystemInterface &s) { cudaEvent_t eventStart, eventStop; float elapsedTime; - mmm1dgpu_real *dev_f_benchmark; + float *dev_f_benchmark; - cuda_safe_mem(cudaMalloc((void **)&dev_f_benchmark, - 3 * s.npart_gpu() * sizeof(mmm1dgpu_real))); + cuda_safe_mem( + cudaMalloc((void **)&dev_f_benchmark, 3 * s.npart_gpu() * sizeof(float))); cuda_safe_mem(cudaEventCreate(&eventStart)); cuda_safe_mem(cudaEventCreate(&eventStop)); cuda_safe_mem(cudaEventRecord(eventStart, stream[0])); diff --git a/src/core/actor/specfunc_cuda.hpp b/src/core/actor/specfunc_cuda.hpp index b06925b2d68..1664d4e48e2 100644 --- a/src/core/actor/specfunc_cuda.hpp +++ b/src/core/actor/specfunc_cuda.hpp @@ -53,7 +53,7 @@ /** @name Chebyshev expansions based on SLATEC bk0(), bk0e() */ /**@{*/ -__constant__ static mmm1dgpu_real bk0_data[11] = { +__constant__ static float bk0_data[11] = { -.5f - 0.03532739323390276872f, 0.3442898999246284869f, 0.03597993651536150163f, 0.00126461541144692592f, 0.00002286212103119451f, 0.00000025347910790261f, @@ -62,7 +62,7 @@ __constant__ static mmm1dgpu_real bk0_data[11] = { 0.00000000000000000035f}; __constant__ static int bk0_size = 11; -__constant__ static mmm1dgpu_real ak0_data[17] = { +__constant__ static float ak0_data[17] = { 2.5f - 0.07643947903327941f, -0.02235652605699819f, 0.00077341811546938f, -0.00004281006688886f, 0.00000308170017386f, -0.00000026393672220f, 0.00000002563713036f, -0.00000000274270554f, 0.00000000031694296f, @@ -71,7 +71,7 @@ __constant__ static mmm1dgpu_real ak0_data[17] = { -0.00000000000000033f, 0.00000000000000005f}; 
__constant__ static int ak0_size = 16; -__constant__ static mmm1dgpu_real ak02_data[14] = { +__constant__ static float ak02_data[14] = { 2.5f - 0.01201869826307592f, -0.00917485269102569f, 0.00014445509317750f, -0.00000401361417543f, 0.00000015678318108f, -0.00000000777011043f, 0.00000000046111825f, -0.00000000003158592f, 0.00000000000243501f, @@ -82,7 +82,7 @@ __constant__ static int ak02_size = 13; /** @name Chebyshev expansions based on SLATEC besi0() */ /**@{*/ -__constant__ static mmm1dgpu_real bi0_data[12] = { +__constant__ static float bi0_data[12] = { 5.5f - .07660547252839144951f, 1.92733795399380827000f, .22826445869203013390f, .01304891466707290428f, .00043442709008164874f, .00000942265768600193f, @@ -94,7 +94,7 @@ __constant__ static int bi0_size = 12; /** @name Chebyshev expansions based on SLATEC besk1(), besk1e() */ /**@{*/ -__constant__ static mmm1dgpu_real bk1_data[11] = { +__constant__ static float bk1_data[11] = { 1.5f + 0.0253002273389477705f, -0.3531559607765448760f, -0.1226111808226571480f, -0.0069757238596398643f, -0.0001730288957513052f, -0.0000024334061415659f, @@ -103,7 +103,7 @@ __constant__ static mmm1dgpu_real bk1_data[11] = { -0.0000000000000000070f}; __constant__ static int bk1_size = 11; -__constant__ static mmm1dgpu_real ak1_data[17] = { +__constant__ static float ak1_data[17] = { 2.5f + 0.27443134069738830f, 0.07571989953199368f, -0.00144105155647540f, 0.00006650116955125f, -0.00000436998470952f, 0.00000035402774997f, -0.00000003311163779f, 0.00000000344597758f, -0.00000000038989323f, @@ -112,7 +112,7 @@ __constant__ static mmm1dgpu_real ak1_data[17] = { 0.00000000000000038f, -0.00000000000000006f}; __constant__ static int ak1_size = 17; -__constant__ static mmm1dgpu_real ak12_data[14] = { +__constant__ static float ak12_data[14] = { 2.5f + 0.06379308343739001f, 0.02832887813049721f, -0.00024753706739052f, 0.00000577197245160f, -0.00000020689392195f, 0.00000000973998344f, -0.00000000055853361f, 0.00000000003732996f, 
-0.00000000000282505f, @@ -123,7 +123,7 @@ __constant__ static int ak12_size = 14; /** @name Chebyshev expansions based on SLATEC besi1(), besi1e() */ /**@{*/ -__constant__ static mmm1dgpu_real bi1_data[11] = { +__constant__ static float bi1_data[11] = { 1.75f - 0.001971713261099859f, 0.407348876675464810f, 0.034838994299959456f, 0.001545394556300123f, 0.000041888521098377f, 0.000000764902676483f, 0.000000010042493924f, 0.000000000099322077f, 0.000000000000766380f, @@ -131,52 +131,50 @@ __constant__ static mmm1dgpu_real bi1_data[11] = { __constant__ static int bi1_size = 11; /**@}*/ -__device__ mmm1dgpu_real evaluateAsChebychevSeriesAt(mmm1dgpu_real const *c, - int n, mmm1dgpu_real x) { - mmm1dgpu_real x2 = 2 * x; - mmm1dgpu_real dd = c[n - 1]; - mmm1dgpu_real d = x2 * dd + c[n - 2]; +__device__ float evaluateAsChebychevSeriesAt(float const *c, int n, float x) { + float x2 = 2 * x; + float dd = c[n - 1]; + float d = x2 * dd + c[n - 2]; for (int j = n - 3; j >= 1; j--) { - mmm1dgpu_real tmp = d; + float tmp = d; d = x2 * d - dd + c[j]; dd = tmp; } return x * d - dd + c[0] / 2; } -__device__ mmm1dgpu_real evaluateAsTaylorSeriesAt(mmm1dgpu_real const *c, int n, - mmm1dgpu_real x) { +__device__ float evaluateAsTaylorSeriesAt(float const *c, int n, float x) { int cnt = n - 1; - mmm1dgpu_real r = c[cnt]; + float r = c[cnt]; while (--cnt >= 0) r = r * x + c[cnt]; return r; } -__device__ mmm1dgpu_real dev_K0(mmm1dgpu_real x) { - mmm1dgpu_real c = evaluateAsChebychevSeriesAt( +__device__ float dev_K0(float x) { + float c = evaluateAsChebychevSeriesAt( x <= 2 ? bk0_data : x <= 8 ? ak0_data : ak02_data, x <= 2 ? bk0_size : x <= 8 ? ak0_size : ak02_size, x <= 2 ? x * x / 2 - 1.0f : x <= 8 ? 
(16 / x - 5.0f) / 3.0f : (16 / x - 1.0f)); if (x <= 2) { - mmm1dgpu_real I0 = + float I0 = evaluateAsChebychevSeriesAt(bi0_data, bi0_size, x * x / 4.5f - 1.0f); - return (-log(x) + Utils::ln_2<mmm1dgpu_real>()) * I0 + c; + return (-log(x) + Utils::ln_2<float>()) * I0 + c; } return exp(-x) * c * rsqrt(x); } -__device__ mmm1dgpu_real dev_K1(mmm1dgpu_real x) { - mmm1dgpu_real c = evaluateAsChebychevSeriesAt( +__device__ float dev_K1(float x) { + float c = evaluateAsChebychevSeriesAt( x <= 2 ? bk1_data : x <= 8 ? ak1_data : ak12_data, x <= 2 ? bk1_size : x <= 8 ? ak1_size : ak12_size, x <= 2 ? x * x / 2 - 1.0f : x <= 8 ? (16 / x - 5.0f) / 3.0f : (16 / x - 1.0f)); if (x <= 2) { - mmm1dgpu_real I1 = x * evaluateAsChebychevSeriesAt(bi1_data, bi1_size, - x * x / 4.5f - 1.0f); - return (log(x) - Utils::ln_2<mmm1dgpu_real>()) * I1 + c / x; + float I1 = x * evaluateAsChebychevSeriesAt(bi1_data, bi1_size, + x * x / 4.5f - 1.0f); + return (log(x) - Utils::ln_2<float>()) * I1 + c / x; } return exp(-x) * c * rsqrt(x); } diff --git a/src/python/espressomd/electrostatics.pxd b/src/python/espressomd/electrostatics.pxd index 7f9d5e90651..21faed2c243 100644 --- a/src/python/espressomd/electrostatics.pxd +++ b/src/python/espressomd/electrostatics.pxd @@ -138,28 +138,27 @@ IF ELECTROSTATICS: IF ELECTROSTATICS and MMM1D_GPU: cdef extern from "actor/Mmm1dgpuForce.hpp": - ctypedef float mmm1dgpu_real cdef cppclass Mmm1dgpuForce: - Mmm1dgpuForce(SystemInterface & s, mmm1dgpu_real coulomb_prefactor, mmm1dgpu_real maxPWerror, mmm1dgpu_real far_switch_radius, int bessel_cutoff) except+ - Mmm1dgpuForce(SystemInterface & s, mmm1dgpu_real coulomb_prefactor, mmm1dgpu_real maxPWerror, mmm1dgpu_real far_switch_radius) except+ - Mmm1dgpuForce(SystemInterface & s, mmm1dgpu_real coulomb_prefactor, mmm1dgpu_real maxPWerror) except+ + Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror, float far_switch_radius, int bessel_cutoff) except+ + Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror, float 
far_switch_radius) except+ + Mmm1dgpuForce(SystemInterface & s, float coulomb_prefactor, float maxPWerror) except+ void setup(SystemInterface & s) - void tune(SystemInterface & s, mmm1dgpu_real _maxPWerror, mmm1dgpu_real _far_switch_radius, int _bessel_cutoff) - void set_params(mmm1dgpu_real _boxz, mmm1dgpu_real _coulomb_prefactor, mmm1dgpu_real _maxPWerror, mmm1dgpu_real _far_switch_radius, int _bessel_cutoff, bool manual) - void set_params(mmm1dgpu_real _boxz, mmm1dgpu_real _coulomb_prefactor, mmm1dgpu_real _maxPWerror, mmm1dgpu_real _far_switch_radius, int _bessel_cutoff) + void tune(SystemInterface & s, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff) + void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff, bool manual) + void set_params(float _boxz, float _coulomb_prefactor, float _maxPWerror, float _far_switch_radius, int _bessel_cutoff) unsigned int numThreads unsigned int numBlocks(SystemInterface & s) - mmm1dgpu_real host_boxz + float host_boxz int host_npart bool need_tune int pairs - mmm1dgpu_real * dev_forcePairs - mmm1dgpu_real * dev_energyBlocks + float * dev_forcePairs + float * dev_energyBlocks - mmm1dgpu_real coulomb_prefactor, maxPWerror, far_switch_radius + float coulomb_prefactor, maxPWerror, far_switch_radius int bessel_cutoff float force_benchmark(SystemInterface & s) diff --git a/src/python/espressomd/electrostatics.pyx b/src/python/espressomd/electrostatics.pyx index f1af5f2dcbe..43b4c24aabb 100644 --- a/src/python/espressomd/electrostatics.pyx +++ b/src/python/espressomd/electrostatics.pyx @@ -683,7 +683,7 @@ IF ELECTROSTATICS and MMM1D_GPU: default_params = self.default_params() self.thisptr.set_params( - < mmm1dgpu_real > box_geo.length()[2], < mmm1dgpu_real > coulomb.prefactor, + < float > box_geo.length()[2], < float > coulomb.prefactor, self._params["maxPWerror"], self._params["far_switch_radius"], self._params["bessel_cutoff"])