Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPU performance improvements #488

Merged
merged 50 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from 43 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
45333fa
basic benchmarks
DiamonDinoia Jul 3, 2024
b95a082
added plotting script
DiamonDinoia Jul 4, 2024
ae55ca5
optimised plotting
DiamonDinoia Jul 8, 2024
16e27f0
fixed plotting and metrics
DiamonDinoia Jul 8, 2024
49d1f21
fixed the plot script
DiamonDinoia Jul 8, 2024
2fdae68
bin_size_x is as function of the shared memory available
DiamonDinoia Jul 8, 2024
c0d9923
bin_size_x is as function of the shared memory available
DiamonDinoia Jul 8, 2024
907797c
minor optimizations in 1D
DiamonDinoia Jul 9, 2024
60f4780
otpimized nupts driven
DiamonDinoia Jul 12, 2024
35dcc66
Optimized 1D and 2D
DiamonDinoia Jul 15, 2024
e1ad9bb
Merge branch 'master' into gpu-optimizations
DiamonDinoia Jul 15, 2024
366295d
3D integer operations
DiamonDinoia Jul 18, 2024
24bf6be
3D SM and GM optimized
DiamonDinoia Jul 18, 2024
960117a
bump cuda version
DiamonDinoia Jul 18, 2024
4295a86
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
c1b14c6
changed matlab to generate necessary cuda upsampfact files
DiamonDinoia Jul 23, 2024
f300d2d
added new coeffs
DiamonDinoia Jul 23, 2024
e86c762
Merge remote-tracking branch 'refs/remotes/origin/gpu-optimizations' …
DiamonDinoia Jul 23, 2024
db0457a
restoring .m from master
DiamonDinoia Jul 23, 2024
d0ce11e
updated hook
DiamonDinoia Jul 23, 2024
513ce4b
updated matlab upsampfact
DiamonDinoia Jul 23, 2024
798717d
updated coefficients
DiamonDinoia Jul 23, 2024
282baf5
new coeffs
DiamonDinoia Jul 23, 2024
12822a2
updated cufinufft to new coeff
DiamonDinoia Jul 23, 2024
badf22f
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
bf6328b
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
ae783da
picked good defaults for method
DiamonDinoia Jul 24, 2024
d29fcf5
update configuration
DiamonDinoia Jul 24, 2024
73f937b
upated build system
DiamonDinoia Jul 25, 2024
0724866
fixing jenkins
DiamonDinoia Jul 25, 2024
8cd50fc
using cuda 11.2
DiamonDinoia Jul 25, 2024
49a9d7e
using sm90 atomics
DiamonDinoia Jul 25, 2024
041a536
updated script
DiamonDinoia Jul 25, 2024
54683c3
fixed bin sizes
DiamonDinoia Jul 26, 2024
4f19103
Merge branch 'master' into gpu-optimizations
DiamonDinoia Jul 26, 2024
dc3a628
using floor in fold_rescale updated changelog
DiamonDinoia Jul 26, 2024
b3237f7
fixed a mistake
DiamonDinoia Jul 26, 2024
db80aad
added comments for review
DiamonDinoia Jul 26, 2024
c225fb5
fixing review comments
DiamonDinoia Jul 31, 2024
394550f
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 31, 2024
5606aa0
merged master
DiamonDinoia Jul 31, 2024
74ccd71
fixed cmake
DiamonDinoia Jul 31, 2024
ee28d05
Gcc-9 fixes; Ker size fixed too
DiamonDinoia Aug 1, 2024
466ddff
windows compatibility tweak; unit testing the 1.25 upsampfact
DiamonDinoia Aug 1, 2024
3f60ca4
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Aug 1, 2024
fb48ff8
added forgotten c++17 flag
DiamonDinoia Aug 1, 2024
5d7e276
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Aug 2, 2024
afabb3f
Addressing review comments
DiamonDinoia Aug 2, 2024
c3df5e1
Added warning
DiamonDinoia Aug 2, 2024
44c523b
updated changelog
DiamonDinoia Aug 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,16 @@ V 2.3.0beta (7/24/24)
Added support for Windows (llvm, msvc), Linux (llvm, gcc) and MacOS (llvm, gcc).
* cmake support for both ducc0 and fftw
* cmake adding nvcc and msvc optimization flags
* cuFINUFFT binsize is now a function of the shared memory available where
possible.
* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
* cuFINUFFT using the new normalized Horner coefficients and added support
for 1.25.
* cuFINUFFT new compile flags for extra-vectorization, flushing single
precision denormals to 0 and using fma where possible.
* cuFINUFFT using intrinsics in foldrescale and other places to increase
performance
* cuFINUFFT using SM90 float2 vector atomicAdd where supported

V 2.2.0 (12/12/23)

Expand Down
24 changes: 13 additions & 11 deletions devel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,25 @@ project(finufft_devel)
# Set the minimum required version of CMake
cmake_minimum_required(VERSION 3.5)


# include cpm cmake, downloading it
CPMAddPackage(
NAME benchmark
GITHUB_REPOSITORY google/benchmark
VERSION 1.8.3
OPTIONS "BENCHMARK_ENABLE_TESTING OFF"

)
cpmaddpackage(
NAME
benchmark
GITHUB_REPOSITORY
google/benchmark
VERSION
1.8.3
OPTIONS
"BENCHMARK_ENABLE_TESTING OFF")

if (benchmark_ADDED)
# patch benchmark target
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
if(benchmark_ADDED)
# patch benchmark target
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
endif()

add_executable(foldrescale foldrescale.cpp)
target_link_libraries(foldrescale finufft benchmark xsimd)
add_executable(padding padding.cpp)
target_compile_features(padding PRIVATE cxx_std_17)
target_link_libraries(padding finufft xsimd)
target_compile_options(padding PRIVATE -march=native)
12 changes: 6 additions & 6 deletions devel/gen_all_horner_C_code.m
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@

for upsampfac = [2.0, 1.25]; % sigma: either 2 (default) or low (eg 5/4)
fprintf('upsampfac = %g...\n',upsampfac)

ws = 2:16;
opts.wpad = true; % pad kernel eval to multiple of 4
opts.wpad = false; % pad kernel eval to multiple of 4

if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w');
else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w');
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
janden marked this conversation as resolved.
Show resolved Hide resolved
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
end
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
Expand All @@ -27,9 +27,9 @@
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
if j==1 % write switch statement
fwrite(fid,sprintf(' if constexpr(w==%d) {\n',w));
fwrite(fid,sprintf(' if (w==%d) {\n',w));
janden marked this conversation as resolved.
Show resolved Hide resolved
else
fwrite(fid,sprintf(' } else if constexpr(w==%d) {\n',w));
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
end
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
end
Expand Down
4 changes: 2 additions & 2 deletions devel/gen_ker_horner_loop_C_code.m
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@
width = w;
end
for n=1:d+1 % loop over poly coeff powers
s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1));
s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1));
for i=2:width % loop over segments
s = sprintf('%s, %.16E', s, C(n,i));
s = sprintf('%s, %.16E', s, C(n,i));
end
str{n} = [s sprintf('};\n')];
end
Expand Down
3 changes: 3 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,23 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf)

foreach(EXAMPLE ${EXAMPLES})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
target_link_libraries(${EXAMPLE} PRIVATE finufft)
enable_asan(${EXAMPLE})
endforeach()

foreach(EXAMPLE ${EXAMPLES_C})
add_executable(${EXAMPLE} ${EXAMPLE}.c)
target_link_libraries(${EXAMPLE} PRIVATE finufft)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
enable_asan(${EXAMPLE})
endforeach()

if(FINUFFT_USE_OPENMP)
foreach(EXAMPLE ${EXAMPLES_OPENMP})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
enable_asan(${EXAMPLE})
endforeach()
endif()
2 changes: 1 addition & 1 deletion examples/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

file(GLOB example_src "*.cpp")

foreach(srcfile ${example_src})
Expand All @@ -7,4 +6,5 @@ foreach(srcfile ${example_src})
add_executable(${executable} ${srcfile})
target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
target_link_libraries(${executable} cufinufft)
target_compile_features(${executable} PRIVATE cxx_std_17)
endforeach()
32 changes: 32 additions & 0 deletions include/cufinufft/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,38 @@ template<typename T>
void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
T *fwkerhalf, finufft_spread_opts opts);

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
int bin_size_z);

template<typename T>
void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);

template<typename T, typename V>
auto cufinufft_set_shared_memory(V *kernel, const int dim,
const cufinufft_plan_t<T> &d_plan) {
int device_id;
cudaGetDevice(&device_id);
const auto shared_mem_required =
shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
int shared_mem_per_block{};
const auto err = cudaDeviceGetAttribute(
&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
if (err != cudaSuccess) {
return err;
}
if (shared_mem_required > shared_mem_per_block) {
fprintf(stderr,
"Error: Shared memory required per block is %zu bytes, but the device "
"supports only %d bytes.\n",
shared_mem_required, shared_mem_per_block);
return err;
}
return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
shared_mem_required);
}

} // namespace common
} // namespace cufinufft
#endif
15 changes: 8 additions & 7 deletions include/cufinufft/contrib/helper_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
}

#define RETURN_IF_CUDA_ERROR \
{ \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
return FINUFFT_ERR_CUDA_FAILURE; \
} \
#define RETURN_IF_CUDA_ERROR \
{ \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \
__FILE__, __LINE__); \
return FINUFFT_ERR_CUDA_FAILURE; \
} \
}

#define CUDA_FREE_AND_NULL(val, stream, pool_supported) \
Expand Down
Loading
Loading