diff --git a/CHANGELOG b/CHANGELOG index c882f716f..cf842bb67 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -54,6 +54,17 @@ V 2.3.0beta (7/24/24) * cmake adding nvcc and msvc optimization flags * cmake supports sphinx * updated install docs +* cuFINUFFT binsize is now a function of the shared memory available where + possible. +* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort. +* cuFINUFFT using the new normalized Horner coefficients and added support + for upsampfac=1.25. +* cuFINUFFT new compile flags for extra-vectorization, flushing single + precision denormals to 0 and using fma where possible. +* cuFINUFFT using intrinsics in foldrescale and other places to increase + performance +* cuFINUFFT using SM90 float2 vector atomicAdd where supported +* cuFINUFFT making default binsize = 0 V 2.2.0 (12/12/23) diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt index 9a376408e..45b9a5989 100644 --- a/devel/CMakeLists.txt +++ b/devel/CMakeLists.txt @@ -2,23 +2,25 @@ project(finufft_devel) # Set the minimum required version of CMake cmake_minimum_required(VERSION 3.5) - # include cpm cmake, downloading it -CPMAddPackage( - NAME benchmark - GITHUB_REPOSITORY google/benchmark - VERSION 1.8.3 - OPTIONS "BENCHMARK_ENABLE_TESTING OFF" - -) +cpmaddpackage( + NAME + benchmark + GITHUB_REPOSITORY + google/benchmark + VERSION + 1.8.3 + OPTIONS + "BENCHMARK_ENABLE_TESTING OFF") -if (benchmark_ADDED) - # patch benchmark target - set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) +if(benchmark_ADDED) + # patch benchmark target + set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) endif() add_executable(foldrescale foldrescale.cpp) target_link_libraries(foldrescale finufft benchmark xsimd) add_executable(padding padding.cpp) +target_compile_features(padding PRIVATE cxx_std_17) target_link_libraries(padding finufft xsimd) target_compile_options(padding PRIVATE -march=native) diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m index 
009e05ea4..51aa4e4e1 100644 --- a/devel/gen_all_horner_C_code.m +++ b/devel/gen_all_horner_C_code.m @@ -12,12 +12,12 @@ for upsampfac = [2.0, 1.25]; % sigma: either 2 (default) or low (eg 5/4) fprintf('upsampfac = %g...\n',upsampfac) - + ws = 2:16; - opts.wpad = true; % pad kernel eval to multiple of 4 + opts.wpad = false; % if true, pad kernel eval to multiple of 4 (disabled here) - if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w'); - else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w'); + if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w'); + else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w'); end fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); @@ -27,9 +27,9 @@ fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta); str = gen_ker_horner_loop_C_code(w,d,beta,opts); if j==1 % write switch statement - fwrite(fid,sprintf(' if constexpr(w==%d) {\n',w)); + fwrite(fid,sprintf(' if (w==%d) {\n',w)); else - fwrite(fid,sprintf(' } else if constexpr(w==%d) {\n',w)); + fwrite(fid,sprintf(' } else if (w==%d) {\n',w)); end for i=1:numel(str); fwrite(fid,[' ',str{i}]); end end diff --git a/devel/gen_ker_horner_loop_C_code.m b/devel/gen_ker_horner_loop_C_code.m index e2dd1b75a..059b6a4e1 100644 --- a/devel/gen_ker_horner_loop_C_code.m +++ b/devel/gen_ker_horner_loop_C_code.m @@ -38,9 +38,9 @@ width = w; end for n=1:d+1 % loop over poly coeff powers - s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1)); + s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1)); for i=2:width % loop over segments - s = sprintf('%s, %.16E', s, C(n,i)); + s = sprintf('%s, %.16E', s, C(n,i)); end str{n} = [s sprintf('};\n')]; end diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 35ac5662c..27b193cd5 100644 --- 
a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,7 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf) foreach(EXAMPLE ${EXAMPLES}) add_executable(${EXAMPLE} ${EXAMPLE}.cpp) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) target_link_libraries(${EXAMPLE} PRIVATE finufft) enable_asan(${EXAMPLE}) endforeach() @@ -18,6 +19,7 @@ endforeach() foreach(EXAMPLE ${EXAMPLES_C}) add_executable(${EXAMPLE} ${EXAMPLE}.c) target_link_libraries(${EXAMPLE} PRIVATE finufft) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) enable_asan(${EXAMPLE}) endforeach() @@ -25,6 +27,7 @@ if(FINUFFT_USE_OPENMP) foreach(EXAMPLE ${EXAMPLES_OPENMP}) add_executable(${EXAMPLE} ${EXAMPLE}.cpp) target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) enable_asan(${EXAMPLE}) endforeach() endif() diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt index 0c9dba361..b9742a865 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/cuda/CMakeLists.txt @@ -1,4 +1,3 @@ - file(GLOB example_src "*.cpp") foreach(srcfile ${example_src}) @@ -7,4 +6,5 @@ foreach(srcfile ${example_src}) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} cufinufft) + target_compile_features(${executable} PRIVATE cxx_std_17) endforeach() diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 7bddc188e..efa7eb7b1 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,38 @@ template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, T *fwkerhalf, finufft_spread_opts opts); +template +std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, + int bin_size_z); + +template +void cufinufft_setup_binsize(int type, int ns, int dim, 
cufinufft_opts *opts); + +template +auto cufinufft_set_shared_memory(V *kernel, const int dim, + const cufinufft_plan_t &d_plan) { + /** + * WARNING: this function does not handle cuda errors. The caller should check them. + */ + int device_id{}, shared_mem_per_block{}; + cudaGetDevice(&device_id); + const auto shared_mem_required = + shared_memory_required(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex, + d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez); + cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, + device_id); + if (shared_mem_required > shared_mem_per_block) { + fprintf(stderr, + "Error: Shared memory required per block is %zu bytes, but the device " + "supports only %d bytes.\n", + shared_mem_required, shared_mem_per_block); + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_mem_required); + return 0; +} + } // namespace common } // namespace cufinufft #endif diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h index 3dade898e..c3a31bd2b 100644 --- a/include/cufinufft/contrib/helper_cuda.h +++ b/include/cufinufft/contrib/helper_cuda.h @@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream, return pool_supported ? 
cudaFreeAsync(devPtr, stream) : cudaFree(devPtr); } -#define RETURN_IF_CUDA_ERROR \ - { \ - cudaError_t err = cudaGetLastError(); \ - if (err != cudaSuccess) { \ - printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \ - return FINUFFT_ERR_CUDA_FAILURE; \ - } \ +#define RETURN_IF_CUDA_ERROR \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \ + __FILE__, __LINE__); \ + return FINUFFT_ERR_CUDA_FAILURE; \ + } \ } #define CUDA_FREE_AND_NULL(val, stream, pool_supported) \ diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index 32f2cff00..1f4c59e2a 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -1,216 +1,205 @@ // Code generated by gen_all_horner_C_code.m in finufft/devel // Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) 2018, The Simons Foundation, Inc. +// (C) The Simons Foundation, Inc. 
if (w==2) { - CUFINUFFT_FLT c0[] = {4.5147043243215343E+01, 4.5147043243215336E+01}; - CUFINUFFT_FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01}; - CUFINUFFT_FLT c2[] = {-1.8395117920046662E+00, -1.8395117920046617E+00}; - CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; - CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; - CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01}; + constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01}; + constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02}; + constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); } else if (w==3) { - CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; - CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02}; - CUFINUFFT_FLT c2[] = {1.7742692790454473E+02, -3.3149255274727807E+02, 1.7742692791117116E+02}; - CUFINUFFT_FLT c3[] = {-1.5357716116473128E+01, -5.1917435849174007E-16, 1.5357716122720189E+01}; - CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; - CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; - CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01}; + constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01}; + constexpr FLT c2[] = {2.0160576446392536E-01, 
-3.7666666666667331E-01, 2.0160576447145470E-01}; + constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02}; + constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02}; + constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==4) { - CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; - CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; - CUFINUFFT_FLT c2[] = {1.4186910680718343E+03, -1.3995339862725584E+03, -1.3995339862725591E+03, 1.4186910680718338E+03}; - CUFINUFFT_FLT c3[] = {5.1133995502497419E+02, -1.4191608683682987E+03, 1.4191608683682980E+03, -5.1133995502497419E+02}; - CUFINUFFT_FLT c4[] = {-4.8293622641173549E+01, 3.9393732546136526E+01, 3.9393732546137308E+01, -4.8293622641173634E+01}; - CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; - CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; - CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02}; + constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01}; + constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01}; + constexpr FLT c3[] = {3.7516840869185789E-02, 
-1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02}; + constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03}; + constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03}; + constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==5) { - CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; - CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; - CUFINUFFT_FLT c2[] = {3.6092689177271218E+03, 7.7501368899498566E+03, -2.2704627332474989E+04, 7.7501368899498684E+03, 3.6092689177271227E+03}; - CUFINUFFT_FLT c3[] = {1.9990077310495410E+03, -3.8875294641277278E+03, 3.8628399128660033E-12, 3.8875294641277342E+03, -1.9990077310495410E+03}; - CUFINUFFT_FLT c4[] = {4.0071733590403858E+02, -1.5861137916762520E+03, 2.3839858699098813E+03, -1.5861137916762589E+03, 4.0071733590403880E+02}; - CUFINUFFT_FLT c5[] = {-9.1301168206167731E+01, 1.2316471075214690E+02, 1.0425607383569405E-11, -1.2316471075215136E+02, 9.1301168206167446E+01}; - CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; - CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; - CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + 
z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02}; + constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, -4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02}; + constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02}; + constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02}; + constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03}; + constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04}; + constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04}; + constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==6) { - CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; - CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; - CUFINUFFT_FLT c2[] = {1.0023404568475091E+04, 9.0916650498360163E+04, -1.0095927514054625E+05, -1.0095927514054641E+05, 9.0916650498360133E+04, 1.0023404568484631E+04}; - CUFINUFFT_FLT c3[] = {7.2536109410387417E+03, 4.8347162752603172E+03, -5.0512736602018493E+04, 5.0512736602018464E+04, 
-4.8347162752602935E+03, -7.2536109410297549E+03}; - CUFINUFFT_FLT c4[] = {2.7021878300949775E+03, -7.8773465553971982E+03, 5.2105876478344171E+03, 5.2105876478344435E+03, -7.8773465553972501E+03, 2.7021878301048719E+03}; - CUFINUFFT_FLT c5[] = {3.2120291706547602E+02, -1.8229189469937089E+03, 3.7928113414428362E+03, -3.7928113414427862E+03, 1.8229189469936987E+03, -3.2120291705638107E+02}; - CUFINUFFT_FLT c6[] = {-1.2051267090537493E+02, 2.2400507411396228E+02, -1.2506575852544464E+02, -1.2506575852534223E+02, 2.2400507411397808E+02, -1.2051267089640046E+02}; - CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; - CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; - CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03}; + constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03}; + constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02}; + constexpr FLT c3[] = {7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03}; + constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 
5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03}; + constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04}; + constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04}; + constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, -1.1717219021517810E-04, 4.6695893931711504E-05}; + constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==7) { - CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; - CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}; - CUFINUFFT_FLT c2[] = {2.4458227486779248E+04, 5.3904618484139408E+05, 2.4315566181017426E+05, -1.6133959371974319E+06, 2.4315566181017403E+05, 5.3904618484139384E+05, 2.4458227486795098E+04}; - CUFINUFFT_FLT c3[] = {2.1166189345881645E+04, 1.3382732160223144E+05, -3.3113450969689671E+05, -6.5160817568418758E-10, 3.3113450969689724E+05, -1.3382732160223127E+05, -2.1166189345866882E+04}; - CUFINUFFT_FLT c4[] = {1.0542795672344866E+04, -7.0739172265096213E+03, -6.5563293056048453E+04, 1.2429734005960147E+05, -6.5563293056048846E+04, -7.0739172265096058E+03, 1.0542795672361211E+04}; - CUFINUFFT_FLT c5[] = {2.7903491906228414E+03, -1.0975382873973065E+04, 
1.3656979541144814E+04, 1.2638008605419305E-09, -1.3656979541144177E+04, 1.0975382873973065E+04, -2.7903491906078302E+03}; - CUFINUFFT_FLT c6[] = {1.6069721418053450E+02, -1.5518707872250775E+03, 4.3634273936637373E+03, -5.9891976420593228E+03, 4.3634273936637110E+03, -1.5518707872251396E+03, 1.6069721419533406E+02}; - CUFINUFFT_FLT c7[] = {-1.2289277373867886E+02, 2.8583630927743752E+02, -2.8318194617301111E+02, -8.6523823682922648E-10, 2.8318194617373905E+02, -2.8583630927755564E+02, 1.2289277375320185E+02}; - CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; - CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; - CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04}; + constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03}; + constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03}; + constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, 
-1.3627105791872396E-02, -2.1552691780250210E-03}; + constexpr FLT c4[] = {1.0735311014902868E-03, -7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03}; + constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04}; + constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05}; + constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05}; + constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, -1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06}; + constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==8) { - CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; - CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; - CUFINUFFT_FLT c2[] = {5.4488498478251720E+04, 2.4101183255475122E+06, 6.4554051283428278E+06, -8.9200440393090621E+06, -8.9200440393090658E+06, 
6.4554051283428278E+06, 2.4101183255475122E+06, 5.4488498478251720E+04}; - CUFINUFFT_FLT c3[] = {5.3926359802542131E+04, 9.0469037926849292E+05, -6.0897036277696094E+05, -3.0743852105800072E+06, 3.0743852105800039E+06, 6.0897036277696339E+05, -9.0469037926849292E+05, -5.3926359802542116E+04}; - CUFINUFFT_FLT c4[] = {3.2444118016247583E+04, 1.3079802224392195E+05, -5.8652889370128501E+05, 4.2333306008153502E+05, 4.2333306008153904E+05, -5.8652889370128524E+05, 1.3079802224392162E+05, 3.2444118016247587E+04}; - CUFINUFFT_FLT c5[] = {1.1864306345505289E+04, -2.2700360645707628E+04, -5.0713607251413239E+04, 1.8308704458211805E+05, -1.8308704458211269E+05, 5.0713607251412053E+04, 2.2700360645707922E+04, -1.1864306345505289E+04}; - CUFINUFFT_FLT c6[] = {2.2812256770903182E+03, -1.1569135767378117E+04, 2.0942387020799080E+04, -1.1661592834949530E+04, -1.1661592834949715E+04, 2.0942387020801576E+04, -1.1569135767377431E+04, 2.2812256770903446E+03}; - CUFINUFFT_FLT c7[] = {8.5503535636805026E+00, -9.7513976461269635E+02, 3.8242995179157779E+03, -6.9201295567256420E+03, 6.9201295567222760E+03, -3.8242995179195914E+03, 9.7513976461218783E+02, -8.5503535636857091E+00}; - CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; - CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; - CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + 
z*(c10[i])))))))))); + constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05}; + constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04}; + constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04}; + constexpr FLT c3[] = {5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04}; + constexpr FLT c4[] = {3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04}; + constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04}; + constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05}; + constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08}; + constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, 
-1.0444417486660073E-06}; + constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==9) { - CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; - CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; - CUFINUFFT_FLT c2[] = {1.1335001341875963E+05, 9.0726133144784775E+06, 5.3501544534038082E+07, -2.6789524644150439E+05, -1.2483923718899380E+08, -2.6789524644173466E+05, 5.3501544534038067E+07, 9.0726133144785129E+06, 1.1335001341875964E+05}; - CUFINUFFT_FLT c3[] = {1.2489113703229750E+05, 4.3035547171861930E+06, 6.3021978510598894E+06, -2.6014941986659020E+07, 2.8258041381448560E-08, 2.6014941986659355E+07, -6.3021978510598978E+06, -4.3035547171862079E+06, -1.2489113703229750E+05}; - CUFINUFFT_FLT c4[] = {8.6425493435991229E+04, 1.0891182836653332E+06, -2.0713033564200329E+06, -2.8994941183505855E+06, 7.5905338661207352E+06, -2.8994941183504057E+06, -2.0713033564200525E+06, 1.0891182836653360E+06, 8.6425493435991244E+04}; - CUFINUFFT_FLT c5[] = {3.8657354724013807E+04, 7.9936390113327987E+04, -7.0458265546792350E+05, 1.0151095605715724E+06, 8.7808418931366203E-08, -1.0151095605718571E+06, 7.0458265546792292E+05, -7.9936390113333473E+04, -3.8657354724013807E+04}; - CUFINUFFT_FLT c6[] = {1.0779131453134632E+04, -3.3466718311303863E+04, -1.3245366619006214E+04, 
1.8238470515351585E+05, -2.9285656292984058E+05, 1.8238470515350348E+05, -1.3245366619016511E+04, -3.3466718311298035E+04, 1.0779131453134652E+04}; - CUFINUFFT_FLT c7[] = {1.4992527030548451E+03, -9.7024371533906651E+03, 2.3216330734046409E+04, -2.3465262819075571E+04, -3.7031099746142328E-08, 2.3465262819179152E+04, -2.3216330734079289E+04, 9.7024371533883768E+03, -1.4992527030548429E+03}; - CUFINUFFT_FLT c8[] = {-7.9857427421137089E+01, -4.0585588534737309E+02, 2.6054813773474157E+03, -6.1806593581211082E+03, 8.0679596873751289E+03, -6.1806593581509942E+03, 2.6054813773256465E+03, -4.0585588535330419E+02, -7.9857427421164303E+01}; - CUFINUFFT_FLT c9[] = {-7.1572272057931258E+01, 2.2785637019446185E+02, -3.9109820765219445E+02, 3.3597424707607246E+02, 1.7793576396134983E-08, -3.3597424727519928E+02, 3.9109820766111056E+02, -2.2785637019102543E+02, 7.1572272057951565E+01}; - CUFINUFFT_FLT c10[] = {-9.8886360698029030E+00, 3.5359026948517517E+01, -8.5251867695464824E+01, 1.4285748015591199E+02, -1.6935269673908536E+02, 1.4285748008591776E+02, -8.5251867720434134E+01, 3.5359026945818123E+01, -9.8886360698009241E+00}; - CUFINUFFT_FLT c11[] = {5.4050464453063796E-01, -1.7215219066697895E+00, 2.8631741265441102E+00, -2.3817977385844018E+00, -1.0173343205540475E-08, 2.3817977172440110E+00, -2.8631741497139487E+00, 1.7215219081941548E+00, -5.4050464453541269E-01}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05}; + constexpr FLT c1[] = {6.0003223623206657E-05, 1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, 
-1.0569385595665032E-02, -6.0003223623206596E-05}; + constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04}; + constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04}; + constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05}; + constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05}; + constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05}; + constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06}; + constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08}; + constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, -4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 
7.3256982980640781E-08}; + constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==10) { - CUFINUFFT_FLT c0[] = {2.2594586605749279E+04, 1.3595989066786604E+07, 4.4723032442444921E+08, 3.3781755837397542E+09, 8.6836783895849838E+09, 8.6836783895849819E+09, 3.3781755837397518E+09, 4.4723032442444921E+08, 1.3595989066786485E+07, 2.2594586605749315E+04}; - CUFINUFFT_FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130477E+08, 2.4434902657508340E+09, 2.0073077861288924E+09, -2.0073077861288958E+09, -2.4434902657508330E+09, -5.9387966085130465E+08, -3.0651490267742820E+07, -1.0729981697645631E+05}; - CUFINUFFT_FLT c2[] = {2.2340399734184594E+05, 3.0258214643190444E+07, 3.1512411458738214E+08, 4.3618276932319784E+08, -7.8178848450497377E+08, -7.8178848450497079E+08, 4.3618276932319820E+08, 3.1512411458738226E+08, 3.0258214643190306E+07, 2.2340399734184553E+05}; - CUFINUFFT_FLT c3[] = {2.6917433004353492E+05, 1.6875651476661235E+07, 7.4664745481963485E+07, -9.5882157211117983E+07, -2.0622994435532546E+08, 2.0622994435532695E+08, 9.5882157211117893E+07, -7.4664745481963441E+07, -1.6875651476661157E+07, -2.6917433004353417E+05}; - CUFINUFFT_FLT c4[] = {2.0818422772177903E+05, 5.6084730690362593E+06, 1.4435118192352918E+06, -4.0063869969543688E+07, 3.2803674392747905E+07, 3.2803674392747425E+07, -4.0063869969546065E+07, 1.4435118192351861E+06, 5.6084730690362072E+06, 2.0818422772177853E+05}; - CUFINUFFT_FLT c5[] = {1.0781139496011089E+05, 9.9202615851199115E+05, -3.3266265543962144E+06, -4.8557049011465441E+05, 1.0176155522771550E+07, -1.0176155522773480E+07, 4.8557049011624791E+05, 
3.3266265543963145E+06, -9.9202615851196367E+05, -1.0781139496011069E+05}; - CUFINUFFT_FLT c6[] = {3.7380102688153507E+04, 1.2716675000354149E+04, -6.2163527451780590E+05, 1.4157962667182824E+06, -8.4419693137806712E+05, -8.4419693137792684E+05, 1.4157962667183836E+06, -6.2163527451768133E+05, 1.2716675000338953E+04, 3.7380102688153551E+04}; - CUFINUFFT_FLT c7[] = {8.1238936393894865E+03, -3.4872365530450799E+04, 2.3913680325180554E+04, 1.2428850301840073E+05, -3.2158255329732876E+05, 3.2158255329921009E+05, -1.2428850301906197E+05, -2.3913680325219862E+04, 3.4872365530457639E+04, -8.1238936393893855E+03}; - CUFINUFFT_FLT c8[] = {7.8515926628983277E+02, -6.6607899119362401E+03, 2.0167398338517272E+04, -2.8951401344174039E+04, 1.4622828141519254E+04, 1.4622828143473866E+04, -2.8951401346529910E+04, 2.0167398338405819E+04, -6.6607899119515532E+03, 7.8515926628964587E+02}; - CUFINUFFT_FLT c9[] = {-1.0147176570533524E+02, -3.5304284183527621E+01, 1.3576976854816689E+03, -4.3921059353471846E+03, 7.3232085265419046E+03, -7.3232085280635902E+03, 4.3921059363220147E+03, -1.3576976854281722E+03, 3.5304284184270628E+01, 1.0147176570551520E+02}; - CUFINUFFT_FLT c10[] = {-4.3161545259395531E+01, 1.5498490982051828E+02, -3.1771250772612478E+02, 3.7215448793727404E+02, -1.7181762882439287E+02, -1.7181763008770599E+02, 3.7215448759715150E+02, -3.1771250770992856E+02, 1.5498490982321766E+02, -4.3161545259481535E+01}; - CUFINUFFT_FLT c11[] = {-4.2916172038404330E+00, 1.7402146068709751E+01, -4.7947588102062113E+01, 9.2697697983158491E+01, -1.2821427595919303E+02, 1.2821427694451660E+02, -9.2697698629471930E+01, 4.7947588133767717E+01, -1.7402146075416606E+01, 4.2916172038784923E+00}; - CUFINUFFT_FLT c12[] = {3.5357495062947814E-01, -1.2828127005767840E+00, 2.4090120532215455E+00, -2.6448901913160028E+00, 1.1811546776400381E+00, 1.1811568523765217E+00, -2.6448918925210712E+00, 2.4090119216851607E+00, -1.2828127015358992E+00, 3.5357495059093369E-01}; - for (int i=0; i<10; i++) ker[i] 
= c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06}; + constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05}; + constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05}; + constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05}; + constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05}; + constexpr FLT c5[] = {1.1063475580065299E-05, 1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05}; + constexpr FLT c6[] = {3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 
1.3049698816901833E-06, 3.8359011440648767E-06}; + constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07}; + constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08}; + constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08}; + constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09}; + constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==11) { - CUFINUFFT_FLT c0[] = {3.7794653219809574E+04, 3.4782300224660799E+07, 1.6188020733727567E+09, 1.7196758809615021E+10, 6.3754384857724678E+10, 9.7196447559193558E+10, 6.3754384857724640E+10, 1.7196758809615005E+10, 1.6188020733727570E+09, 3.4782300224660806E+07, 3.7794653219808897E+04}; - CUFINUFFT_FLT c1[] = {1.8969206922085880E+05, 8.4769319065313682E+07, 2.4230555767723408E+09, 
1.5439732722639105E+10, 2.7112836839612309E+10, 2.9154817084916870E-06, -2.7112836839612320E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05}; - CUFINUFFT_FLT c2[] = {4.2138380313901423E+05, 9.2050522922791898E+07, 1.5259983101266611E+09, 4.7070559561237154E+09, -1.2448027572952452E+09, -1.0161446790279312E+10, -1.2448027572952352E+09, 4.7070559561237249E+09, 1.5259983101266615E+09, 9.2050522922791868E+07, 4.2138380313901143E+05}; - CUFINUFFT_FLT c3[] = {5.4814313598122017E+05, 5.8085130777589574E+07, 4.9484006166551107E+08, 1.6222124676640958E+08, -2.0440440381345322E+09, -1.0628188648962249E-06, 2.0440440381345263E+09, -1.6222124676641047E+08, -4.9484006166551083E+08, -5.8085130777589560E+07, -5.4814313598121691E+05}; - CUFINUFFT_FLT c4[] = {4.6495183529254969E+05, 2.3067199578027174E+07, 6.9832590192482829E+07, -2.2024799260683161E+08, -1.2820270942587741E+08, 5.1017181199130940E+08, -1.2820270942587276E+08, -2.2024799260684022E+08, 6.9832590192482591E+07, 2.3067199578027155E+07, 4.6495183529254753E+05}; - CUFINUFFT_FLT c5[] = {2.7021781043532968E+05, 5.6764510325100143E+06, -5.5650761736747762E+06, -3.9907385617900737E+07, 7.2453390663686648E+07, 3.7361048615190248E-06, -7.2453390663685605E+07, 3.9907385617898554E+07, 5.5650761736747930E+06, -5.6764510325100180E+06, -2.7021781043532834E+05}; - CUFINUFFT_FLT c6[] = {1.0933249308680615E+05, 6.9586821127986431E+05, -3.6860240321940281E+06, 2.7428169457723838E+06, 8.3392008440598147E+06, -1.6402201025051240E+07, 8.3392008440649221E+06, 2.7428169457788388E+06, -3.6860240321937916E+06, 6.9586821127989038E+05, 1.0933249308680584E+05}; - CUFINUFFT_FLT c7[] = {3.0203516161820480E+04, -3.6879059542777912E+04, -4.1141031216801296E+05, 1.4111389975270075E+06, -1.5914376635392811E+06, 6.6766157119460594E-07, 1.5914376635341521E+06, -1.4111389975270815E+06, 4.1141031216760987E+05, 3.6879059542751726E+04, -3.0203516161820367E+04}; - CUFINUFFT_FLT c8[] = 
{5.1670143574922804E+03, -2.8613147115365118E+04, 4.3560195427108687E+04, 4.8438679581840552E+04, -2.5856630639330545E+05, 3.7994883866097208E+05, -2.5856630640124826E+05, 4.8438679578319818E+04, 4.3560195426824532E+04, -2.8613147115371667E+04, 5.1670143574923577E+03}; - CUFINUFFT_FLT c9[] = {3.0888018539742444E+02, -3.7949446187516196E+03, 1.4313303205035631E+04, -2.6681600236925929E+04, 2.3856005161221132E+04, -2.3276789125970764E-06, -2.3856005160840708E+04, 2.6681600234072768E+04, -1.4313303205083184E+04, 3.7949446187479048E+03, -3.0888018539723868E+02}; - CUFINUFFT_FLT c10[] = {-8.3747489794255131E+01, 1.1948077479810485E+02, 4.8528498025870488E+02, -2.5024391115619069E+03, 5.3511195350414373E+03, -6.7655484152307990E+03, 5.3511195328171416E+03, -2.5024391120801879E+03, 4.8528498023710927E+02, 1.1948077481025226E+02, -8.3747489794331599E+01}; - CUFINUFFT_FLT c11[] = {-2.2640047135555928E+01, 9.0840898549317998E+01, -2.1597187568776889E+02, 3.1511229085836396E+02, -2.4856618287164540E+02, 1.6489710183426948E-06, 2.4856618404233313E+02, -3.1511228957061689E+02, 2.1597187534632059E+02, -9.0840898568829203E+01, 2.2640047135641577E+01}; - CUFINUFFT_FLT c12[] = {-1.6306382885945303E+00, 7.3325946569413265E+00, -2.3241017814397217E+01, 5.1715493697385526E+01, -8.2673003927086967E+01, 9.6489715222659115E+01, -8.2673013187251925E+01, 5.1715492855550593E+01, -2.3241018165160245E+01, 7.3325946421432624E+00, -1.6306382886373367E+00}; - CUFINUFFT_FLT c13[] = {2.4409286936442823E-01, -7.8803147249892458E-01, 1.6467143668339987E+00, -2.1898241453519685E+00, 1.6350102449767006E+00, -1.1782931558589478E-06, -1.6350139430218933E+00, 2.1898230913723329E+00, -1.6467144225690411E+00, 7.8803147709023735E-01, -2.4409286927983653E-01}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + constexpr FLT c0[] = {3.8884809238313434E-07, 
3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07}; + constexpr FLT c1[] = {1.9516358260453364E-06, 8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06}; + constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06}; + constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06}; + constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06}; + constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06}; + constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06}; 
+ constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, -3.1074712008817235E-07}; + constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08}; + constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09}; + constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10}; + constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10}; + constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==12) { - CUFINUFFT_FLT c0[] 
= {6.1722991679853112E+04, 8.4789650417103723E+07, 5.4431675199498730E+09, 7.8788892335272293E+10, 4.0355760945670062E+11, 8.8071481911347974E+11, 8.8071481911347998E+11, 4.0355760945670068E+11, 7.8788892335272491E+10, 5.4431675199498854E+09, 8.4789650417103767E+07, 6.1722991679871629E+04}; - CUFINUFFT_FLT c1[] = {3.2561466099406150E+05, 2.2112758120210624E+08, 8.9911609880089817E+09, 8.3059508064200928E+10, 2.3965569143469864E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210624E+08, -3.2561466099404282E+05}; - CUFINUFFT_FLT c2[] = {7.6621098001581465E+05, 2.6026568260310274E+08, 6.4524338253008652E+09, 3.3729904113826797E+10, 2.8555202212474010E+10, -6.8998572040731583E+10, -6.8998572040731506E+10, 2.8555202212474064E+10, 3.3729904113826805E+10, 6.4524338253008747E+09, 2.6026568260310277E+08, 7.6621098001583852E+05}; - CUFINUFFT_FLT c3[] = {1.0657807616803222E+06, 1.8144472126890999E+08, 2.5524827004349856E+09, 5.2112383911371746E+09, -1.0268350564014614E+10, -1.4763245309081245E+10, 1.4763245309081299E+10, 1.0268350564014664E+10, -5.2112383911371031E+09, -2.5524827004349875E+09, -1.8144472126890990E+08, -1.0657807616803090E+06}; - CUFINUFFT_FLT c4[] = {9.7829638830158743E+05, 8.2222351241519973E+07, 5.5676911894064891E+08, -4.8739037675424922E+08, -2.7153428193077750E+09, 2.5627633609246840E+09, 2.5627633609247112E+09, -2.7153428193078070E+09, -4.8739037675429451E+08, 5.5676911894064677E+08, 8.2222351241519928E+07, 9.7829638830161165E+05}; - CUFINUFFT_FLT c5[] = {6.2536876825113979E+05, 2.4702814073680263E+07, 4.1488431554845832E+07, -2.9274790542418414E+08, 1.0742154109193267E+08, 6.2185168968029702E+08, -6.2185168968023658E+08, -1.0742154109185636E+08, 2.9274790542422676E+08, -4.1488431554844096E+07, -2.4702814073680244E+07, -6.2536876825112442E+05}; - CUFINUFFT_FLT c6[] = {2.8527714307528478E+05, 4.6266378435690189E+06, -1.0665598090791209E+07, 
-2.6048960239906937E+07, 9.1597254427339226E+07, -5.9794495983323507E+07, -5.9794495983287223E+07, 9.1597254427330941E+07, -2.6048960239925586E+07, -1.0665598090793334E+07, 4.6266378435690831E+06, 2.8527714307530422E+05}; - CUFINUFFT_FLT c7[] = {9.2873647411234240E+04, 3.6630046787428786E+05, -3.1271047224731087E+06, 4.8612412939261831E+06, 3.3820440907802135E+06, -1.6880127953711823E+07, 1.6880127953682471E+07, -3.3820440907974164E+06, -4.8612412939092657E+06, 3.1271047224737639E+06, -3.6630046787430649E+05, -9.2873647411216807E+04}; - CUFINUFFT_FLT c8[] = {2.0817947751046187E+04, -5.5660303410280452E+04, -1.9519783923293054E+05, 1.0804817251338358E+06, -1.8264985852948832E+06, 9.7602844964432076E+05, 9.7602844962242560E+05, -1.8264985853129351E+06, 1.0804817251129062E+06, -1.9519783923449527E+05, -5.5660303410338929E+04, 2.0817947751063308E+04}; - CUFINUFFT_FLT c9[] = {2.7986023314784748E+03, -1.9404411093600604E+04, 4.3922624999853564E+04, -7.6450317375817094E+03, -1.5273911976404345E+05, 3.3223441450299282E+05, -3.3223441454103496E+05, 1.5273911977621692E+05, 7.6450317497551932E+03, -4.3922624998426982E+04, 1.9404411093646668E+04, -2.7986023314644040E+03}; - CUFINUFFT_FLT c10[] = {6.7849020474186844E+01, -1.7921351307934926E+03, 8.4980694693463538E+03, -1.9742624859078383E+04, 2.4620674878200782E+04, -1.1676544885779787E+04, -1.1676544871958942E+04, 2.4620674838120303E+04, -1.9742624835582923E+04, 8.4980694640771490E+03, -1.7921351307934922E+03, 6.7849020488748664E+01}; - CUFINUFFT_FLT c11[] = {-5.4577020998847871E+01, 1.3637112866755427E+02, 4.5513615487589092E+01, -1.1174001343792290E+03, 3.2018769324922364E+03, -5.0580351333780654E+03, 5.0580351424313239E+03, -3.2018769362383905E+03, 1.1174000937955741E+03, -4.5513610843875405E+01, -1.3637112870657899E+02, 5.4577021011919037E+01}; - CUFINUFFT_FLT c12[] = {-1.0538365872424132E+01, 4.6577222490846609E+01, -1.2606964180937365E+02, 2.1881091191930210E+02, -2.3273402308837001E+02, 1.0274273857329082E+02, 
1.0274268020620094E+02, -2.3273404553726701E+02, 2.1881091276113446E+02, -1.2606964815819696E+02, 4.6577222438230805E+01, -1.0538365860846021E+01}; - CUFINUFFT_FLT c13[] = {-4.6087004128022252E-01, 2.5969759424153827E+00, -9.6946930749915676E+00, 2.4990050007153755E+01, -4.6013920149683365E+01, 6.2056948047986317E+01, -6.2056981293939970E+01, 4.6013908245461884E+01, -2.4990038356462701E+01, 9.6946952377382889E+00, -2.5969759165384922E+00, 4.6087004737535314E-01}; + constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08}; + constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07}; + constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07}; + constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06}; + constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 
5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06}; + constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07}; + constexpr FLT c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07}; + constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08}; + constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08}; + constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09}; + constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11}; + constexpr 
FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 5.6296594761077256E-11}; + constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11}; + constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==13) { - CUFINUFFT_FLT c0[] = {9.8715725867495639E+04, 1.9828875496808118E+08, 1.7196758809614998E+10, 3.3083776881353607E+11, 2.2668873993375444E+12, 6.7734720591167598E+12, 9.6695220682534824E+12, 6.7734720591167471E+12, 2.2668873993375439E+12, 3.3083776881353534E+11, 1.7196758809614998E+10, 1.9828875496807906E+08, 9.8715725867495537E+04}; - CUFINUFFT_FLT c1[] = {5.4491110456935503E+05, 5.4903670125539362E+08, 3.0879465445278172E+10, 3.9588436413399951E+11, 1.6860562536749778E+12, 2.4256447893117881E+12, 3.7318165868693593E-04, -2.4256447893117856E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538874E+08, -5.4491110456935491E+05}; - CUFINUFFT_FLT c2[] = {1.3504711883426066E+06, 6.9286979077463162E+08, 2.4618123595484562E+10, 1.9493985627722598E+11, 3.9422703517046326E+11, 
-1.8678883613919931E+11, -8.5538079834550146E+11, -1.8678883613919705E+11, 3.9422703517046338E+11, 1.9493985627722586E+11, 2.4618123595484554E+10, 6.9286979077462578E+08, 1.3504711883426069E+06}; - CUFINUFFT_FLT c3[] = {1.9937206140846494E+06, 5.2512029493765986E+08, 1.1253303793811754E+10, 4.6205527735932175E+10, -1.1607472377983284E+10, -1.6305241755642325E+11, 1.3350300616010507E-04, 1.6305241755642365E+11, 1.1607472377982744E+10, -4.6205527735932228E+10, -1.1253303793811750E+10, -5.2512029493765610E+08, -1.9937206140846484E+06}; - CUFINUFFT_FLT c4[] = {1.9607419630386413E+06, 2.6425362558103889E+08, 3.1171259341747255E+09, 2.9839860297840505E+09, -1.9585031917561890E+10, -5.0666917387055302E+09, 3.6568794485482079E+10, -5.0666917387051382E+09, -1.9585031917561581E+10, 2.9839860297839398E+09, 3.1171259341747217E+09, 2.6425362558103737E+08, 1.9607419630386410E+06}; - CUFINUFFT_FLT c5[] = {1.3593773865640301E+06, 9.1556445104158148E+07, 4.7074012944133490E+08, -1.1192579335657711E+09, -2.1090780087868552E+09, 5.2270306737949314E+09, 1.0058570913473114E-03, -5.2270306737942495E+09, 2.1090780087878082E+09, 1.1192579335658059E+09, -4.7074012944133729E+08, -9.1556445104157895E+07, -1.3593773865640303E+06}; - CUFINUFFT_FLT c6[] = {6.8417206432039186E+05, 2.1561705510027003E+07, 7.5785249892988410E+06, -2.7456096030230397E+08, 3.4589095671043062E+08, 4.0256106808852541E+08, -1.0074306926606210E+09, 4.0256106809059316E+08, 3.4589095670995283E+08, -2.7456096030234104E+08, 7.5785249893005500E+06, 2.1561705510027427E+07, 6.8417206432039267E+05}; - CUFINUFFT_FLT c7[] = {2.5248269397037479E+05, 3.0985559672615193E+06, -1.1816517087617906E+07, -8.2958498770340970E+06, 8.0546642347242445E+07, -1.0594657799535300E+08, -4.1868673222825360E-04, 1.0594657799426495E+08, -8.0546642347729877E+07, 8.2958498770339396E+06, 1.1816517087613177E+07, -3.0985559672620757E+06, -2.5248269397037491E+05}; - CUFINUFFT_FLT c8[] = {6.7530100970876083E+04, 1.2373362326659705E+05, 
-2.1245597183259744E+06, 5.1047323238916462E+06, -1.4139444405955642E+06, -1.1818267554953648E+07, 2.0121548577168033E+07, -1.1818267556967378E+07, -1.4139444400679788E+06, 5.1047323236808330E+06, -2.1245597183310925E+06, 1.2373362326704434E+05, 6.7530100970875879E+04}; - CUFINUFFT_FLT c9[] = {1.2421368748960791E+04, -5.0576243646949319E+04, -4.8878193435000605E+04, 6.5307896868984913E+05, -1.5497610128277773E+06, 1.5137725915373438E+06, 2.4159142842753925E-04, -1.5137725925842635E+06, 1.5497610128277773E+06, -6.5307896858028776E+05, 4.8878193437283131E+04, 5.0576243646456518E+04, -1.2421368748960884E+04}; - CUFINUFFT_FLT c10[] = {1.2904654687546160E+03, -1.1169946055063081E+04, 3.3275109714208906E+04, -3.1765222279764806E+04, -5.9810981980285695E+04, 2.2355863005975721E+05, -3.1083591689740209E+05, 2.2355863472015061E+05, -5.9810982676856896E+04, -3.1765222445615127E+04, 3.3275109711790254E+04, -1.1169946054458416E+04, 1.2904654687550794E+03}; - CUFINUFFT_FLT c11[] = {-1.9043622268985253E+01, -6.8296542226098870E+02, 4.2702512255472038E+03, -1.2165497337805051E+04, 1.9423733200245264E+04, -1.6010024156865491E+04, -1.8587318864580292E-04, 1.6010021504569266E+04, -1.9423732997327170E+04, 1.2165497443946821E+04, -4.2702512314786209E+03, 6.8296542157807858E+02, 1.9043622268681840E+01}; - CUFINUFFT_FLT c12[] = {-3.0093984465812213E+01, 9.8972865698526618E+01, -9.7437039087669007E+01, -3.5079927282955276E+02, 1.5699250476860170E+03, -3.1287441993042225E+03, 3.8692185175061472E+03, -3.1287462825609659E+03, 1.5699252631952513E+03, -3.5079945803284346E+02, -9.7437044419281492E+01, 9.8972866145746991E+01, -3.0093984466256714E+01}; - CUFINUFFT_FLT c13[] = {-4.3050286009571908E+00, 2.1108975820085092E+01, -6.4297196365104938E+01, 1.2922885252832501E+02, -1.6991814421468084E+02, 1.2655005406584399E+02, -2.7552199668252238E-05, -1.2655093214380580E+02, 1.6991796275475141E+02, -1.2922893349406868E+02, 6.4297198822227926E+01, -2.1108976183295965E+01, 4.3050286010617569E+00}; - 
CUFINUFFT_FLT c14[] = {-1.0957333744888972E-01, 7.2949316377828033E-01, -3.4300810538238449E+00, 1.0470062030552395E+01, -2.2292087310650142E+01, 3.4570674930666925E+01, -3.9923385381532697E+01, 3.4573472104415345E+01, -2.2292369892227434E+01, 1.0470053799441445E+01, -3.4300825281782954E+00, 7.2949352704193948E-01, -1.0957333730383595E-01}; + constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08}; + constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08}; + constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07}; + constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07}; + constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 
2.7328509487415384E-05, 2.0277547837406108E-07}; + constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07}; + constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08}; + constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08}; + constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09}; + constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, -1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09}; + constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, 
-6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10}; + constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, -1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12}; + constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12}; + constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13}; + constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); } else if (w==14) { - CUFINUFFT_FLT c0[] = {1.5499533202966300E+05, 4.4723032442444748E+08, 5.1495083701694786E+10, 1.2904576022918081E+12, 1.1534950432785512E+13, 4.5650102198520516E+13, 8.8830582190032688E+13, 8.8830582190032672E+13, 4.5650102198520516E+13, 
1.1534950432785535E+13, 1.2904576022918081E+12, 5.1495083701695145E+10, 4.4723032442444843E+08, 1.5499533202970150E+05}; - CUFINUFFT_FLT c1[] = {8.9188339002980455E+05, 1.3065352538728631E+09, 9.9400185225815582E+10, 1.7136059013402410E+12, 1.0144146621675832E+13, 2.3034036018490723E+13, 1.4630967270448867E+13, -1.4630967270448859E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402410E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979524E+05}; - CUFINUFFT_FLT c2[] = {2.3170473769379673E+06, 1.7532505043698246E+09, 8.6523535958354294E+10, 9.7455289065487329E+11, 3.2977972139362295E+12, 1.7874626001697771E+12, -6.1480918082634004E+12, -6.1480918082633994E+12, 1.7874626001697695E+12, 3.2977972139362256E+12, 9.7455289065487366E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - CUFINUFFT_FLT c3[] = {3.6089249230396431E+06, 1.4278058213962190E+09, 4.4296625537022438E+10, 2.9466624630419812E+11, 3.1903621584503357E+11, -9.8834691411254529E+11, -1.1072264714919219E+12, 1.1072264714919253E+12, 9.8834691411255261E+11, -3.1903621584503473E+11, -2.9466624630419775E+11, -4.4296625537022629E+10, -1.4278058213962216E+09, -3.6089249230396645E+06}; - CUFINUFFT_FLT c4[] = {3.7733555140851741E+06, 7.8376718099107432E+08, 1.4443117772349600E+10, 4.3197433307419121E+10, -7.6585042240582489E+10, -1.8569640140761731E+11, 2.0385335192658878E+11, 2.0385335192657968E+11, -1.8569640140762405E+11, -7.6585042240578430E+10, 4.3197433307418945E+10, 1.4443117772349699E+10, 7.8376718099107552E+08, 3.7733555140852556E+06}; - CUFINUFFT_FLT c5[] = {2.8079157920112349E+06, 3.0340753492383713E+08, 2.9498136661747351E+09, -6.2820200387927818E+08, -2.2372008390622681E+10, 1.5217518660587118E+10, 4.0682590266889229E+10, -4.0682590266876595E+10, -1.5217518660581694E+10, 2.2372008390624306E+10, 6.2820200387922049E+08, -2.9498136661747746E+09, -3.0340753492383796E+08, -2.8079157920112382E+06}; - CUFINUFFT_FLT c6[] = 
{1.5361613559533113E+06, 8.3513615594416350E+07, 3.0077547202707732E+08, -1.3749596754069650E+09, -6.6733027297582805E+08, 5.9590333632825184E+09, -4.3025685566887646E+09, -4.3025685566943264E+09, 5.9590333632825480E+09, -6.6733027297550666E+08, -1.3749596754065177E+09, 3.0077547202710402E+08, 8.3513615594416887E+07, 1.5361613559533583E+06}; - CUFINUFFT_FLT c7[] = {6.2759409419592936E+05, 1.5741723594963074E+07, -1.5632610223404476E+07, -1.9294824907080847E+08, 4.4643806532363749E+08, 1.5178998383416286E+07, -9.6771139892184162E+08, 9.6771139891756535E+08, -1.5178998386503356E+07, -4.4643806533349395E+08, 1.9294824907058707E+08, 1.5632610223392753E+07, -1.5741723594962660E+07, -6.2759409419590654E+05}; - CUFINUFFT_FLT c8[] = {1.9151404903933575E+05, 1.7156606891565928E+06, -9.7733523156610541E+06, 4.2982266236283993E+06, 5.1660907884816565E+07, -1.1279400211055294E+08, 6.4701089573887214E+07, 6.4701089567399226E+07, -1.1279400211297083E+08, 5.1660907891780980E+07, 4.2982266233826252E+06, -9.7733523156971950E+06, 1.7156606891561027E+06, 1.9151404903936631E+05}; - CUFINUFFT_FLT c9[] = {4.2715272622844830E+04, -2.2565910608684317E+03, -1.1769776156829668E+06, 4.0078399908543471E+06, -3.8951858064309461E+06, -5.0944610762301283E+06, 1.6765992441460442E+07, -1.6765992436785825E+07, 5.0944610781778852E+06, 3.8951858054570677E+06, -4.0078399907569592E+06, 1.1769776157156830E+06, 2.2565910609040961E+03, -4.2715272622820310E+04}; - CUFINUFFT_FLT c10[] = {6.4806786522791654E+03, -3.5474227032931303E+04, 1.8237100723206047E+04, 3.0934714627485734E+05, -1.0394703921956274E+06, 1.4743920336239333E+06, -7.3356882129423053E+05, -7.3356882916659222E+05, 1.4743920340662012E+06, -1.0394703928590287E+06, 3.0934714634119731E+05, 1.8237100680361433E+04, -3.5474227032996088E+04, 6.4806786523011797E+03}; - CUFINUFFT_FLT c11[] = {4.9913632908432180E+02, -5.5416668526903932E+03, 2.0614058707628108E+04, -3.2285139177838235E+04, -5.3099560012237780E+03, 1.1559000312360718E+05, 
-2.2569743818692098E+05, 2.2569743267254104E+05, -1.1559000606061178E+05, 5.3099530192621614E+03, 3.2285139062955688E+04, -2.0614058671415001E+04, 5.5416668535488525E+03, -4.9913632906175445E+02}; - CUFINUFFT_FLT c12[] = {-3.3076333188770995E+01, -1.8970588549665433E+02, 1.8160423465108606E+03, -6.3715702906684537E+03, 1.2525623712293716E+04, -1.4199809613604592E+04, 6.4441857815348694E+03, 6.4441852068443368E+03, -1.4199811050333730E+04, 1.2525626046977848E+04, -6.3715705510753096E+03, 1.8160422724294601E+03, -1.8970588700494130E+02, -3.3076333169380085E+01}; - CUFINUFFT_FLT c13[] = {-1.4394533627757088E+01, 5.7000699312246105E+01, -1.0101141802233408E+02, -3.2954042015367456E+01, 6.1417873351558330E+02, -1.6177281811377129E+03, 2.4593356854220169E+03, -2.4593356782637338E+03, 1.6177289006539679E+03, -6.1417987494681950E+02, 3.2954142200289709E+01, 1.0101142888658896E+02, -5.7000698890466253E+01, 1.4394533639134110E+01}; - CUFINUFFT_FLT c14[] = {-1.5925952286169334E+00, 8.5113929411519127E+00, -2.8993517494090959E+01, 6.6373419665690747E+01, -1.0329523947888029E+02, 1.0280172537525394E+02, -4.3894765605046906E+01, -4.3897466711581743E+01, 1.0280269421314661E+02, -1.0329529425338121E+02, 6.6373405476301841E+01, -2.8993535416845578E+01, 8.5113925602355138E+00, -1.5925952196632756E+00}; - CUFINUFFT_FLT c15[] = {1.5984868375087002E-02, 1.2876155307218357E-01, -9.8359379953002779E-01, 3.7711056267887488E+00, -9.4307026856950991E+00, 1.6842022255882348E+01, -2.2310401016395307E+01, 2.2307954998498516E+01, -1.6843279237301534E+01, 9.4308852877255891E+00, -3.7711056267887488E+00, 9.8361025494556609E-01, -1.2876093931172500E-01, -1.5984859319657936E-02}; + constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 
4.6371263117318342E-06, 1.6070755785075502E-09}; + constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, -1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09}; + constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08}; + constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08}; + constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 3.9124194363164148E-08}; + constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08}; + constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, 
-1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08}; + constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09}; + constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09}; + constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10}; + constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, -1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11}; + constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 
5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12}; + constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13}; + constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13}; + constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14}; + constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, -3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); } else if (w==15) { - CUFINUFFT_FLT c0[] = {2.3939707792241831E+05, 9.7700272582690263E+08, 1.4715933396485272E+11, 4.7242424833337188E+12, 5.3987426629953602E+13, 
2.7580474290566097E+14, 7.0693378336533425E+14, 9.6196578554477812E+14, 7.0693378336533450E+14, 2.7580474290566138E+14, 5.3987426629953812E+13, 4.7242424833337275E+12, 1.4715933396485272E+11, 9.7700272582690227E+08, 2.3939707792241947E+05}; - CUFINUFFT_FLT c1[] = {1.4314487885226030E+06, 2.9961416925358467E+09, 3.0273361232748438E+11, 6.8507333793903594E+12, 5.4192702756911008E+13, 1.7551587948105312E+14, 2.1874615668430153E+14, 7.1650878467891699E-02, -2.1874615668430153E+14, -1.7551587948105331E+14, -5.4192702756911164E+13, -6.8507333793903701E+12, -3.0273361232748431E+11, -2.9961416925358462E+09, -1.4314487885226023E+06}; - CUFINUFFT_FLT c2[] = {3.8829497354762917E+06, 4.2473082696966438E+09, 2.8414312556015527E+11, 4.3688281331121411E+12, 2.1823119508000523E+13, 3.2228098609392012E+13, -2.1833085454691871E+13, -7.3750710225100922E+13, -2.1833085454691941E+13, 3.2228098609392000E+13, 2.1823119508000590E+13, 4.3688281331121475E+12, 2.8414312556015521E+11, 4.2473082696966453E+09, 3.8829497354762908E+06}; - CUFINUFFT_FLT c3[] = {6.3495763451755773E+06, 3.6841035003733954E+09, 1.5965774278321045E+11, 1.5630338683778196E+12, 3.8749058615819282E+12, -2.7319740087723496E+12, -1.3233342822865416E+13, 1.2094759019991106E-03, 1.3233342822865408E+13, 2.7319740087723706E+12, -3.8749058615819390E+12, -1.5630338683778196E+12, -1.5965774278321036E+11, -3.6841035003733935E+09, -6.3495763451755773E+06}; - CUFINUFFT_FLT c4[] = {7.0146619045520434E+06, 2.1782897863065772E+09, 5.8897780310148148E+10, 3.1953009601770453E+11, 4.0651527030852091E+08, -1.6379148273275527E+12, -1.1568753136999574E+11, 2.7451653250461855E+12, -1.1568753137002715E+11, -1.6379148273276675E+12, 4.0651527030276263E+08, 3.1953009601770386E+11, 5.8897780310148087E+10, 2.1782897863065767E+09, 7.0146619045520416E+06}; - CUFINUFFT_FLT c5[] = {5.5580012413990181E+06, 9.2345162185944223E+08, 1.4522950934020067E+10, 2.7025952371212223E+10, -1.2304576967641710E+11, -1.0116752717201025E+11, 3.8517418245457495E+11, 
1.1720185410178396E-01, -3.8517418245448737E+11, 1.0116752717220248E+11, 1.2304576967643900E+11, -2.7025952371215157E+10, -1.4522950934020073E+10, -9.2345162185944128E+08, -5.5580012413990190E+06}; - CUFINUFFT_FLT c6[] = {3.2693972344231787E+06, 2.8610260147425157E+08, 2.2348528403750129E+09, -3.4574515574239435E+09, -1.7480626463586948E+10, 3.1608597465528339E+10, 1.9879262560041798E+10, -6.6148013553832657E+10, 1.9879262560029728E+10, 3.1608597465497307E+10, -1.7480626463581020E+10, -3.4574515574192748E+09, 2.2348528403750839E+09, 2.8610260147425318E+08, 3.2693972344231806E+06}; - CUFINUFFT_FLT c7[] = {1.4553539959296260E+06, 6.4136842048383795E+07, 1.3622336582061595E+08, -1.2131510424646864E+09, 6.4322366984170294E+08, 4.5078753872136936E+09, -7.1689413747181644E+09, -1.1786171556070136E-02, 7.1689413746620741E+09, -4.5078753875125484E+09, -6.4322366985783029E+08, 1.2131510424602287E+09, -1.3622336582069945E+08, -6.4136842048384361E+07, -1.4553539959296270E+06}; - CUFINUFFT_FLT c8[] = {4.9358776531681529E+05, 9.7772970960589685E+06, -2.3511574237970300E+07, -1.0142613816602133E+08, 3.9421144218642426E+08, -2.8449115593954617E+08, -5.7549243245203042E+08, 1.1608781631399941E+09, -5.7549243247572994E+08, -2.8449115597919518E+08, 3.9421144214433813E+08, -1.0142613816466759E+08, -2.3511574237996321E+07, 9.7772970960581861E+06, 4.9358776531681448E+05}; - CUFINUFFT_FLT c9[] = {1.2660319987326673E+05, 7.7519511328176421E+05, -6.5244610661542173E+06, 9.0878257489026226E+06, 2.3116605620370809E+07, -8.7079594480778053E+07, 9.5542733720576629E+07, 4.2723164545317951E-02, -9.5542733670714036E+07, 8.7079594586736053E+07, -2.3116605561938088E+07, -9.0878257517268714E+06, 6.5244610661359569E+06, -7.7519511328043276E+05, -1.2660319987326747E+05}; - CUFINUFFT_FLT c10[] = {2.3793325531458449E+04, -4.2305332803592217E+04, -5.2884156986641441E+05, 2.5307340140247596E+06, -4.0404175229102052E+06, -1.7519991511035681E+05, 1.0146438775036881E+07, -1.5828545434039038E+07, 
1.0146438771144925E+07, -1.7520004460626876E+05, -4.0404175749208611E+06, 2.5307340154400147E+06, -5.2884156982771575E+05, -4.2305332803462676E+04, 2.3793325531458788E+04}; - CUFINUFFT_FLT c11[] = {2.9741655196842516E+03, -2.0687056404176896E+04, 3.3295507782231041E+04, 1.0661145714339131E+05, -5.6644238113375264E+05, 1.0874811579280477E+06, -9.6561272951275646E+05, -5.1287199081408294E-03, 9.6561272024221742E+05, -1.0874812519522079E+06, 5.6644242684715183E+05, -1.0661145918131116E+05, -3.3295507839673090E+04, 2.0687056403552484E+04, -2.9741655196846054E+03}; - CUFINUFFT_FLT c12[] = {1.5389176594851995E+02, -2.3864418514303975E+03, 1.0846266940782971E+04, -2.2940053288728755E+04, 1.4780109856545603E+04, 4.2663625334078126E+04, -1.3047651001642903E+05, 1.7468402233671257E+05, -1.3047651921148783E+05, 4.2663543727874072E+04, 1.4780033422571960E+04, -2.2940053360564565E+04, 1.0846266911599001E+04, -2.3864418523423406E+03, 1.5389176594715920E+02}; - CUFINUFFT_FLT c13[] = {-2.3857631312189291E+01, -1.9651605604649610E+01, 6.4183085202559698E+02, -2.8648428618202479E+03, 6.8249256924540387E+03, -9.7944454945500202E+03, 7.6177717113307281E+03, 1.2047808031005401E-02, -7.6177543637173221E+03, 9.7944303211006554E+03, -6.8249067869823548E+03, 2.8648410033462715E+03, -6.4183084900019139E+02, 1.9651606442715156E+01, 2.3857631312384541E+01}; - CUFINUFFT_FLT c14[] = {-6.1348505741956316E+00, 2.7872916029950378E+01, -6.5819949282243059E+01, 5.1366943137229264E+01, 1.7214074364107390E+02, -6.9658313160417026E+02, 1.3192072946885612E+03, -1.6053709652649356E+03, 1.3192033489278531E+03, -6.9663899461741221E+02, 1.7211498258980890E+02, 5.1367587332701412E+01, -6.5819942079787495E+01, 2.7872915852722411E+01, -6.1348505745937754E+00}; - CUFINUFFT_FLT c15[] = {-4.9671584494050897E-01, 3.0617548962871655E+00, -1.1650680501534040E+01, 3.0081518778147480E+01, -5.4027643304315461E+01, 6.6072752684824721E+01, -4.7155420133398515E+01, -5.6540863480770403E-03, 4.7158681490594240E+01, 
-6.6050534688928863E+01, 5.4059169757207428E+01, -3.0081909461561551E+01, 1.1650669885136919E+01, -3.0617550621683702E+00, 4.9671584460032286E-01}; - CUFINUFFT_FLT c16[] = {4.3460787769280373E-03, -1.3199805974685097E-02, -1.9413550415167488E-01, 1.1330353009743728E+00, -3.4412627904689330E+00, 7.1628360506506050E+00, -1.1104833360853762E+01, 1.2402582581952625E+01, -1.1114919494696498E+01, 7.0930736249049993E+00, -3.4864402649728556E+00, 1.1323392526753271E+00, -1.9415335680557039E-01, -1.3200242030886846E-02, 4.3460779753541788E-03}; + constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10}; + constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09}; + constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09}; + constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, 
-1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09}; + constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, -1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09}; + constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09}; + constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09}; + constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09}; + constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, 
-2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10}; + constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10}; + constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11}; + constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12}; + constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13}; + constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 
2.0428592180708626E-14, 2.4800914617770965E-14}; + constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15}; + constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16}; + constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); } else if (w==16) { - CUFINUFFT_FLT c0[] = {3.6434551345571090E+05, 2.0744705928579485E+09, 4.0355760945670044E+11, 1.6364575388763037E+13, 2.3514830376056556E+14, 1.5192201717462535E+15, 4.9956173084674140E+15, 8.9287666945127430E+15, 8.9287666945127430E+15, 4.9956173084674140E+15, 1.5192201717462535E+15, 2.3514830376056556E+14, 1.6364575388763041E+13, 4.0355760945670050E+11, 2.0744705928579490E+09, 3.6434551345570857E+05}; - CUFINUFFT_FLT c1[] = {2.2576246485480363E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 
2.5606844387131055E+13, 2.6313738449330159E+14, 1.1495095100701462E+15, 2.1932582707747572E+15, 1.2860244365132600E+15, -1.2860244365132588E+15, -2.1932582707747572E+15, -1.1495095100701462E+15, -2.6313738449330169E+14, -2.5606844387131066E+13, -8.7873753526056323E+11, -6.6499571180086451E+09, -2.2576246485480368E+06}; - CUFINUFFT_FLT c2[] = {6.3730995546265068E+06, 9.9060026035198040E+09, 8.8097248605448987E+11, 1.7953384130753676E+13, 1.2398425545001648E+14, 3.0749346493041212E+14, 1.0259777520247089E+14, -5.5291976457534288E+14, -5.5291976457534375E+14, 1.0259777520247070E+14, 3.0749346493041225E+14, 1.2398425545001656E+14, 1.7953384130753684E+13, 8.8097248605449011E+11, 9.9060026035198078E+09, 6.3730995546265068E+06}; - CUFINUFFT_FLT c3[] = {1.0896915393078225E+07, 9.0890343524593887E+09, 5.3565169504010028E+11, 7.3004206720038701E+12, 2.9692333044160082E+13, 1.6051737468109645E+13, -9.1273329108089531E+13, -8.5999306918502797E+13, 8.5999306918501641E+13, 9.1273329108090062E+13, -1.6051737468109594E+13, -2.9692333044160074E+13, -7.3004206720038711E+12, -5.3565169504010034E+11, -9.0890343524593887E+09, -1.0896915393078221E+07}; - CUFINUFFT_FLT c4[] = {1.2655725616100587E+07, 5.7342804054544201E+09, 2.1822836608899588E+11, 1.8300700858999731E+12, 2.7770431049858564E+12, -8.5034969223847109E+12, -1.2846668467422201E+13, 1.6519076896574611E+13, 1.6519076896573730E+13, -1.2846668467421688E+13, -8.5034969223849521E+12, 2.7770431049858491E+12, 1.8300700858999692E+12, 2.1822836608899588E+11, 5.7342804054544220E+09, 1.2655725616100591E+07}; - CUFINUFFT_FLT c5[] = {1.0609303958036324E+07, 2.6255609052371726E+09, 6.1673589426039383E+10, 2.6044432099084976E+11, -3.5431628074578320E+11, -1.6077602129636006E+12, 1.5534405614729011E+12, 2.8019935380861670E+12, -2.8019935380844810E+12, -1.5534405614727644E+12, 1.6077602129636335E+12, 3.5431628074576636E+11, -2.6044432099085037E+11, -6.1673589426039368E+10, -2.6255609052371726E+09, -1.0609303958036324E+07}; - CUFINUFFT_FLT c6[] 
= {6.6544809363384563E+06, 8.9490403680928385E+08, 1.1882638725190760E+10, 8.1552898137784090E+09, -1.2575562817891687E+11, 2.7074695075842178E+10, 3.9453789461922034E+11, -3.1679644857435541E+11, -3.1679644857440692E+11, 3.9453789461951154E+11, 2.7074695076007500E+10, -1.2575562817885344E+11, 8.1552898137852116E+09, 1.1882638725191153E+10, 8.9490403680928493E+08, 6.6544809363384582E+06}; - CUFINUFFT_FLT c7[] = {3.1906872142824987E+06, 2.2785946180651781E+08, 1.3744578972809656E+09, -4.3997172592913818E+09, -9.2011130754125404E+09, 3.4690551711826530E+10, -9.4227043395316906E+09, -5.9308465069991577E+10, 5.9308465068943581E+10, 9.4227043392705956E+09, -3.4690551712022408E+10, 9.2011130753675175E+09, 4.3997172592866106E+09, -1.3744578972812984E+09, -2.2785946180652174E+08, -3.1906872142824973E+06}; - CUFINUFFT_FLT c8[] = {1.1821527096621725E+06, 4.2281234059839047E+07, 2.8723226058821958E+07, -8.3553955857311106E+08, 1.2447304829054153E+09, 2.1955280944846683E+09, -7.0514195725593920E+09, 4.3745141235010500E+09, 4.3745141236655197E+09, -7.0514195727234411E+09, 2.1955280942826533E+09, 1.2447304829048812E+09, -8.3553955857841730E+08, 2.8723226058853466E+07, 4.2281234059838966E+07, 1.1821527096621748E+06}; - CUFINUFFT_FLT c9[] = {3.3854610744280228E+05, 5.2176984975098642E+06, -2.0677283564981934E+07, -3.5831818966960624E+07, 2.6599346104854527E+08, -3.7992777983589816E+08, -1.3426914439904341E+08, 9.1752051209279442E+08, -9.1752051188087845E+08, 1.3426914452369988E+08, 3.7992777987329507E+08, -2.6599346107659298E+08, 3.5831818968129277E+07, 2.0677283565073237E+07, -5.2176984975084374E+06, -3.3854610744280077E+05}; - CUFINUFFT_FLT c10[] = {7.3893334077309293E+04, 2.6983804209740972E+05, -3.6415998560880083E+06, 8.4025485863333493E+06, 4.9278860779347531E+06, -5.1437033824108891E+07, 8.7603898602732122E+07, -4.6199497846299231E+07, -4.6199498219926819E+07, 8.7603898832003579E+07, -5.1437033801464774E+07, 4.9278861005788362E+06, 8.4025485870409794E+06, 
-3.6415998559663831E+06, 2.6983804209585470E+05, 7.3893334077307591E+04}; - CUFINUFFT_FLT c11[] = {1.1778892113374410E+04, -4.0077190109195144E+04, -1.8372552183899941E+05, 1.3262878359201169E+06, -2.9738540144900386E+06, 1.9493508843214174E+06, 4.1881949043266159E+06, -1.1066749441324197E+07, 1.1066749225224417E+07, -4.1881949989500660E+06, -1.9493509811827433E+06, 2.9738539876374160E+06, -1.3262878392766861E+06, 1.8372552166916840E+05, 4.0077190106541901E+04, -1.1778892113374635E+04}; - CUFINUFFT_FLT c12[] = {1.2019749667905517E+03, -1.0378455845905968E+04, 2.6333352626226591E+04, 1.7117060824677988E+04, -2.5133287788479996E+05, 6.4713912423136400E+05, -8.1634971996757365E+05, 3.8623850687193515E+05, 3.8623887467457692E+05, -8.1634999581952032E+05, 6.4713888515965885E+05, -2.5133289397614688E+05, 1.7117056658162492E+04, 2.6333352590306949E+04, -1.0378455846607170E+04, 1.2019749667886601E+03}; - CUFINUFFT_FLT c13[] = {3.1189837633271310E+01, -8.9083493666530228E+02, 4.9454294721013366E+03, -1.3124691362129612E+04, 1.5834782149156119E+04, 6.9607783053915546E+03, -5.9789949050326162E+04, 1.0841720290002371E+05, -1.0841726183381994E+05, 5.9790023686287932E+04, -6.9607416211385053E+03, -1.5834800728954084E+04, 1.3124692508510609E+04, -4.9454294244132070E+03, 8.9083493795553227E+02, -3.1189837630675466E+01}; - CUFINUFFT_FLT c14[] = {-1.2975319073318561E+01, 1.8283698900397550E+01, 1.7684013462935113E+02, -1.1059907069976271E+03, 3.1998196269059799E+03, -5.5988285845467362E+03, 5.9248624962359208E+03, -2.5987075415506133E+03, -2.5989297031998472E+03, 5.9249309327755627E+03, -5.5988287659129119E+03, 3.1998292347735460E+03, -1.1059914993060199E+03, 1.7684017599586443E+02, 1.8283697951655380E+01, -1.2975319075406015E+01}; - CUFINUFFT_FLT c15[] = {-2.3155118737567935E+00, 1.1938503501764195E+01, -3.4150613932459848E+01, 4.8896713096147266E+01, 1.5844216816345641E+01, -2.4277080939345015E+02, 6.0146058115394737E+02, -8.8748160721868635E+02, 8.8732832343048744E+02, 
-6.0146927810646923E+02, 2.4275722040513463E+02, -1.5849652411671842E+01, -4.8897528435446198E+01, 3.4150596946224454E+01, -1.1938504032584051E+01, 2.3155118728820292E+00}; - CUFINUFFT_FLT c16[] = {-1.5401723736175238E-01, 9.8067757197686212E-01, -4.1901188293318530E+00, 1.2150691895619683E+01, -2.4764820628534302E+01, 3.6081462800085532E+01, -3.4534922277532473E+01, 1.2910251318703700E+01, 1.3098525817101535E+01, -3.4588714991360455E+01, 3.5973877372429698E+01, -2.4775747273530602E+01, 1.2149010873312557E+01, -4.1901467369287460E+00, 9.8067700766883559E-01, -1.5401723876450651E-01}; - CUFINUFFT_FLT c17[] = {1.1808835457017667E-02, -2.5443945538745794E-02, -1.3157119144786456E-04, 2.5877310634925382E-01, -1.0920774586473376E+00, 2.6473618304294715E+00, -4.4448325935254926E+00, 6.8292491990998831E+00, -6.8300632710034588E+00, 4.4643703192113184E+00, -2.6384070394901351E+00, 1.0890246890089277E+00, -2.5849326913239973E-01, 1.4031610447463365E-04, 2.5444280926035151E-02, -1.1808834729180664E-02}; + constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11}; + constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, -1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10}; + constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 
3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10}; + constexpr FLT c3[] = {1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09}; + constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09}; + constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09}; + constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10}; + constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 
1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, -1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10}; + constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10}; + constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11}; + constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12}; + constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, -1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, 
-1.2276300481460517E-12}; + constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, -8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13}; + constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15}; + constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15}; + constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 2.4132931361645452E-16}; + constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, 
-2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17}; + constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc new file mode 100644 index 000000000..e2fa229b7 --- /dev/null +++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc @@ -0,0 +1,171 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. 
+ if (w==2) { + constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01}; + constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01}; + constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02}; + constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); + } else if (w==3) { + constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01}; + constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01}; + constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01}; + constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02}; + constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + } else if (w==4) { + constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02}; + constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01}; + constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01}; + constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02}; + constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02}; + constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==5) { + constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 
2.5811126752233318E-02}; + constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02}; + constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02}; + constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02}; + constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05}; + constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==6) { + constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03}; + constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02}; + constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02}; + constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02}; + constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03}; + constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04}; + constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, 
-1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==7) { + constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03}; + constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03}; + constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03}; + constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03}; + constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03}; + constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04}; + constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05}; + constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==8) { + constexpr FLT c0[] = {5.2827275612461462E-04, 
4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04}; + constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03}; + constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03}; + constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03}; + constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04}; + constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04}; + constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07}; + constexpr FLT c7[] = {-5.1256159860509675E-06, 5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==9) { + constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 
1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04}; + constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04}; + constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04}; + constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04}; + constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, -2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04}; + constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05}; + constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06}; + constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07}; + constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, 
-1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==10) { + constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05}; + constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04}; + constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04}; + constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04}; + constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05}; + constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 2.9881406711975192E-04, -2.8169545035121488E-05, -3.1258610702677743E-05}; + constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, 
-5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06}; + constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07}; + constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07}; + constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==11) { + constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06}; + constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05}; + constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, -2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 
5.4612928019024885E-05}; + constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05}; + constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05}; + constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05}; + constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06}; + constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07}; + constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09}; + constexpr FLT c9[] = {-9.8984999695252047E-09, 1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 
1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==12) { + constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06}; + constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06}; + constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05}; + constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05}; + constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, -9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 9.7900673700247601E-06}; + constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 
1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06}; + constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06}; + constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07}; + constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08}; + constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09}; + constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, -6.3791928984134277E-10}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==13) { + constexpr FLT c0[] = 
{4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07}; + constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06}; + constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06}; + constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06}; + constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06}; + constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, 
-1.3873038503072801E-06}; + constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07}; + constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08}; + constexpr FLT c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08}; + constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10}; + constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10}; + constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, -2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 
2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07}; + constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, -3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07}; + constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07}; + constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06}; + constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, 
-9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07}; + constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07}; + constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07}; + constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, -6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08}; + constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09}; + constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, -3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10}; + constexpr FLT c10[] = 
{-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11}; + constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11}; + constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, -4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08}; + constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, 
-1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07}; + constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07}; + constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07}; + constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07}; + constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07}; + constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 
9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08}; + constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, -9.2554419735731158E-08, -1.4373673262756913E-08}; + constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09}; + constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10}; + constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11}; + constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, -3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 
8.3268199995541140E-12, 2.9809392327699276E-12}; + constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, -6.7275763610714771E-13}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 5.2012152104083968E-09}; + constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08}; + constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08}; + constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 
1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08}; + constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08}; + constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, -1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08}; + constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08}; + constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09}; + constexpr FLT 
c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09}; + constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10}; + constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, -1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11}; + constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14}; + constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 
2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13}; + constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else + printf("width not implemented!\n"); diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 826319516..3a9fd6877 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -39,39 +39,6 @@ template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { - switch (dim) { - case 1: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; - opts->gpu_binsizez = 1; - } break; - case 3: { - switch (opts->gpu_method) { - case 1: - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; - } break; - case 4: { - opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; - opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; - opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 
8 : opts->gpu_obinsizez; - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; - } break; - } - } break; - } -} - template int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol, cufinufft_plan_t **d_plan_ptr, cufinufft_opts *opts) { @@ -93,6 +60,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran Variables and arrays inside the plan struct are set and allocated. Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21. + Marco Barbone 07/26/24. Using SM when shared memory available is enough. */ int ier; cuDoubleComplex *d_a = nullptr; // fseries temp data @@ -109,17 +77,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } // Mult-GPU support: set the CUDA Device ID: - const int device_id = opts == NULL ? 0 : opts->gpu_device_id; + const int device_id = opts == nullptr ? 0 : opts->gpu_device_id; cufinufft::utils::WithCudaDevice device_swapper(device_id); /* allocate the plan structure, assign address to user pointer. */ - cufinufft_plan_t *d_plan = new cufinufft_plan_t; - *d_plan_ptr = d_plan; + auto *d_plan = new cufinufft_plan_t; + *d_plan_ptr = d_plan; // Zero out your struct, (sets all pointers to NULL) memset(d_plan, 0, sizeof(*d_plan)); - /* If a user has not supplied their own options, assign defaults for them. */ - if (opts == NULL) { // use default opts + if (opts == nullptr) { // use default opts cufinufft_default_opts(&(d_plan->opts)); } else { // or read from what's passed in d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect @@ -138,26 +105,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; - - /* Automatically set GPU method. 
*/ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster. - * However, in the special case of _double precision_ in _three dimensions_ - * with more than _three digits of precision_, there is note enough shared - * memory for this to work. As a result, we will default to method 1 (GM) in - * this special case. - * - * For type 2, we always default to method 1 (GM). */ - if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3)) - d_plan->opts.gpu_method = 2; - else if (type == 1 && tol < 1e-3) - d_plan->opts.gpu_method = 1; - else if (type == 2) - d_plan->opts.gpu_method = 1; - } - - /* Setup Spreader */ using namespace cufinufft::common; + /* Setup Spreader */ + // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { delete *d_plan_ptr; @@ -170,7 +120,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; - cufinufft_setup_binsize(type, dim, &d_plan->opts); + cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); + RETURN_IF_CUDA_ERROR + CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex); @@ -180,6 +132,37 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran if (dim > 2) set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, d_plan->opts.gpu_obinsizez); + + // dynamically request the maximum amount of shared memory available + // for the spreader + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster + * if there is enough shared memory available. Otherwise, we default to GM. + * + * For type 2, we always default to method 1 (GM). 
+ */ + if (type == 2) { + d_plan->opts.gpu_method = 1; + } else { + // query the device for the amount of shared memory available + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + RETURN_IF_CUDA_ERROR + // compute the amount of shared memory required for the method + const auto shared_mem_required = shared_memory_required( + dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + } else { + d_plan->opts.gpu_method = 2; + } + } + } + int fftsign = (iflag >= 0) ? 1 : -1; d_plan->nf1 = nf1; diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index da1c59930..2963d381d 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -2,16 +2,54 @@ #define __CUSPREADINTERP_H__ #include +#include #include #include namespace cufinufft { namespace spreadinterp { -template static __forceinline__ __device__ T fold_rescale(T x, int N) { - static constexpr const auto x2pi = T(0.159154943091895345554011992339482617); - const T result = x * x2pi + T(0.5); - return (result - floor(result)) * T(N); +template +static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { + if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even + return __fmaf_rn(a, b, c); + } else if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even + return __fma_rn(a, b, c); + } + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + return T{0}; +} + +template +constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { + constexpr auto x2pi = T(0.159154943091895345554011992339482617); + constexpr auto half = T(0.5); +#if defined(__CUDA_ARCH__) + if constexpr (std::is_same_v) { + // fused multiply-add, round to 
nearest even + auto result = __fmaf_rn(x, x2pi, half); + // subtract, round down + result = __fsub_rd(result, floorf(result)); + // multiply, round down + return __fmul_rd(result, static_cast(N)); + } else if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even + auto result = __fma_rn(x, x2pi, half); + // subtract, round down + result = __dsub_rd(result, floor(result)); + // multiply, round down + return __dmul_rd(result, static_cast(N)); + } else { + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + } +#else + const auto result = std::fma(x, x2pi, half); + return (result - std::floor(result)) * static_cast(N); +#endif } template @@ -22,11 +60,11 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) approximation to prolate spheroidal wavefunction (PSWF) of order 0. This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - if (abs(x) >= opts.ES_halfwidth) + if (abs(x) >= T(opts.ES_halfwidth)) // if spreading/FT careful, shouldn't need this if, but causes no speed hit return 0.0; else - return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x)); + return exp((T)opts.ES_beta * (sqrt((T)1.0 - (T)opts.ES_c * x * x) - (T)1.0)); } template @@ -41,7 +79,9 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0; + return abs(x) < ns / T(2.0) + ? exp((T)es_beta * (sqrt((T)1.0 - (T)es_c * x * x) - (T)1.0)) + : 0.0; } template @@ -52,13 +92,17 @@ static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, cons This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. 
Barnett 4/24/18 */ { - T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1] + const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] + // T z = 2 * x + w - 1.0; // insert the auto-generated code which expects z, w args, writes to ker... if (upsampfac == 2.0) { // floating point equality is fine here - using FLT = T; - using CUFINUFFT_FLT = T; + using FLT = T; #include "cufinufft/contrib/ker_horner_allw_loop.inc" } + if (upsampfac == 1.25) { // floating point equality is fine here + using FLT = T; +#include "cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc" + } } template diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 3455b99c0..4bfaa801d 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -12,6 +12,9 @@ #include +#include +#include + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) #else __inline__ __device__ double atomicAdd(double *address, double val) { @@ -68,6 +71,81 @@ template T infnorm(int n, std::complex *a) { } return sqrt(nrm); } + +#ifdef __CUDA_ARCH__ +__forceinline__ __device__ auto interval(const int ns, const float x) { + // float to int round up and fused multiply-add to round up + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); + // float to int round down and fused multiply-add to round down + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); + return int2{xstart, xend}; +} +__forceinline__ __device__ auto interval(const int ns, const double x) { + // same as above + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); + return int2{xstart, xend}; +} +#endif + +// Define a macro to check if NVCC version is >= 11.3 +#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) +#if (__CUDACC_VER_MAJOR__ > 11) || \ + (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) + +#define ALLOCA_SUPPORTED 1 +// windows 
compatibility +#if __has_include() +#include +#endif +#else +#define ALLOCA_SUPPORTED 0 +#endif +#else +#define ALLOCA_SUPPORTED 0 +#endif + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 900 +#define COMPUTE_CAPABILITY_90_OR_HIGHER 1 +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif + +/** + * does a complex atomic add on a shared memory address + * it adds the real and imaginary parts separately + * cuda does not support atomic operations + * on complex numbers on shared memory directly + */ + +template +static __forceinline__ __device__ void atomicAddComplexShared( + cuda_complex *address, cuda_complex res) { + const auto raw_address = reinterpret_cast(address); + atomicAdd(raw_address, res.x); + atomicAdd(raw_address + 1, res.y); +} + +/** + * does a complex atomic add on a global memory address + * since cuda 90 atomic operations on complex numbers + * on shared memory are supported so we leverage them + */ +template +static __forceinline__ __device__ void atomicAddComplexGlobal( + cuda_complex *address, cuda_complex res) { + if constexpr ( + std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { + atomicAdd(address, res); + } else { + atomicAddComplexShared(address, res); + } +} + } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 5f1079fde..ec95760fb 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -1,4 +1,10 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) -#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) \ No newline at end of file +target_compile_features(cuperftest PRIVATE cxx_std_17) +set_target_properties( + cuperftest + PROPERTIES LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 
17 + CUDA_STANDARD_REQUIRED ON) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py new file mode 100644 index 000000000..c22c2af9f --- /dev/null +++ b/perftest/cuda/bench.py @@ -0,0 +1,200 @@ +import matplotlib.pyplot as plt +import os +import subprocess +import pandas as pd +import numpy as np +import io +cwd = os.getcwd() + + +# function that runs a command line command and returns the output +# it also takes a list of arguments to pass to the command +def run_command(command, args): + # convert command and args to a string + try: + cmd = [command] + args + print("Running command:", ' '.join(cmd)) + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return result.stdout, result.stderr + except subprocess.CalledProcessError as e: + print('stdout output:\n', e.stdout) + print('stderr output:\n', e.stderr) + print("Error executing command:", e) + + +# function that builds a string from a dictionary of arguments + +def build_args(args): + args_list = [] + for key, value in args.items(): + args_list.append(key) + args_list.append(value) + return args_list + + +# function + +# example command to run: +# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 +# example arguments +args = {"--prec": "f", + "--n_runs": "5", + "--method": "0", + "--sort": "1", + "--N1": "16777216", + # "--N1": "256", + # "--N2": "256", + # "--N3": "256", + "--kerevalmethod": "1", + "--M": "1E8", + "--tol": "1E-6"} +# iterate over tol from 1E-6 to 1E-1 + +warmup = {"--prec": "f", + "--n_runs": "1", + "--method": "0", + "--N1": "256", + # "--N2": "256", + # "--N3": "256", + "--M": "256", + "--tol": "1E-1"} +cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(warmup) +print("Warmup") +stdout, stderr = run_command("nsys", cmd) +print("Benchmarking") +if stderr != '': + print(stderr) + exit(0) +for precision 
in ['d']: + print(f"precision: {precision}") + for dim in range(1, 2): + if dim == 1: + args["--N1"] = "16777216" + if dim == 2: + args["--N1"] = "256" + args["--N2"] = "256" + if dim == 3: + args["--N1"] = "256" + args["--N2"] = "256" + args["--N3"] = "256" + args["--prec"] = precision + max_range = 16 if args["--prec"] == "d" else 7 + if precision == 'd' and dim == 3: + max_range = 6 + print(f"dimensions {dim}") + data = { + 'method': [], + 'throughput': [], + 'tolerance': [], + # 'setpts': [], + 'exec': [], + } + for i in range(1, max_range): + args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) + print("Running with tol = 1E-" + str(i)) + for method in ['2', '1']: + args["--method"] = method + if method == '0': + data['method'].append('auto') + elif method == '1': + data['method'].append('GM') + elif method == '2': + data['method'].append('SM') + elif method == '4': + data['method'].append('BLOCK') + print("Method " + data['method'][-1]) + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) + stdout, stderr = run_command("nsys", cmd) + if stderr != '': + print(stderr) + exit(0) + # skip all lines starting with # in stdout + conf = [x for x in stdout.splitlines() if x.startswith("#")] + print('\n'.join(conf)) + stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] + + stdout = '\n'.join(stdout) + # convert stdout to a dataframe from csv string + dt = pd.read_csv(io.StringIO(stdout), sep=',') + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value + # print(f'setpts pts/s: {setpts}') + # print(f'exec pts/s: {exec}') + cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", 
"cuperftest_profile.nsys-rep", + "--format=csv", "--output", "cuperftest"] + stdout, _ = run_command("nsys", cmd) + # remove format from cmd + cmd = cmd[:-3] + # print(run_command("nsys", cmd)) + # print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") + # print(dt) + # sum the "Total Time" column of the ones that contain "fft" in name + # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) + total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() + # print(f'total_fft: {total_fft}') + # drop all the rows with spread not in "Name" + dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + # print(dt) + # exit(0) + # sort dt by column "Time (%)" + total_spread = dt['Duration (ns)'].sum() - total_fft + # print(f'total_spread: {total_spread}') + if total_fft > total_spread: + print("Warning: total_fft > total_spread") + # exit(0) + # pt/s + throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread + print(f'throughput: {throughput}') + data['throughput'].append(throughput) + data['tolerance'].append(args['--tol']) + # data['setpts'].append(setpts) + data['exec'].append(exec) + df = pd.DataFrame(data) + # Pivot the DataFrame + pivot_df = df.pivot(index='tolerance', columns='method') + # print(pivot_df) + # scale the throughput SM by GM + # pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] + # pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] + # scale setpts SM by GM + # pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] + # pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] + # remove the GM column + # pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) + pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) + pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) + print(pivot_df) +exit(0) +# Plot +pivot_df.plot(kind='bar', figsize=(10, 7)) +# Find the minimum throughput value +min_val = 
min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) +max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) +print(min_val, max_val) +plt.ylim(min_val * .90, max_val * 1.1) +# plt.ylim(.8, 1.2) + +# Calculate the smallest power of 10 +# min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) + +# Adjust the plot's y-axis limits +# plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin + +# plot an horizontal line at 1 with label "GM" +# plt.axhline(y=1, color='k', linestyle='--', label='GM') +plt.xlabel('Tolerance') +plt.ylabel('Throughput') +plt.title('Throughput by Tolerance and Method') +plt.legend(title='Method') +plt.tight_layout() +plt.show() +plt.xlabel("Tolerance") +plt.ylabel("Points/s") +plt.savefig("bench.png") +plt.savefig("bench.svg") +plt.savefig("bench.pdf") +plt.show() diff --git a/perftest/cuda/bench.sh b/perftest/cuda/bench.sh new file mode 100644 index 000000000..9832e1088 --- /dev/null +++ b/perftest/cuda/bench.sh @@ -0,0 +1,13 @@ +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 
--N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10 diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 26eaff491..a17b6f044 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -1,15 +1,11 @@ #include #include #include -#include -#include -#include -#include +#include #include #include -#include #include #include diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu index cd3637c8b..2bf69f6a2 100644 --- a/src/cuda/1d/interp1d_wrapper.cu +++ b/src/cuda/1d/interp1d_wrapper.cu @@ -1,14 +1,10 @@ #include #include -#include #include -#include #include #include -using namespace cufinufft::memtransfer; - #include "spreadinterp1d.cuh" namespace cufinufft { diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 26fd5024c..1b2afde7d 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -1,12 +1,12 @@ #include #include -#include #include #include #include #include +#include #include #include #include @@ -15,6 +15,7 @@ using namespace cufinufft::common; using namespace cufinufft::memtransfer; #include "spreadinterp1d.cuh" +#include namespace cufinufft { namespace spreadinterp { @@ -50,10 +51,30 @@ int cuspread1d(cufinufft_plan_t *d_plan, int blksize) return ier; } +template struct cmp : public thrust::binary_function { + + cmp(const T *kx) : kx(kx) {} + + __host__ __device__ bool operator()(const int a, const int b) const { + return fold_rescale(kx[a], 1) < fold_rescale(kx[b], 1); + } + +private: + const T *kx; +}; + template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { auto &stream = d_plan->stream; - + if (d_plan->opts.gpu_sort && d_plan->opts.gpu_method == 1) { + int *d_idxnupts = d_plan->idxnupts; + thrust::sequence(thrust::cuda::par.on(stream), 
d_idxnupts, d_idxnupts + M); + RETURN_IF_CUDA_ERROR + thrust::sort(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M, + cmp{d_plan->kx}); + RETURN_IF_CUDA_ERROR + return 0; + } if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { @@ -83,17 +104,16 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { thrust::device_ptr d_ptr(d_binsize); thrust::device_ptr d_result(d_binstartpts); thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + RETURN_IF_CUDA_ERROR calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1); RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; - trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, - d_idxnupts); + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); RETURN_IF_CUDA_ERROR } - return 0; } @@ -133,7 +153,6 @@ int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blks RETURN_IF_CUDA_ERROR } } - return 0; } @@ -145,33 +164,29 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) which only needs to be done once. 
*/ { - auto &stream = d_plan->stream; - int ier; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - int bin_size_x = d_plan->opts.gpu_binsizex; + const auto maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + const auto bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; return FINUFFT_ERR_BINSIZE_NOTVALID; } - int numbins = ceil((T)nf1 / bin_size_x); - - T *d_kx = d_plan->kx; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + const auto numbins = (nf1 + bin_size_x - 1) / bin_size_x; + const auto d_kx = d_plan->kx; + const auto d_binsize = d_plan->binsize; + const auto d_binstartpts = d_plan->binstartpts; + const auto d_sortidx = d_plan->sortidx; + const auto d_numsubprob = d_plan->numsubprob; + const auto d_subprobstartpts = d_plan->subprobstartpts; + const auto d_idxnupts = d_plan->idxnupts; + const auto stream = d_plan->stream; int *d_subprob_to_bin = nullptr; - if ((ier = - checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) - return ier; + cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream); + RETURN_IF_CUDA_ERROR calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx); RETURN_IF_CUDA_ERROR @@ -192,30 +207,25 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) d_ptr = thrust::device_pointer_cast(d_numsubprob); d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + RETURN_IF_CUDA_ERROR - if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; + cudaMemsetAsync(d_subprobstartpts, 0, 
sizeof(int), stream); + RETURN_IF_CUDA_ERROR - int totalnumsubprob; - if ((ier = - checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], - sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; + int totalnumsubprob{}; + cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), + cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors( - cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream, - d_plan->supports_pools)))) - return ier; + RETURN_IF_CUDA_ERROR + + cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream, + d_plan->supports_pools); + RETURN_IF_CUDA_ERROR + map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>( d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } - - assert(d_subprob_to_bin != NULL); + RETURN_IF_CUDA_ERROR + assert(d_subprob_to_bin != nullptr); cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools); d_plan->subprob_to_bin = d_subprob_to_bin; d_plan->totalnumsubprob = totalnumsubprob; @@ -251,15 +261,18 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = - (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + const auto sharedplanorysize = + shared_memory_required(1, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_1d_subprob, 1, 
*d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, @@ -268,6 +281,11 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) } } else { for (int t = 0; t < blksize; t++) { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 24b4fb9d2..72c776c06 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -8,6 +8,9 @@ #include #include #include + +#include + using namespace cufinufft::utils; namespace cufinufft { @@ -15,164 +18,173 @@ namespace spreadinterp { /* ------------------------ 1d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, T sigma, const int *idxnupts) { - int xx, ix; - T ker1[MAX_NSPREAD]; - - T x_rescaled; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - cnow = c[idxnupts[i]]; - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); - T kervalue = ker1[xx - xstart]; - atomicAdd(&fw[ix].x, cnow.x * kervalue); - atomicAdd(&fw[ix].y, cnow.y * kervalue); - } +template +__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, + cuda_complex *fw, int M, int ns, int nf1, T es_c, + T es_beta, T sigma, const int *idxnupts) { + // dynamic stack allocation to reduce stack usage +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns); + auto *__restrict__ ker1 = ker; +#else + T ker1[MAX_NSPREAD]; +#endif + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto cnow = c[idxnupts[i]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const T x1 = (T)xstart - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + + for (auto xx = xstart; xx <= xend; xx++) { + auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const T kervalue = ker1[xx - xstart]; + atomicAdd(&fw[ix].x, cnow.x * kervalue); + atomicAdd(&fw[ix].y, cnow.y * kervalue); } + } } /* Kernels for SubProb Method */ // SubProb properties -template -__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, int *bin_size, const T *x, - int *sortidx) { - int binx; - int oldidx; - T x_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 
0 : binx; - oldidx = atomicAdd(&bin_size[binx], 1); - sortidx[i] = oldidx; - if (binx >= nbinx) { - sortidx[i] = -binx; - } +template +__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, + int *bin_size, const T *x, int *sortidx) { + int binx; + int oldidx; + T x_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + oldidx = atomicAdd(&bin_size[binx], 1); + sortidx[i] = oldidx; + if (binx >= nbinx) { + sortidx[i] = -binx; } + } } -template -__global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nbinx, const int *bin_startpts, - const int *sortidx, const T *x, int *index, int nf1) { - int binx; - T x_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - - index[bin_startpts[binx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_idx_1d( + int M, int bin_size_x, int nbinx, const int *bin_startpts, const int *sortidx, + const T *x, int *index, int nf1) { + int binx; + T x_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 
0 : binx; + + index[bin_startpts[binx] + sortidx[i]] = i; + } } -template -__global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, - int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, xend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)); - T ker1[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - - T x_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]); - atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]); - } +template +__global__ void spread_1d_subprob( + const T *x, const cuda_complex *c, cuda_complex *fw, int M, uint8_t ns, int nf1, + T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, + 
int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; + + const int subpidx = blockIdx.x; + const int bidx = subprob_to_bin[subpidx]; + const int binsubp_idx = subpidx - subprobstartpts[bidx]; + const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + const int xoffset = (bidx % nbinx) * bin_size_x; + const auto ns_2 = (ns + 1) / 2; + const int N = bin_size_x + 2 * ns_2; + + // dynamic stack allocation +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns); + auto *__restrict__ ker1 = ker; +#else + T ker1[MAX_NSPREAD]; +#endif + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + + const T ns_2f = ns * T(.5); + + __syncthreads(); + + for (auto i = threadIdx.x; i < nupts; i += blockDim.x) { + const auto idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto cnow = c[idxnupts[idx]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const T x1 = T(xstart + xoffset) - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + if (ix >= (bin_size_x + ns_2) || ix < 0) break; + const cuda_complex result{cnow.x * ker1[xx - xstart], + cnow.y * ker1[xx - xstart]}; + atomicAddComplexShared(fwshared + ix, result); } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - ix = xoffset - ceil(ns / 2.0) + k; - if (ix < (nf1 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? 
ix - nf1 : ix); - atomicAdd(&fw[ix].x, fwshared[k].x); - atomicAdd(&fw[ix].y, fwshared[k].y); - } + } + __syncthreads(); + /* write to global memory */ + for (int k = threadIdx.x; k < N; k += blockDim.x) { + auto ix = xoffset - ns_2 + k; + if (ix < (nf1 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + atomicAddComplexGlobal(fw + ix, fwshared[k]); } + } } /* --------------------- 1d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, +template +__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, + const cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - T ker1[MAX_NSPREAD]; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - - T x1 = (T)xstart - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[ix].x * kervalue1; - cnow.y += fw[ix].y * kervalue1; - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + // dynamic stack allocation +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns); + auto *__restrict__ ker1 = ker; +#else + T ker1[MAX_NSPREAD]; +#endif + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto [xstart, xend] = interval(ns, x_rescaled); + + cuda_complex cnow{0, 0}; + + const T x1 = (T)xstart - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + for (int xx = xstart; xx <= xend; xx++) { + int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const T kervalue1 = ker1[xx - xstart]; + cnow.x += fw[ix].x * kervalue1; + cnow.y += fw[ix].y * kervalue1; } + c[idxnupts[i]] = cnow; + } } } // namespace spreadinterp diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index afc801b7f..f7f7b1559 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -1,14 +1,10 @@ -#include +#include #include #include -#include -#include - #include #include #include -#include #include using namespace cufinufft::deconvolve; diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index 533788482..0d3d3ff9b 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -1,13 +1,12 @@ -#include #include #include #include -#include +#include #include -using namespace cufinufft::memtransfer; +using namespace cufinufft::common; #include "spreadinterp2d.cuh" @@ -120,17 +119,14 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int *d_subprob_to_bin = d_plan->subprob_to_bin; int totalnumsubprob = d_plan->totalnumsubprob; - T sigma = d_plan->opts.upsampfac; - size_t 
sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + T sigma = d_plan->opts.upsampfac; + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -140,6 +136,7 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 69b2ba956..80cf9f8e9 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -1,5 +1,4 @@ #include -#include #include #include @@ -7,14 +6,13 @@ #include #include -#include +#include #include #include #include "spreadinterp2d.cuh" using namespace cufinufft::common; -using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { @@ -273,16 +271,17 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * - (bin_size_y + 2 * (int)ceil(ns / 2.0)) * - sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread2d_subprob] error: not enough shared 
memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -292,6 +291,11 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 558984ea1..53a243e7e 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -15,314 +15,330 @@ namespace spreadinterp { /* ------------------------ 2d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - int xstart, ystart, xend, yend; - int xx, yy, ix, iy; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - T kervalue1, kervalue2; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = 
fold_rescale(x[idxnupts[i]], nf1); - y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - cnow = c[idxnupts[i]]; - - xstart = ceil(x_rescaled - ns / 2.0); - ystart = ceil(y_rescaled - ns / 2.0); - xend = floor(x_rescaled + ns / 2.0); - yend = floor(y_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (yy = ystart; yy <= yend; yy++) { - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - outidx = ix + iy * nf1; - kervalue1 = ker1[xx - xstart]; - kervalue2 = ker2[yy - ystart]; - atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_nupts_driven( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto cnow = c[idxnupts[i]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + const auto x1 = (T)xstart - x_rescaled; + const auto y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + 
eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + for (auto yy = ystart; yy <= yend; yy++) { + for (auto xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const auto outidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + const auto kervalue2 = ker2[yy - ystart]; + const cuda_complex res{cnow.x * kervalue1 * kervalue2, + cnow.y * kervalue1 * kervalue2}; + atomicAddComplexGlobal(fw + outidx, res); + } + } + } } /* Kernels for SubProb Method */ // SubProb properties -template -__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny, +template +__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, + int bin_size_y, int nbinx, int nbiny, int *bin_size, T *x, T *y, int *sortidx) { - int binidx, binx, biny; - int oldidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - if (binx >= nbinx || biny >= nbiny) { - sortidx[i] = -biny; - } + int binidx, binx, biny; + int oldidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 
0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + binidx = binx + biny * nbinx; + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + if (binx >= nbinx || biny >= nbiny) { + sortidx[i] = -biny; } + } } -template -__global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, - const int *bin_startpts, const int *sortidx, const T *x, - const T *y, int *index, int nf1, int nf2) { - int binx, biny; - int binidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_2d( + int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, const int *bin_startpts, + const int *sortidx, const T *x, const T *y, int *index, int nf1, int nf2) { + int binx, biny; + int binidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 
0 : biny; + binidx = binx + biny * nbinx; + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } -template -__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - 
eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - T kervalue2 = ker2[yy - ystart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_subprob( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + const int subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + __syncthreads(); + + for (int i = 
threadIdx.x; i < nupts; i += blockDim.x) { + const int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + const auto cnow = c[idxnupts[idx]]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } + for (int yy = ystart; yy <= yend; yy++) { + const auto iy = yy + ns_2; + if (iy >= (bin_size_y + rounded_ns) || iy < 0) break; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + if (ix >= (bin_size_x + rounded_ns) || ix < 0) break; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue = ker1[xx - xstart] * ker2[yy - ystart]; + const cuda_complex res{cnow.x * kervalue, cnow.y * kervalue}; + atomicAddComplexShared(fwshared + outidx, res); + } + } + } + + __syncthreads(); + /* write to global memory */ + for (int k = threadIdx.x; k < N; k += blockDim.x) { + const auto i = k % (bin_size_x + rounded_ns); + const auto j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); + const auto outidx = ix + iy * nf1; + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + atomicAddComplexGlobal(fw + outidx, fwshared[sharedidx]); } + } } /* --------------------- 2d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - T y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); - int inidx = ix + iy * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2; - cnow.y += fw[inidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; +template +__global__ void interp_2d_nupts_driven( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + T x1 = (T)xstart - x_rescaled; + T y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + cuda_complex cnow{0, 0}; + for (int yy = ystart; yy <= yend; yy++) { + const T kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); + const auto inidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fw[inidx].x * kervalue1 * kervalue2; + cnow.y += fw[inidx].y * kervalue1 * kervalue2; + } + } + c[idxnupts[i]] = cnow; + } } /* Kernels for Subprob Method */ -template -__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } +template +__global__ void interp_2d_subprob( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif + + const auto subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const auto xoffset = (bidx % nbinx) * bin_size_x; + const auto yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + for (int k = threadIdx.x; k < N; k += blockDim.x) { + int i = k % (bin_size_x + rounded_ns); + int j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? 
iy - nf2 : iy); + const auto outidx = ix + iy * nf1; + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + fwshared[sharedidx] = fw[outidx]; } - __syncthreads(); - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - iy = yy + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[idx]] = cnow; + } + __syncthreads(); + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + cuda_complex cnow{0, 0}; + + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + 
eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + } + + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + const auto iy = yy + ns_2; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; + cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; + } } + c[idxnupts[idx]] = cnow; + } } } // namespace spreadinterp diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index ea0ef4a86..5977e6d5f 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -1,13 +1,10 @@ #include #include -#include -#include #include #include #include -#include #include #include diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index b42231d86..91379d3ae 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -1,15 +1,15 @@ -#include #include #include #include +#include "spreadinterp3d.cuh" +#include #include #include -#include "spreadinterp3d.cuh" - using namespace cufinufft::memtransfer; +using namespace cufinufft::common; namespace cufinufft { namespace spreadinterp { @@ -123,19 +123,16 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ int *d_subprob_to_bin = d_plan->subprob_to_bin; int totalnumsubprob = d_plan->totalnumsubprob; - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n"; - return 
FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + const auto sharedplanorysize = + shared_memory_required(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth == 1) { + cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan); interp_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, @@ -143,6 +140,7 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { + cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan); interp_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index fa67f95f8..475a888ac 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -1,5 +1,4 @@ #include -#include #include #include @@ -7,11 +6,11 @@ #include #include -#include +#include #include #include + using namespace cufinufft::common; -using namespace cufinufft::memtransfer; #include "spreadinterp3d.cuh" @@ -530,20 +529,19 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ int totalnumsubprob = d_plan->totalnumsubprob; int *d_subprob_to_bin = d_plan->subprob_to_bin; - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << 
"[cuspread3d_subprob] error: not enough shared memory (" - << sharedplanorysize << ")" << std::endl; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } - + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + const auto sharedplanorysize = + shared_memory_required(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + RETURN_IF_CUDA_ERROR spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, @@ -551,6 +549,11 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + RETURN_IF_CUDA_ERROR spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 838816a56..59b4661ff 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -4,554 +4,602 @@ #include #include +#include #include #include #include #include #include +using namespace cufinufft::utils; + namespace cufinufft { namespace spreadinterp { /* ---------------------- 3d Spreading Kernels -------------------------------*/ /* Kernels for bin sort NUpts */ -template -__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, int bin_size_y, - int bin_size_z, int nbinx, int nbiny, 
int nbinz, int *bin_size, const T *x, +template +__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, + int bin_size_y, int bin_size_z, int nbinx, + int nbiny, int nbinz, int *bin_size, const T *x, const T *y, const T *z, int *sortidx) { - int binidx, binx, biny, binz; - int oldidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - - binz = floor(z_rescaled / bin_size_z); - binz = binz >= nbinz ? binz - 1 : binz; - binz = binz < 0 ? 0 : binz; - binidx = binx + biny * nbinx + binz * nbinx * nbiny; - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - } + int binidx, binx, biny, binz; + int oldidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + + binz = floor(z_rescaled / bin_size_z); + binz = binz >= nbinz ? binz - 1 : binz; + binz = binz < 0 ? 
0 : binz; + binidx = binx + biny * nbinx + binz * nbinx * nbiny; + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + } } -template -__global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, - int nbiny, int nbinz, const int *bin_startpts, const int *sortidx, - const T *x, const T *y, const T *z, int *index, - int nf1, int nf2, int nf3) { - int binx, biny, binz; - int binidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binz = floor(z_rescaled / bin_size_z); - binz = binz >= nbinz ? binz - 1 : binz; - binz = binz < 0 ? 0 : binz; - binidx = common::calc_global_index_v2(binx, biny, binz, nbinx, nbiny, nbinz); - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_3d( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, int nbiny, + int nbinz, const int *bin_startpts, const int *sortidx, const T *x, const T *y, + const T *z, int *index, int nf1, int nf2, int nf3) { + int binx, biny, binz; + int binidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 
0 : biny; + binz = floor(z_rescaled / bin_size_z); + binz = binz >= nbinz ? binz - 1 : binz; + binz = binz < 0 ? 0 : binz; + binidx = common::calc_global_index_v2(binx, biny, binz, nbinx, nbiny, nbinz); + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } /* Kernels for NUptsdriven method */ -template -__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, - T sigma, const int *idxnupts) { - int xx, yy, zz, ix, iy, iz; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - T ker1val, ker2val, ker3val; - - T x_rescaled, y_rescaled, z_rescaled; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - z_rescaled = fold_rescale(z[idxnupts[i]], nf3); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int zstart = ceil(z_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - int zend = floor(z_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - T z1 = (T)zstart - z_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); - } +template +__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, + const cuda_complex *c, cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, + T es_beta, T sigma, const int *idxnupts) { +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ 
ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[i]], nf3); + + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + const auto [zstart, zend] = interval(ns, z_rescaled); + + const auto x1 = T(xstart) - x_rescaled; + const auto y1 = T(ystart) - y_rescaled; + const auto z1 = T(zstart) - z_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + } - for (zz = zstart; zz <= zend; zz++) { - ker3val = ker3[zz - zstart]; - for (yy = ystart; yy <= yend; yy++) { - ker2val = ker2[yy - ystart]; - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - ker1val = ker1[xx - xstart]; - T kervalue = ker1val * ker2val * ker3val; - atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); - atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto ker3val = ker3[zz - zstart]; + for (int yy = ystart; yy <= yend; yy++) { + const auto ker2val = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); + const int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); + const int outidx = ix + iy * nf1 + iz * nf1 * nf2; + const auto ker1val = ker1[xx - xstart]; + const auto kervalue = ker1val * ker2val * ker3val; + const cuda_complex res{c[idxnupts[i]].x * kervalue, + c[idxnupts[i]].y * kervalue}; + atomicAddComplexGlobal(fw + outidx, res); } + } } + } } /* Kernels for Subprob method */ -template -__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, - int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, - int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - int nbinz, int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - const int bidx = subprob_to_bin[blockIdx.x]; - const int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; - const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - const int xoffset = (bidx % nbinx) * bin_size_x; - const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; +template +__global__ void spread_3d_subprob( + T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, + int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, + int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, + int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + int nbinz, int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto fwshared 
= (cuda_complex *)sharedbuf; + + const int bidx = subprob_to_bin[blockIdx.x]; + const int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; + const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; + const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + + const int N = + (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns) * (bin_size_z + rounded_ns); + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + __syncthreads(); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int nuptsidx = idxnupts[ptstart + i]; + const auto x_rescaled = fold_rescale(x[nuptsidx], nf1); + const auto y_rescaled = fold_rescale(y[nuptsidx], nf2); + const auto z_rescaled = fold_rescale(z[nuptsidx], nf3); + const auto cnow = c[nuptsidx]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + 
eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - __syncthreads(); - - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - const int nuptsidx = idxnupts[ptstart + i]; - const T x_rescaled = fold_rescale(x[nuptsidx], nf1); - const T y_rescaled = fold_rescale(y[nuptsidx], nf2); - const T z_rescaled = fold_rescale(z[nuptsidx], nf3); - cuda_complex cnow = c[nuptsidx]; - - const int xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - const int ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - const int zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - const int xend = floor(x_rescaled + ns / 2.0) - xoffset; - const int yend = floor(y_rescaled + ns / 2.0) - yoffset; - const int zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - for (int zz = zstart; zz <= zend; zz++) { - const T kervalue3 = ker3[zz - zstart]; - const int iz = zz + ceil(ns / 2.0); - if (iz >= (bin_size_z + (int)ceil(ns / 2.0) * 2) || iz < 0) - break; - for (int yy = ystart; yy <= yend; yy++) { - const T kervalue2 = ker2[yy - ystart]; - const int iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - const int ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - const int outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 
2) * (bin_size_y + ceil(ns / 2.0) * 2); - const T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); - } - } + for (int zz = zstart; zz <= zend; zz++) { + const T kervalue3 = ker3[zz - zstart]; + const int iz = zz + ns_2; + if (iz >= (bin_size_z + (int)rounded_ns) || iz < 0) break; + for (int yy = ystart; yy <= yend; yy++) { + const T kervalue2 = ker2[yy - ystart]; + const int iy = yy + ns_2; + if (iy >= (bin_size_y + (int)rounded_ns) || iy < 0) break; + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx + ns_2; + if (ix >= (bin_size_x + (int)rounded_ns) || ix < 0) break; + const int outidx = ix + iy * (bin_size_x + rounded_ns) + + iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + const auto kervalue = ker1[xx - xstart] * kervalue2 * kervalue3; + const cuda_complex res{cnow.x * kervalue, cnow.y * kervalue}; + atomicAddComplexShared(fwshared + outidx, res); } + } } - __syncthreads(); - - /* write to global memory */ - for (int n = threadIdx.x; n < N; n += blockDim.x) { - const int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - const int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - const int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); - - int ix = xoffset - ceil(ns / 2.0) + i; - int iy = yoffset - ceil(ns / 2.0) + j; - int iz = zoffset - ceil(ns / 2.0) + k; - - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? 
iz - nf3 : iz); - const int outidx = ix + iy * nf1 + iz * nf1 * nf2; - const int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } + } + __syncthreads(); + + /* write to global memory */ + for (int n = threadIdx.x; n < N; n += blockDim.x) { + const int i = n % (bin_size_x + rounded_ns); + const int j = (n / (bin_size_x + rounded_ns)) % (bin_size_y + rounded_ns); + const int k = n / ((bin_size_x + rounded_ns) * (bin_size_y + rounded_ns)); + + int ix = xoffset - ns_2 + i; + int iy = yoffset - ns_2 + j; + int iz = zoffset - ns_2 + k; + + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2) && iz < (nf3 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); + const int outidx = ix + iy * nf1 + iz * nf1 * nf2; + const int sharedidx = i + j * (bin_size_x + rounded_ns) + + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + atomicAddComplexGlobal(fw + outidx, fwshared[sharedidx]); } + } } /* Kernels for BlockGather Method */ -template -__global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, - int nobiny, int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, - int *bin_size, const T *x, const T *y, const T *z, int *sortidx, - int nf1, int nf2, int nf3) { - int binidx, binx, biny, binz; - int oldidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - biny = floor(y_rescaled / bin_size_y); - binz = floor(z_rescaled / bin_size_z); - binx = binx / 
(binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); - biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); - binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); - - binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, - binsperobinz); - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - } +template +__global__ void locate_nupts_to_bins_ghost( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, + int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size, + const T *x, const T *y, const T *z, int *sortidx, int nf1, int nf2, int nf3) { + int binidx, binx, biny, binz; + int oldidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + biny = floor(y_rescaled / bin_size_y); + binz = floor(z_rescaled / bin_size_z); + binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); + biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); + binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); + + binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, + binsperobinx, binsperobiny, binsperobinz); + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + } } -template -__global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, - int nobinx, int nobiny, int nobinz, int binsperobinx, - int binsperobiny, int binsperobinz, int *bin_startpts, - const int *sortidx, const T *x, const T *y, const T *z, - int *index, int nf1, int nf2, int nf3) { - int binx, biny, binz; - int binidx; - T 
x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - biny = floor(y_rescaled / bin_size_y); - binz = floor(z_rescaled / bin_size_z); - binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); - biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); - binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); - - binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, - binsperobinz); - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_ghost( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, + int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_startpts, + const int *sortidx, const T *x, const T *y, const T *z, int *index, int nf1, int nf2, + int nf3) { + int binx, biny, binz; + int binidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + biny = floor(y_rescaled / bin_size_y); + binz = floor(z_rescaled / bin_size_z); + binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); + biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); + binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); + + binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, + binsperobinx, binsperobiny, binsperobinz); + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } 
-template -__global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, - T sigma, const int *binstartpts, int obin_size_x, int obin_size_y, - int obin_size_z, int binsperobin, int *subprob_to_bin, - const int *subprobstartpts, int maxsubprobsize, int nobinx, int nobiny, - int nobinz, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, zstart, xend, yend, zend; - int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew; - int subpidx = blockIdx.x; - int obidx = subprob_to_bin[subpidx]; - int bidx = obidx * binsperobin; - - int obinsubp_idx = subpidx - subprobstartpts[obidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; - int nupts = - min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - obinsubp_idx * maxsubprobsize); - - int xoffset = (obidx % nobinx) * obin_size_x; - int yoffset = (obidx / nobinx) % nobiny * obin_size_y; - int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; - - int N = obin_size_x * obin_size_y * obin_size_z; - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; +template +__global__ void spread_3d_block_gather( + const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, + const int *binstartpts, int obin_size_x, int obin_size_y, int obin_size_z, + int binsperobin, int *subprob_to_bin, const int *subprobstartpts, int maxsubprobsize, + int nobinx, int nobiny, int nobinz, const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + const int subpidx = blockIdx.x; + const int obidx = 
subprob_to_bin[subpidx]; + const int bidx = obidx * binsperobin; + + const int obinsubp_idx = subpidx - subprobstartpts[obidx]; + const int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; + const int nupts = + min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - + obinsubp_idx * maxsubprobsize); + + const int xoffset = (obidx % nobinx) * obin_size_x; + const int yoffset = (obidx / nobinx) % nobiny * obin_size_y; + const int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; + + const int N = obin_size_x * obin_size_y * obin_size_z; + +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + + __syncthreads(); + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + int nidx = idxnupts[ptstart + i]; + int b = nidx / M; + int box[3]; + for (int &d : box) { + d = b % 3; + if (d == 1) d = -1; + if (d == 2) d = 1; + b = b / 3; + } + const int ii = nidx % M; + const auto x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; + const auto y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; + const auto z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; + const auto cnow = c[ii]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + 
} else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - __syncthreads(); - - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int nidx = idxnupts[ptstart + i]; - int b = nidx / M; - int box[3]; - for (int d = 0; d < 3; d++) { - box[d] = b % 3; - if (box[d] == 1) - box[d] = -1; - if (box[d] == 2) - box[d] = 1; - b = b / 3; - } - int ii = nidx % M; - x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; - y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; - z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; - cnow = c[ii]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - xstartnew = xstart < 0 ? 0 : xstart; - ystartnew = ystart < 0 ? 0 : ystart; - zstartnew = zstart < 0 ? 0 : zstart; - xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; - yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; - zendnew = zend >= obin_size_z ? 
obin_size_z - 1 : zend; - - for (int zz = zstartnew; zz <= zendnew; zz++) { - T kervalue3 = ker3[zz - zstart]; - for (int yy = ystartnew; yy <= yendnew; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstartnew; xx <= xendnew; xx++) { - outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; - T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); - } - } + const auto xstartnew = xstart < 0 ? 0 : xstart; + const auto ystartnew = ystart < 0 ? 0 : ystart; + const auto zstartnew = zstart < 0 ? 0 : zstart; + const auto xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; + const auto yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; + const auto zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend; + + for (int zz = zstartnew; zz <= zendnew; zz++) { + const T kervalue3 = ker3[zz - zstart]; + for (int yy = ystartnew; yy <= yendnew; yy++) { + const T kervalue2 = ker2[yy - ystart]; + for (int xx = xstartnew; xx <= xendnew; xx++) { + const auto outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; + const T kervalue1 = ker1[xx - xstart]; + const cuda_complex res{cnow.x * kervalue1 * kervalue2 * kervalue3, + cnow.y * kervalue1 * kervalue2 * kervalue3}; + atomicAddComplexShared(fwshared + outidx, res); } + } } - __syncthreads(); - /* write to global memory */ - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % obin_size_x; - int j = (n / obin_size_x) % obin_size_y; - int k = n / (obin_size_x * obin_size_y); - - ix = xoffset + i; - iy = yoffset + j; - iz = zoffset + k; - outidx = ix + iy * nf1 + iz * nf1 * nf2; - atomicAdd(&fw[outidx].x, fwshared[n].x); - atomicAdd(&fw[outidx].y, fwshared[n].y); - } + } + __syncthreads(); + /* write to global memory */ + for (int n = threadIdx.x; n < N; n += blockDim.x) { + int i = n % obin_size_x; + int j = (n / obin_size_x) % obin_size_y; + int k = n / 
(obin_size_x * obin_size_y); + + const auto ix = xoffset + i; + const auto iy = yoffset + j; + const auto iz = zoffset + k; + const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; + atomicAdd(&fw[outidx].x, fwshared[n].x); + atomicAdd(&fw[outidx].y, fwshared[n].y); + } } /* ---------------------- 3d Interpolation Kernels ---------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_complex *c, - const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, - T es_beta, T sigma, int *idxnupts) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - T y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - T z_rescaled = fold_rescale(z[idxnupts[i]], nf3); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int zstart = ceil(z_rescaled - ns / 2.0); - - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - int zend = floor(z_rescaled + ns / 2.0); - - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart - z_rescaled, ns, es_c, es_beta); - } +template +__global__ void interp_3d_nupts_driven( + const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts) { +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = 
ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[i]], nf3); + + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + const auto [zstart, zend] = interval(ns, z_rescaled); + + const auto x1 = T(xstart) - x_rescaled; + const auto y1 = T(ystart) - y_rescaled; + const auto z1 = T(zstart) - z_rescaled; + + cuda_complex cnow{0, 0}; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + } - for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int inidx = ix + iy * nf1 + iz * nf2 * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; - cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3; - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto kervalue3 = ker3[zz - zstart]; + int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? 
yy - nf2 : yy); + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const int inidx = ix + iy * nf1 + iz * nf2 * nf1; + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; + cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3; } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + } } + c[idxnupts[i]] = cnow; + } } /* Kernels for SubProb Method */ -template -__global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, - const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, - int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, - const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend, zstart, zend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); - - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); - - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - 
ceil(ns / 2.0) + j; - iz = zoffset - ceil(ns / 2.0) + k; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } +template +__global__ void interp_3d_subprob( + const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, + const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, + int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto fwshared = (cuda_complex *)sharedbuf; + +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif + + const auto subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const auto xoffset = (bidx % nbinx) * bin_size_x; + const auto yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; + const auto zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + + const int 
N = + (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns) * (bin_size_z + rounded_ns); + + for (int n = threadIdx.x; n < N; n += blockDim.x) { + int i = n % (bin_size_x + rounded_ns); + int j = (n / (bin_size_x + rounded_ns)) % (bin_size_y + rounded_ns); + int k = n / ((bin_size_x + rounded_ns) * (bin_size_y + rounded_ns)); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + auto iz = zoffset - ns_2 + k; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2) && iz < (nf3 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); + const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; + int sharedidx = i + j * (bin_size_x + rounded_ns) + + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + fwshared[sharedidx] = fw[outidx]; + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[idx]], nf3); + cuda_complex cnow{0, 0}; + + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - __syncthreads(); - T ker1[MAX_NSPREAD]; - T 
ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - z_rescaled = fold_rescale(z[idxnupts[idx]], nf3); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - iz = zz + ceil(ns / 2.0); - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - iy = yy + ceil(ns / 2.0); - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3; - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto kervalue3 = ker3[zz - zstart]; + const auto iz = zz + ns_2; + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + const auto iy = 
yy + ns_2; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + const auto outidx = ix + iy * (bin_size_x + rounded_ns) + + iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3; + cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3; } - c[idxnupts[idx]].x = cnow.x; - c[idxnupts[idx]].y = cnow.y; + } } + c[idxnupts[idx]] = cnow; + } } } // namespace spreadinterp diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 77b86ae77..ae9431c31 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,7 +1,3 @@ -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp) @@ -47,8 +43,14 @@ target_include_directories(cufinufft_common_objects set_target_properties( cufinufft_common_objects PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) - + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) +target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17) +target_compile_options( + cufinufft_common_objects + PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) target_compile_options( cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) @@ -58,24 +60,32 @@ target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_target_properties( cufinufft_objects PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) +target_compile_features(cufinufft_objects PRIVATE cxx_std_17) target_compile_options( 
cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) if(FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED $ $) - set_target_properties( - cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) else() add_library(cufinufft STATIC $ $) - set_target_properties( - cufinufft PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) endif() +set_target_properties( + cufinufft + PROPERTIES CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) +target_compile_features(cufinufft PRIVATE cxx_std_17) + if(WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) diff --git a/src/cuda/common.cu b/src/cuda/common.cu index c6bf8315d..b19986520 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -199,6 +199,119 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex +std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, + int bin_size_z) { + // Helper to compute the shared memory required for the spreader when using SM + int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2; + + if (dim == 1) { + return adjusted_ns * sizeof(cuda_complex); + } + + adjusted_ns *= (bin_size_y + ((ns + 1) / 2) * 2); + + if (dim == 2) { + return adjusted_ns * sizeof(cuda_complex); + } + + adjusted_ns *= (bin_size_z + ((ns + 1) / 2) * 2); + + return adjusted_ns * sizeof(cuda_complex); +} + +// Function to find bin_size_x == bin_size_y +// where bin_size_x * bin_size_y * bin_size_z < mem_size +// TODO: this can be done without a loop by using a direct formula +template int find_bin_size(std::size_t mem_size, int dim, int ns) { + int binsize = 1; // Start with the smallest possible bin size + while (true) { + // Calculate the 
shared memory required for the current bin_size_x and bin_size_y + std::size_t required_memory = + shared_memory_required(dim, ns, binsize, binsize, binsize); + + // Check if the required memory is less than the available memory + if (required_memory > mem_size) { + // If the condition is met, return the current bin_size_x + return binsize - 1; + } + + // Increment bin_size_x for the next iteration + binsize++; + } +} + +template +void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { + // Marco Barbone 07/26/24. Using the shared memory available on the device, to + // determine the optimal binsize for the spreader. + // WARNING: This function does not check for CUDA errors, the caller should check and + // handle them. + // TODO: This can still be improved some sizes are hardcoded still + int shared_mem_per_block{}, device_id{}; + switch (dim) { + case 1: { + if (opts->gpu_binsizex == 0) { + cudaGetDevice(&device_id); + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + // CUDA error handled by the caller not checking them here. + // use 1/6 of the shared memory for the binsize + // From experiments on multiple GPUs this gives the best tradeoff. + // It is within 90% of the maximum performance for all GPUs tested. 
+ shared_mem_per_block /= 6; + const int bin_size = + shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; + opts->gpu_binsizex = bin_size; + } + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + if (opts->gpu_binsizex == 0 || opts->gpu_binsizey == 0) { + switch (opts->gpu_method) { + case 0: + case 2: { + opts->gpu_binsizex = 32; + opts->gpu_binsizey = 32; + } break; + case 1: { + cudaGetDevice(&device_id); + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + const auto binsize = find_bin_size(shared_mem_per_block, dim, ns); + // in 2D 1/6 is too small, it gets slower because of the excessive padding + opts->gpu_binsizex = binsize; + opts->gpu_binsizey = binsize; + } break; + } + } + opts->gpu_binsizez = 1; + } break; + case 3: { + switch (opts->gpu_method) { + case 0: + case 1: + case 2: { + if (opts->gpu_binsizex == 0 || opts->gpu_binsizey == 0 || opts->gpu_binsizez == 0) { + opts->gpu_binsizex = 16; + opts->gpu_binsizey = 16; + opts->gpu_binsizez = 2; + } + } break; + case 4: { + opts->gpu_obinsizex = (opts->gpu_obinsizex == 0) ? 8 : opts->gpu_obinsizex; + opts->gpu_obinsizey = (opts->gpu_obinsizey == 0) ? 8 : opts->gpu_obinsizey; + opts->gpu_obinsizez = (opts->gpu_obinsizez == 0) ? 8 : opts->gpu_obinsizez; + opts->gpu_binsizex = (opts->gpu_binsizex == 0) ? 4 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey == 0) ? 4 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez == 0) ? 
4 : opts->gpu_binsizez; + } break; + } + } break; + } +} + template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, std::complex *a, float *fwkerhalf, finufft_spread_opts opts); @@ -227,5 +340,15 @@ template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, finufft_spread_opts opts); template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, finufft_spread_opts opts); + +template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, + int bin_size_y, int bin_size_z); +template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, + int bin_size_y, int bin_size_z); + +template void cufinufft_setup_binsize(int type, int ns, int dim, + cufinufft_opts *opts); +template void cufinufft_setup_binsize(int type, int ns, int dim, + cufinufft_opts *opts); } // namespace common } // namespace cufinufft diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index c0066d049..c00bf8eba 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -102,26 +102,26 @@ void cufinufft_default_opts(cufinufft_opts *opts) { // sphinx tag (don't remove): @gpu_defopts_start // data handling opts... - opts->modeord = 0; + opts->modeord = 0; opts->gpu_device_id = 0; // diagnostic opts... opts->gpu_spreadinterponly = 0; // algorithm performance opts... 
- opts->gpu_method = 0; - opts->gpu_sort = 1; - opts->gpu_kerevalmeth = 1; - opts->upsampfac = 2.0; + opts->gpu_method = 0; + opts->gpu_sort = 1; + opts->gpu_kerevalmeth = 1; + opts->upsampfac = 2.0; opts->gpu_maxsubprobsize = 1024; - opts->gpu_obinsizex = -1; - opts->gpu_obinsizey = -1; - opts->gpu_obinsizez = -1; - opts->gpu_binsizex = -1; - opts->gpu_binsizey = -1; - opts->gpu_binsizez = -1; - opts->gpu_maxbatchsize = 0; - opts->gpu_stream = cudaStreamDefault; + opts->gpu_obinsizex = 0; + opts->gpu_obinsizey = 0; + opts->gpu_obinsizez = 0; + opts->gpu_binsizex = 0; + opts->gpu_binsizey = 0; + opts->gpu_binsizez = 0; + opts->gpu_maxbatchsize = 0; + opts->gpu_stream = cudaStreamDefault; // sphinx tag (don't remove): @gpu_defopts_end } } diff --git a/src/cuda/precision_independent.cu b/src/cuda/precision_independent.cu index 66cc5ca69..b2c0c292f 100644 --- a/src/cuda/precision_independent.cu +++ b/src/cuda/precision_independent.cu @@ -52,13 +52,6 @@ __global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstart } } -__global__ void trivial_global_sort_index_1d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; - i += gridDim.x * blockDim.x) { - index[i] = i; - } -} - /* spreadinterp 2d */ __global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 6ff91f8ca..98b5382bc 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -22,7 +22,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet // Must call before any kernel evals done. 
// Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h) { - if (upsampfac != 2.0) { // nonstandard sigma + if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma if (kerevalmeth == 1) { fprintf(stderr, "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", @@ -69,7 +69,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet ier = FINUFFT_WARN_EPS_TOO_SMALL; } opts.nspread = ns; - opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner) + opts.ES_halfwidth = T(ns * .5); // constants to help ker eval (except Horner) opts.ES_c = 4.0 / (T)(ns * ns); T betaoverns = 2.30; // gives decent betas for default sigma=2.0 diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 8d77d9fdc..d9c5d312b 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -1,4 +1,3 @@ - file(GLOB test_src "*.c*") foreach(srcfile ${test_src}) @@ -6,79 +5,83 @@ foreach(srcfile ${test_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) - target_link_libraries(${executable} PUBLIC cufinufft m) - set_target_properties(${executable} PROPERTIES - LINKER_LANGUAGE CUDA - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} - ) + find_library(MathLib m) + if(MathLib) + target_link_libraries(${executable} PUBLIC cufinufft ${MathLib}) + endif() + target_compile_features(${executable} PUBLIC cxx_std_17) + set_target_properties( + ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES + ${FINUFFT_CUDA_ARCHITECTURES}) message(STATUS "Adding test ${executable}" - " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" - " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}" - ) + " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" + " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") endforeach() -function(add_tests PREC REQ_TOL CHECK_TOL) - add_test( - NAME 
cufinufft1d1_test_GM_${PREC} - COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) +function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) + add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - add_test( - NAME cufinufft1d1_test_SM_${PREC} - COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft1d1_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - add_test( - NAME cufinufft1d2_test_GM_${PREC} - COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - add_test( - NAME cufinufft2d1_test_GM_${PREC} - COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) - add_test( - NAME cufinufft2d1_test_SM_${PREC} - COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft2d1_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) - add_test( - NAME cufinufft2d1many_test_GM_${PREC} - COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft2d1many_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test( - NAME cufinufft2d1many_test_SM_${PREC} - COMMAND cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft2d1many_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test( - NAME cufinufft2d2many_test_GM_${PREC} - COMMAND 
cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft2d2many_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test( - NAME cufinufft2d2many_test_SM_${PREC} - COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft2d2many_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test( - NAME cufinufft3d1_test_GM_${PREC} - COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) - if (${PREC} STREQUAL "float") - add_test( - NAME cufinufft3d1_test_SM_${PREC} - COMMAND cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + if(${PREC} STREQUAL "float") + add_test(NAME cufinufft3d1_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) - add_test( - NAME cufinufft3d1_test_block_${PREC} - COMMAND cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft3d1_test_block_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) - add_test( - NAME cufinufft3d2_test_SM_${PREC} - COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft3d2_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) endif() - add_test( - NAME cufinufft3d2_test_GM_${PREC} - COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} + ${PREC} ${UPSAMP}) endfunction() -add_tests(float 1e-5 2e-4) -add_tests(double 1e-12 1e-11) 
+add_tests(float 1e-5 2e-4 2.0) +add_tests(double 1e-12 1e-11 2.0) +add_tests(float 1e-5 2e-4 1.25) +add_tests(double 1e-8 1e-7 1.25) add_test(NAME cufinufft_public_api COMMAND public_api_test) add_test(NAME cufinufft_makeplan COMMAND test_makeplan) diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index 05b62025e..dbd6260ac 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -17,7 +17,8 @@ using cufinufft::utils::infnorm; template -int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) { +int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, + double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -88,6 +89,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) opts.gpu_method = method; opts.gpu_maxbatchsize = 1; + opts.upsampfac = upsampfac; int nmodes[3] = {N1, 1, 1}; int ntransf = 1; @@ -178,7 +180,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) } int main(int argc, char *argv[]) { - if (argc != 8) { + if (argc != 9) { fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n" "Arguments:\n" " method: One of\n" @@ -188,21 +190,23 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " precision: f or d\n"); + " precision: f or d\n" + " upsampfac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int M = atof(argv[4]); - const double tol = atof(argv[5]); - const double checktol = atof(argv[6]); - const int iflag = 1; - const char prec = argv[7][0]; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int M = atof(argv[4]); + const double tol = atof(argv[5]); + const double checktol = atof(argv[6]); 
+ const int iflag = 1; + const char prec = argv[7][0]; + const double upsampfac = atof(argv[8]); if (prec == 'f') - return run_test(method, type, N1, M, tol, checktol, iflag); + return run_test(method, type, N1, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, M, tol, checktol, iflag); + return run_test(method, type, N1, M, tol, checktol, iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index 4157f6230..f3b767f2e 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -18,7 +18,8 @@ using cufinufft::utils::infnorm; template -int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag) { +int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag, + double upsampfac) { std::cout << std::scientific << std::setprecision(3); thrust::host_vector x(M), y(M); @@ -88,9 +89,9 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int opts.gpu_method = method; opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, 1}; - int ntransf = 1; + opts.upsampfac = upsampfac; + int nmodes[3] = {N1, N2, 1}; + int ntransf = 1; cudaEventRecord(start); int ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); @@ -178,7 +179,7 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int } int main(int argc, char *argv[]) { - if (argc != 9) { + if (argc != 10) { fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n" "Arguments:\n" " method: One of\n" @@ -189,23 +190,25 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsampfac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = 
atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int M = atof(argv[5]); - const double tol = atof(argv[6]); - const double checktol = atof(argv[7]); - const char prec = argv[8][0]; - const int iflag = 1; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int M = atof(argv[5]); + const double tol = atof(argv[6]); + const double checktol = atof(argv[7]); + const char prec = argv[8][0]; + const double upsampfac = atof(argv[9]); + const int iflag = 1; if (prec == 'f') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, M, tol, checktol, iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index b4f3529e1..4afcd97dd 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -19,7 +19,7 @@ using cufinufft::utils::infnorm; template int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M, - T tol, T checktol, int iflag) { + T tol, T checktol, int iflag, double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -93,6 +93,7 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize opts.gpu_method = method; opts.gpu_maxbatchsize = maxbatchsize; + opts.upsampfac = upsampfac; int nmodes[3] = {N1, N2, 1}; cudaEventRecord(start); @@ -184,7 +185,7 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize } int main(int argc, char *argv[]) { - if (argc != 11) { + if (argc != 12) { fprintf(stderr, "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M " "tol checktol prec\n" @@ -199,7 +200,8 @@ int main(int argc, char 
*argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsampfac: upsampling factor\n"); return 1; } const int method = atoi(argv[1]); @@ -212,14 +214,15 @@ int main(int argc, char *argv[]) { const double tol = atof(argv[8]); const double checktol = atof(argv[9]); const char prec = argv[10][0]; + const double upsampfac = atof(argv[11]); const int iflag = 1; if (prec == 'f') return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, - iflag); + iflag, upsampfac); else if (prec == 'd') return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, - iflag); + iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index 933dda36d..67818c2b2 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -19,7 +19,7 @@ using cufinufft::utils::infnorm; template int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, - int iflag) { + int iflag, double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -94,9 +94,9 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check opts.gpu_method = method; opts.gpu_kerevalmeth = 1; opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, N3}; - int ntransf = 1; + opts.upsampfac = upsampfac; + int nmodes[3] = {N1, N2, N3}; + int ntransf = 1; cudaEventRecord(start); ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); @@ -190,7 +190,7 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check } int main(int argc, char *argv[]) { - if (argc < 10) { + if (argc != 11) { fprintf(stderr, "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n" "Arguments:\n" @@ -203,24 +203,26 @@ int main(int argc, char *argv[]) { " M: The number of 
non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsamplefac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int N3 = atof(argv[5]); - const int M = atof(argv[6]); - const double tol = atof(argv[7]); - const double checktol = atof(argv[8]); - const char prec = argv[9][0]; - const int iflag = 1; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int N3 = atof(argv[5]); + const int M = atof(argv[6]); + const double tol = atof(argv[7]); + const double checktol = atof(argv[8]); + const char prec = argv[9][0]; + const double upsampfac = atof(argv[10]); + const int iflag = 1; if (prec == 'f') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag, upsampfac); else return -1; }