diff --git a/CHANGELOG b/CHANGELOG
index c882f716f..cf842bb67 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -54,6 +54,17 @@ V 2.3.0beta (7/24/24)
 * cmake adding nvcc and msvc optimization flags
 * cmake supports sphinx
 * updated install docs
+* cuFINUFFT binsize is now a function of the shared memory available where
+  possible.
+* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
+* cuFINUFFT now uses the new normalized Horner coefficients and adds support
+  for upsampfac=1.25.
+* cuFINUFFT new compile flags for extra-vectorization, flushing single
+  precision denormals to 0 and using fma where possible.
+* cuFINUFFT using intrinsics in foldrescale and other places to increase
+  performance
+* cuFINUFFT using SM90 float2 vector atomicAdd where supported
+* cuFINUFFT making default binsize = 0
 
 V 2.2.0 (12/12/23)
 
diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt
index 9a376408e..45b9a5989 100644
--- a/devel/CMakeLists.txt
+++ b/devel/CMakeLists.txt
@@ -2,23 +2,25 @@ project(finufft_devel)
 # Set the minimum required version of CMake
 cmake_minimum_required(VERSION 3.5)
 
-
 # include cpm cmake, downloading it
-CPMAddPackage(
-    NAME benchmark
-    GITHUB_REPOSITORY google/benchmark
-    VERSION 1.8.3
-    OPTIONS "BENCHMARK_ENABLE_TESTING OFF"
-
-)
+cpmaddpackage(
+  NAME
+  benchmark
+  GITHUB_REPOSITORY
+  google/benchmark
+  VERSION
+  1.8.3
+  OPTIONS
+  "BENCHMARK_ENABLE_TESTING OFF")
 
-if (benchmark_ADDED)
-    # patch benchmark target
-    set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
+if(benchmark_ADDED)
+  # patch benchmark target
+  set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
 endif()
 
 add_executable(foldrescale foldrescale.cpp)
 target_link_libraries(foldrescale finufft benchmark xsimd)
 add_executable(padding padding.cpp)
+target_compile_features(padding PRIVATE cxx_std_17)
 target_link_libraries(padding finufft xsimd)
 target_compile_options(padding PRIVATE -march=native)
diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m
index 009e05ea4..51aa4e4e1 100644
--- a/devel/gen_all_horner_C_code.m
+++ b/devel/gen_all_horner_C_code.m
@@ -12,12 +12,12 @@
 
 for upsampfac = [2.0, 1.25];   % sigma: either 2 (default) or low (eg 5/4)
   fprintf('upsampfac = %g...\n',upsampfac)
-  
+
   ws = 2:16;
-  opts.wpad = true;    % pad kernel eval to multiple of 4
+  opts.wpad = false;    % false: do not pad kernel eval to a multiple of 4
 
-  if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w');
-  else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w');
+  if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
+  else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
   end
   fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
   fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
@@ -27,9 +27,9 @@
     fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
     str = gen_ker_horner_loop_C_code(w,d,beta,opts);
     if j==1                                % write switch statement
-      fwrite(fid,sprintf('  if constexpr(w==%d) {\n',w));
+      fwrite(fid,sprintf('  if (w==%d) {\n',w));
     else
-      fwrite(fid,sprintf('  } else if constexpr(w==%d) {\n',w));
+      fwrite(fid,sprintf('  } else if (w==%d) {\n',w));
     end
     for i=1:numel(str); fwrite(fid,['    ',str{i}]); end
   end
diff --git a/devel/gen_ker_horner_loop_C_code.m b/devel/gen_ker_horner_loop_C_code.m
index e2dd1b75a..059b6a4e1 100644
--- a/devel/gen_ker_horner_loop_C_code.m
+++ b/devel/gen_ker_horner_loop_C_code.m
@@ -38,9 +38,9 @@
   width = w;
 end
 for n=1:d+1                 % loop over poly coeff powers
-  s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1));
+  s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1));
   for i=2:width            % loop over segments
-    s = sprintf('%s, %.16E', s, C(n,i));      
+    s = sprintf('%s, %.16E', s, C(n,i));
   end
   str{n} = [s sprintf('};\n')];
 end
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 35ac5662c..27b193cd5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -11,6 +11,7 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf)
 
 foreach(EXAMPLE ${EXAMPLES})
   add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
+  target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
   target_link_libraries(${EXAMPLE} PRIVATE finufft)
   enable_asan(${EXAMPLE})
 endforeach()
@@ -18,6 +19,7 @@ endforeach()
 foreach(EXAMPLE ${EXAMPLES_C})
   add_executable(${EXAMPLE} ${EXAMPLE}.c)
   target_link_libraries(${EXAMPLE} PRIVATE finufft)
+  target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
   enable_asan(${EXAMPLE})
 endforeach()
 
@@ -25,6 +27,7 @@ if(FINUFFT_USE_OPENMP)
   foreach(EXAMPLE ${EXAMPLES_OPENMP})
     add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
     target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX)
+    target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
     enable_asan(${EXAMPLE})
   endforeach()
 endif()
diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt
index 0c9dba361..b9742a865 100644
--- a/examples/cuda/CMakeLists.txt
+++ b/examples/cuda/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 file(GLOB example_src "*.cpp")
 
 foreach(srcfile ${example_src})
@@ -7,4 +6,5 @@ foreach(srcfile ${example_src})
   add_executable(${executable} ${srcfile})
   target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
   target_link_libraries(${executable} cufinufft)
+  target_compile_features(${executable} PRIVATE cxx_std_17)
 endforeach()
diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h
index 7bddc188e..efa7eb7b1 100644
--- a/include/cufinufft/common.h
+++ b/include/cufinufft/common.h
@@ -4,6 +4,7 @@
 #include <cufft.h>
 #include <cufinufft/types.h>
 #include <cufinufft_opts.h>
+#include <finufft_errors.h>
 #include <finufft_spread_opts.h>
 
 #include <complex.h>
@@ -32,6 +33,38 @@ template<typename T>
 void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
                                    T *fwkerhalf, finufft_spread_opts opts);
 
+template<typename T>
+std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
+                                   int bin_size_z);
+
+template<typename T>
+void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);
+
+// Raises `kernel`'s dynamic shared-memory limit to what shared_memory_required<T> asks
+// for. Returns FINUFFT_ERR_INSUFFICIENT_SHMEM if that exceeds the device's opt-in
+// per-block maximum, else 0. WARNING: CUDA errors are not handled; the caller checks.
+template<typename T, typename V>
+int cufinufft_set_shared_memory(V *kernel, const int dim,
+                                const cufinufft_plan_t<T> &d_plan) {
+  int device_id{}, shared_mem_per_block{};
+  cudaGetDevice(&device_id);
+  const std::size_t shared_mem_required =
+      shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
+                                d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
+  cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
+                         device_id);
+  if (shared_mem_required > static_cast<std::size_t>(shared_mem_per_block)) {
+    fprintf(stderr,
+            "Error: Shared memory required per block is %zu bytes, but the device "
+            "supports only %d bytes.\n",
+            shared_mem_required, shared_mem_per_block);
+    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  }
+  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                       static_cast<int>(shared_mem_required)); // fits: bounded above
+  return 0;
+}
+
 } // namespace common
 } // namespace cufinufft
 #endif
diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h
index 3dade898e..c3a31bd2b 100644
--- a/include/cufinufft/contrib/helper_cuda.h
+++ b/include/cufinufft/contrib/helper_cuda.h
@@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
   return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
 }
 
-#define RETURN_IF_CUDA_ERROR                                         \
-  {                                                                  \
-    cudaError_t err = cudaGetLastError();                            \
-    if (err != cudaSuccess) {                                        \
-      printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
-      return FINUFFT_ERR_CUDA_FAILURE;                               \
-    }                                                                \
+#define RETURN_IF_CUDA_ERROR                                                         \
+  {                                                                                  \
+    cudaError_t err = cudaGetLastError();                                            \
+    if (err != cudaSuccess) {                                                        \
+      printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \
+             __FILE__, __LINE__);                                                    \
+      return FINUFFT_ERR_CUDA_FAILURE;                                               \
+    }                                                                                \
   }
 
 #define CUDA_FREE_AND_NULL(val, stream, pool_supported)                              \
diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc
index 32f2cff00..1f4c59e2a 100644
--- a/include/cufinufft/contrib/ker_horner_allw_loop.inc
+++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc
@@ -1,216 +1,205 @@
 // Code generated by gen_all_horner_C_code.m in finufft/devel
 // Authors: Alex Barnett & Ludvig af Klinteberg.
-// (C) 2018, The Simons Foundation, Inc.
+// (C) The Simons Foundation, Inc.
   if (w==2) {
-    CUFINUFFT_FLT c0[] = {4.5147043243215343E+01, 4.5147043243215336E+01};
-    CUFINUFFT_FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01};
-    CUFINUFFT_FLT c2[] = {-1.8395117920046662E+00, -1.8395117920046617E+00};
-    CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01};
-    CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00};
-    CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00};
-    for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i])))));
+    constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01};
+    constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01};
+    constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02};
+    constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01};
+    for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i])));
   } else if (w==3) {
-    CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02};
-    CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02};
-    CUFINUFFT_FLT c2[] = {1.7742692790454473E+02, -3.3149255274727807E+02, 1.7742692791117116E+02};
-    CUFINUFFT_FLT c3[] = {-1.5357716116473128E+01, -5.1917435849174007E-16, 1.5357716122720189E+01};
-    CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01};
-    CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00};
-    CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00};
-    for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i]))))));
+    constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01};
+    constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01};
+    constexpr FLT c2[] = {2.0160576446392536E-01, -3.7666666666667331E-01, 2.0160576447145470E-01};
+    constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02};
+    constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02};
+    constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03};
+    for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i])))));
   } else if (w==4) {
-    CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02};
-    CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03};
-    CUFINUFFT_FLT c2[] = {1.4186910680718343E+03, -1.3995339862725584E+03, -1.3995339862725591E+03, 1.4186910680718338E+03};
-    CUFINUFFT_FLT c3[] = {5.1133995502497419E+02, -1.4191608683682987E+03, 1.4191608683682980E+03, -5.1133995502497419E+02};
-    CUFINUFFT_FLT c4[] = {-4.8293622641173549E+01, 3.9393732546136526E+01, 3.9393732546137308E+01, -4.8293622641173634E+01};
-    CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01};
-    CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01};
-    CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00};
-    for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i])))))));
+    constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02};
+    constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01};
+    constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01};
+    constexpr FLT c3[] = {3.7516840869185789E-02, -1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02};
+    constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03};
+    constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03};
+    constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04};
+    for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i]))))));
   } else if (w==5) {
-    CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02};
-    CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03};
-    CUFINUFFT_FLT c2[] = {3.6092689177271218E+03, 7.7501368899498566E+03, -2.2704627332474989E+04, 7.7501368899498684E+03, 3.6092689177271227E+03};
-    CUFINUFFT_FLT c3[] = {1.9990077310495410E+03, -3.8875294641277278E+03, 3.8628399128660033E-12, 3.8875294641277342E+03, -1.9990077310495410E+03};
-    CUFINUFFT_FLT c4[] = {4.0071733590403858E+02, -1.5861137916762520E+03, 2.3839858699098813E+03, -1.5861137916762589E+03, 4.0071733590403880E+02};
-    CUFINUFFT_FLT c5[] = {-9.1301168206167731E+01, 1.2316471075214690E+02, 1.0425607383569405E-11, -1.2316471075215136E+02, 9.1301168206167446E+01};
-    CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01};
-    CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00};
-    CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00};
-    for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i]))))))));
+    constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02};
+    constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, -4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02};
+    constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02};
+    constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02};
+    constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03};
+    constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04};
+    constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04};
+    constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05};
+    for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i])))))));
   } else if (w==6) {
-    CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03};
-    CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03};
-    CUFINUFFT_FLT c2[] = {1.0023404568475091E+04, 9.0916650498360163E+04, -1.0095927514054625E+05, -1.0095927514054641E+05, 9.0916650498360133E+04, 1.0023404568484631E+04};
-    CUFINUFFT_FLT c3[] = {7.2536109410387417E+03, 4.8347162752603172E+03, -5.0512736602018493E+04, 5.0512736602018464E+04, -4.8347162752602935E+03, -7.2536109410297549E+03};
-    CUFINUFFT_FLT c4[] = {2.7021878300949775E+03, -7.8773465553971982E+03, 5.2105876478344171E+03, 5.2105876478344435E+03, -7.8773465553972501E+03, 2.7021878301048719E+03};
-    CUFINUFFT_FLT c5[] = {3.2120291706547602E+02, -1.8229189469937089E+03, 3.7928113414428362E+03, -3.7928113414427862E+03, 1.8229189469936987E+03, -3.2120291705638107E+02};
-    CUFINUFFT_FLT c6[] = {-1.2051267090537493E+02, 2.2400507411396228E+02, -1.2506575852544464E+02, -1.2506575852534223E+02, 2.2400507411397808E+02, -1.2051267089640046E+02};
-    CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01};
-    CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00};
-    CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00};
-    for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i])))))))));
+    constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03};
+    constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03};
+    constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02};
+    constexpr FLT c3[] = {7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03};
+    constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03};
+    constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04};
+    constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04};
+    constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, -1.1717219021517810E-04, 4.6695893931711504E-05};
+    constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06};
+    for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i]))))))));
   } else if (w==7) {
-    CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03};
-    CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04};
-    CUFINUFFT_FLT c2[] = {2.4458227486779248E+04, 5.3904618484139408E+05, 2.4315566181017426E+05, -1.6133959371974319E+06, 2.4315566181017403E+05, 5.3904618484139384E+05, 2.4458227486795098E+04};
-    CUFINUFFT_FLT c3[] = {2.1166189345881645E+04, 1.3382732160223144E+05, -3.3113450969689671E+05, -6.5160817568418758E-10, 3.3113450969689724E+05, -1.3382732160223127E+05, -2.1166189345866882E+04};
-    CUFINUFFT_FLT c4[] = {1.0542795672344866E+04, -7.0739172265096213E+03, -6.5563293056048453E+04, 1.2429734005960147E+05, -6.5563293056048846E+04, -7.0739172265096058E+03, 1.0542795672361211E+04};
-    CUFINUFFT_FLT c5[] = {2.7903491906228414E+03, -1.0975382873973065E+04, 1.3656979541144814E+04, 1.2638008605419305E-09, -1.3656979541144177E+04, 1.0975382873973065E+04, -2.7903491906078302E+03};
-    CUFINUFFT_FLT c6[] = {1.6069721418053450E+02, -1.5518707872250775E+03, 4.3634273936637373E+03, -5.9891976420593228E+03, 4.3634273936637110E+03, -1.5518707872251396E+03, 1.6069721419533406E+02};
-    CUFINUFFT_FLT c7[] = {-1.2289277373867886E+02, 2.8583630927743752E+02, -2.8318194617301111E+02, -8.6523823682922648E-10, 2.8318194617373905E+02, -2.8583630927755564E+02, 1.2289277375320185E+02};
-    CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01};
-    CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01};
-    CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00};
-    for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i]))))))))));
+    constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04};
+    constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03};
+    constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03};
+    constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, -1.3627105791872396E-02, -2.1552691780250210E-03};
+    constexpr FLT c4[] = {1.0735311014902868E-03, -7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03};
+    constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04};
+    constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05};
+    constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05};
+    constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, -1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06};
+    constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08};
+    for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i])))))))));
   } else if (w==8) {
-    CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03};
-    CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04};
-    CUFINUFFT_FLT c2[] = {5.4488498478251720E+04, 2.4101183255475122E+06, 6.4554051283428278E+06, -8.9200440393090621E+06, -8.9200440393090658E+06, 6.4554051283428278E+06, 2.4101183255475122E+06, 5.4488498478251720E+04};
-    CUFINUFFT_FLT c3[] = {5.3926359802542131E+04, 9.0469037926849292E+05, -6.0897036277696094E+05, -3.0743852105800072E+06, 3.0743852105800039E+06, 6.0897036277696339E+05, -9.0469037926849292E+05, -5.3926359802542116E+04};
-    CUFINUFFT_FLT c4[] = {3.2444118016247583E+04, 1.3079802224392195E+05, -5.8652889370128501E+05, 4.2333306008153502E+05, 4.2333306008153904E+05, -5.8652889370128524E+05, 1.3079802224392162E+05, 3.2444118016247587E+04};
-    CUFINUFFT_FLT c5[] = {1.1864306345505289E+04, -2.2700360645707628E+04, -5.0713607251413239E+04, 1.8308704458211805E+05, -1.8308704458211269E+05, 5.0713607251412053E+04, 2.2700360645707922E+04, -1.1864306345505289E+04};
-    CUFINUFFT_FLT c6[] = {2.2812256770903182E+03, -1.1569135767378117E+04, 2.0942387020799080E+04, -1.1661592834949530E+04, -1.1661592834949715E+04, 2.0942387020801576E+04, -1.1569135767377431E+04, 2.2812256770903446E+03};
-    CUFINUFFT_FLT c7[] = {8.5503535636805026E+00, -9.7513976461269635E+02, 3.8242995179157779E+03, -6.9201295567256420E+03, 6.9201295567222760E+03, -3.8242995179195914E+03, 9.7513976461218783E+02, -8.5503535636857091E+00};
-    CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02};
-    CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01};
-    CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01};
-    for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i]))))))))));
+    constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05};
+    constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04};
+    constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04};
+    constexpr FLT c3[] = {5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04};
+    constexpr FLT c4[] = {3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04};
+    constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04};
+    constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05};
+    constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08};
+    constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, -1.0444417486660073E-06};
+    constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07};
+    for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i])))))))));
   } else if (w==9) {
-    CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04};
-    CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04};
-    CUFINUFFT_FLT c2[] = {1.1335001341875963E+05, 9.0726133144784775E+06, 5.3501544534038082E+07, -2.6789524644150439E+05, -1.2483923718899380E+08, -2.6789524644173466E+05, 5.3501544534038067E+07, 9.0726133144785129E+06, 1.1335001341875964E+05};
-    CUFINUFFT_FLT c3[] = {1.2489113703229750E+05, 4.3035547171861930E+06, 6.3021978510598894E+06, -2.6014941986659020E+07, 2.8258041381448560E-08, 2.6014941986659355E+07, -6.3021978510598978E+06, -4.3035547171862079E+06, -1.2489113703229750E+05};
-    CUFINUFFT_FLT c4[] = {8.6425493435991229E+04, 1.0891182836653332E+06, -2.0713033564200329E+06, -2.8994941183505855E+06, 7.5905338661207352E+06, -2.8994941183504057E+06, -2.0713033564200525E+06, 1.0891182836653360E+06, 8.6425493435991244E+04};
-    CUFINUFFT_FLT c5[] = {3.8657354724013807E+04, 7.9936390113327987E+04, -7.0458265546792350E+05, 1.0151095605715724E+06, 8.7808418931366203E-08, -1.0151095605718571E+06, 7.0458265546792292E+05, -7.9936390113333473E+04, -3.8657354724013807E+04};
-    CUFINUFFT_FLT c6[] = {1.0779131453134632E+04, -3.3466718311303863E+04, -1.3245366619006214E+04, 1.8238470515351585E+05, -2.9285656292984058E+05, 1.8238470515350348E+05, -1.3245366619016511E+04, -3.3466718311298035E+04, 1.0779131453134652E+04};
-    CUFINUFFT_FLT c7[] = {1.4992527030548451E+03, -9.7024371533906651E+03, 2.3216330734046409E+04, -2.3465262819075571E+04, -3.7031099746142328E-08, 2.3465262819179152E+04, -2.3216330734079289E+04, 9.7024371533883768E+03, -1.4992527030548429E+03};
-    CUFINUFFT_FLT c8[] = {-7.9857427421137089E+01, -4.0585588534737309E+02, 2.6054813773474157E+03, -6.1806593581211082E+03, 8.0679596873751289E+03, -6.1806593581509942E+03, 2.6054813773256465E+03, -4.0585588535330419E+02, -7.9857427421164303E+01};
-    CUFINUFFT_FLT c9[] = {-7.1572272057931258E+01, 2.2785637019446185E+02, -3.9109820765219445E+02, 3.3597424707607246E+02, 1.7793576396134983E-08, -3.3597424727519928E+02, 3.9109820766111056E+02, -2.2785637019102543E+02, 7.1572272057951565E+01};
-    CUFINUFFT_FLT c10[] = {-9.8886360698029030E+00, 3.5359026948517517E+01, -8.5251867695464824E+01, 1.4285748015591199E+02, -1.6935269673908536E+02, 1.4285748008591776E+02, -8.5251867720434134E+01, 3.5359026945818123E+01, -9.8886360698009241E+00};
-    CUFINUFFT_FLT c11[] = {5.4050464453063796E-01, -1.7215219066697895E+00, 2.8631741265441102E+00, -2.3817977385844018E+00, -1.0173343205540475E-08, 2.3817977172440110E+00, -2.8631741497139487E+00, 1.7215219081941548E+00, -5.4050464453541269E-01};
-    for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i])))))))))));
+    constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05};
+    constexpr FLT c1[] = {6.0003223623206657E-05, 1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, -1.0569385595665032E-02, -6.0003223623206596E-05};
+    constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04};
+    constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04};
+    constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05};
+    constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05};
+    constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05};
+    constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06};
+    constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08};
+    constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, -4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 7.3256982980640781E-08};
+    constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08};
+    for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i]))))))))));
   } else if (w==10) {
-    CUFINUFFT_FLT c0[] = {2.2594586605749279E+04, 1.3595989066786604E+07, 4.4723032442444921E+08, 3.3781755837397542E+09, 8.6836783895849838E+09, 8.6836783895849819E+09, 3.3781755837397518E+09, 4.4723032442444921E+08, 1.3595989066786485E+07, 2.2594586605749315E+04};
-    CUFINUFFT_FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130477E+08, 2.4434902657508340E+09, 2.0073077861288924E+09, -2.0073077861288958E+09, -2.4434902657508330E+09, -5.9387966085130465E+08, -3.0651490267742820E+07, -1.0729981697645631E+05};
-    CUFINUFFT_FLT c2[] = {2.2340399734184594E+05, 3.0258214643190444E+07, 3.1512411458738214E+08, 4.3618276932319784E+08, -7.8178848450497377E+08, -7.8178848450497079E+08, 4.3618276932319820E+08, 3.1512411458738226E+08, 3.0258214643190306E+07, 2.2340399734184553E+05};
-    CUFINUFFT_FLT c3[] = {2.6917433004353492E+05, 1.6875651476661235E+07, 7.4664745481963485E+07, -9.5882157211117983E+07, -2.0622994435532546E+08, 2.0622994435532695E+08, 9.5882157211117893E+07, -7.4664745481963441E+07, -1.6875651476661157E+07, -2.6917433004353417E+05};
-    CUFINUFFT_FLT c4[] = {2.0818422772177903E+05, 5.6084730690362593E+06, 1.4435118192352918E+06, -4.0063869969543688E+07, 3.2803674392747905E+07, 3.2803674392747425E+07, -4.0063869969546065E+07, 1.4435118192351861E+06, 5.6084730690362072E+06, 2.0818422772177853E+05};
-    CUFINUFFT_FLT c5[] = {1.0781139496011089E+05, 9.9202615851199115E+05, -3.3266265543962144E+06, -4.8557049011465441E+05, 1.0176155522771550E+07, -1.0176155522773480E+07, 4.8557049011624791E+05, 3.3266265543963145E+06, -9.9202615851196367E+05, -1.0781139496011069E+05};
-    CUFINUFFT_FLT c6[] = {3.7380102688153507E+04, 1.2716675000354149E+04, -6.2163527451780590E+05, 1.4157962667182824E+06, -8.4419693137806712E+05, -8.4419693137792684E+05, 1.4157962667183836E+06, -6.2163527451768133E+05, 1.2716675000338953E+04, 3.7380102688153551E+04};
-    CUFINUFFT_FLT c7[] = {8.1238936393894865E+03, -3.4872365530450799E+04, 2.3913680325180554E+04, 1.2428850301840073E+05, -3.2158255329732876E+05, 3.2158255329921009E+05, -1.2428850301906197E+05, -2.3913680325219862E+04, 3.4872365530457639E+04, -8.1238936393893855E+03};
-    CUFINUFFT_FLT c8[] = {7.8515926628983277E+02, -6.6607899119362401E+03, 2.0167398338517272E+04, -2.8951401344174039E+04, 1.4622828141519254E+04, 1.4622828143473866E+04, -2.8951401346529910E+04, 2.0167398338405819E+04, -6.6607899119515532E+03, 7.8515926628964587E+02};
-    CUFINUFFT_FLT c9[] = {-1.0147176570533524E+02, -3.5304284183527621E+01, 1.3576976854816689E+03, -4.3921059353471846E+03, 7.3232085265419046E+03, -7.3232085280635902E+03, 4.3921059363220147E+03, -1.3576976854281722E+03, 3.5304284184270628E+01, 1.0147176570551520E+02};
-    CUFINUFFT_FLT c10[] = {-4.3161545259395531E+01, 1.5498490982051828E+02, -3.1771250772612478E+02, 3.7215448793727404E+02, -1.7181762882439287E+02, -1.7181763008770599E+02, 3.7215448759715150E+02, -3.1771250770992856E+02, 1.5498490982321766E+02, -4.3161545259481535E+01};
-    CUFINUFFT_FLT c11[] = {-4.2916172038404330E+00, 1.7402146068709751E+01, -4.7947588102062113E+01, 9.2697697983158491E+01, -1.2821427595919303E+02, 1.2821427694451660E+02, -9.2697698629471930E+01, 4.7947588133767717E+01, -1.7402146075416606E+01, 4.2916172038784923E+00};
-    CUFINUFFT_FLT c12[] = {3.5357495062947814E-01, -1.2828127005767840E+00, 2.4090120532215455E+00, -2.6448901913160028E+00, 1.1811546776400381E+00, 1.1811568523765217E+00, -2.6448918925210712E+00, 2.4090119216851607E+00, -1.2828127015358992E+00, 3.5357495059093369E-01};
-    for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i]))))))))))));
+    constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06};
+    constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05};
+    constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05};
+    constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05};
+    constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05};
+    constexpr FLT c5[] = {1.1063475580065299E-05, 1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05};
+    constexpr FLT c6[] = {3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 1.3049698816901833E-06, 3.8359011440648767E-06};
+    constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07};
+    constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08};
+    constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08};
+    constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09};
+    constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10};
+    for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i])))))))))));
   } else if (w==11) {
-    CUFINUFFT_FLT c0[] = {3.7794653219809574E+04, 3.4782300224660799E+07, 1.6188020733727567E+09, 1.7196758809615021E+10, 6.3754384857724678E+10, 9.7196447559193558E+10, 6.3754384857724640E+10, 1.7196758809615005E+10, 1.6188020733727570E+09, 3.4782300224660806E+07, 3.7794653219808897E+04};
-    CUFINUFFT_FLT c1[] = {1.8969206922085880E+05, 8.4769319065313682E+07, 2.4230555767723408E+09, 1.5439732722639105E+10, 2.7112836839612309E+10, 2.9154817084916870E-06, -2.7112836839612320E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05};
-    CUFINUFFT_FLT c2[] = {4.2138380313901423E+05, 9.2050522922791898E+07, 1.5259983101266611E+09, 4.7070559561237154E+09, -1.2448027572952452E+09, -1.0161446790279312E+10, -1.2448027572952352E+09, 4.7070559561237249E+09, 1.5259983101266615E+09, 9.2050522922791868E+07, 4.2138380313901143E+05};
-    CUFINUFFT_FLT c3[] = {5.4814313598122017E+05, 5.8085130777589574E+07, 4.9484006166551107E+08, 1.6222124676640958E+08, -2.0440440381345322E+09, -1.0628188648962249E-06, 2.0440440381345263E+09, -1.6222124676641047E+08, -4.9484006166551083E+08, -5.8085130777589560E+07, -5.4814313598121691E+05};
-    CUFINUFFT_FLT c4[] = {4.6495183529254969E+05, 2.3067199578027174E+07, 6.9832590192482829E+07, -2.2024799260683161E+08, -1.2820270942587741E+08, 5.1017181199130940E+08, -1.2820270942587276E+08, -2.2024799260684022E+08, 6.9832590192482591E+07, 2.3067199578027155E+07, 4.6495183529254753E+05};
-    CUFINUFFT_FLT c5[] = {2.7021781043532968E+05, 5.6764510325100143E+06, -5.5650761736747762E+06, -3.9907385617900737E+07, 7.2453390663686648E+07, 3.7361048615190248E-06, -7.2453390663685605E+07, 3.9907385617898554E+07, 5.5650761736747930E+06, -5.6764510325100180E+06, -2.7021781043532834E+05};
-    CUFINUFFT_FLT c6[] = {1.0933249308680615E+05, 6.9586821127986431E+05, -3.6860240321940281E+06, 2.7428169457723838E+06, 8.3392008440598147E+06, -1.6402201025051240E+07, 8.3392008440649221E+06, 2.7428169457788388E+06, -3.6860240321937916E+06, 6.9586821127989038E+05, 1.0933249308680584E+05};
-    CUFINUFFT_FLT c7[] = {3.0203516161820480E+04, -3.6879059542777912E+04, -4.1141031216801296E+05, 1.4111389975270075E+06, -1.5914376635392811E+06, 6.6766157119460594E-07, 1.5914376635341521E+06, -1.4111389975270815E+06, 4.1141031216760987E+05, 3.6879059542751726E+04, -3.0203516161820367E+04};
-    CUFINUFFT_FLT c8[] = {5.1670143574922804E+03, -2.8613147115365118E+04, 4.3560195427108687E+04, 4.8438679581840552E+04, -2.5856630639330545E+05, 3.7994883866097208E+05, -2.5856630640124826E+05, 4.8438679578319818E+04, 4.3560195426824532E+04, -2.8613147115371667E+04, 5.1670143574923577E+03};
-    CUFINUFFT_FLT c9[] = {3.0888018539742444E+02, -3.7949446187516196E+03, 1.4313303205035631E+04, -2.6681600236925929E+04, 2.3856005161221132E+04, -2.3276789125970764E-06, -2.3856005160840708E+04, 2.6681600234072768E+04, -1.4313303205083184E+04, 3.7949446187479048E+03, -3.0888018539723868E+02};
-    CUFINUFFT_FLT c10[] = {-8.3747489794255131E+01, 1.1948077479810485E+02, 4.8528498025870488E+02, -2.5024391115619069E+03, 5.3511195350414373E+03, -6.7655484152307990E+03, 5.3511195328171416E+03, -2.5024391120801879E+03, 4.8528498023710927E+02, 1.1948077481025226E+02, -8.3747489794331599E+01};
-    CUFINUFFT_FLT c11[] = {-2.2640047135555928E+01, 9.0840898549317998E+01, -2.1597187568776889E+02, 3.1511229085836396E+02, -2.4856618287164540E+02, 1.6489710183426948E-06, 2.4856618404233313E+02, -3.1511228957061689E+02, 2.1597187534632059E+02, -9.0840898568829203E+01, 2.2640047135641577E+01};
-    CUFINUFFT_FLT c12[] = {-1.6306382885945303E+00, 7.3325946569413265E+00, -2.3241017814397217E+01, 5.1715493697385526E+01, -8.2673003927086967E+01, 9.6489715222659115E+01, -8.2673013187251925E+01, 5.1715492855550593E+01, -2.3241018165160245E+01, 7.3325946421432624E+00, -1.6306382886373367E+00};
-    CUFINUFFT_FLT c13[] = {2.4409286936442823E-01, -7.8803147249892458E-01, 1.6467143668339987E+00, -2.1898241453519685E+00, 1.6350102449767006E+00, -1.1782931558589478E-06, -1.6350139430218933E+00, 2.1898230913723329E+00, -1.6467144225690411E+00, 7.8803147709023735E-01, -2.4409286927983653E-01};
-    for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i])))))))))))));
+    constexpr FLT c0[] = {3.8884809238313434E-07, 3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07};
+    constexpr FLT c1[] = {1.9516358260453364E-06, 8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06};
+    constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06};
+    constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06};
+    constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06};
+    constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06};
+    constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06};
+    constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, -3.1074712008817235E-07};
+    constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08};
+    constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09};
+    constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10};
+    constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10};
+    constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11};
+    for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i]))))))))))));
   } else if (w==12) {
-    CUFINUFFT_FLT c0[] = {6.1722991679853112E+04, 8.4789650417103723E+07, 5.4431675199498730E+09, 7.8788892335272293E+10, 4.0355760945670062E+11, 8.8071481911347974E+11, 8.8071481911347998E+11, 4.0355760945670068E+11, 7.8788892335272491E+10, 5.4431675199498854E+09, 8.4789650417103767E+07, 6.1722991679871629E+04};
-    CUFINUFFT_FLT c1[] = {3.2561466099406150E+05, 2.2112758120210624E+08, 8.9911609880089817E+09, 8.3059508064200928E+10, 2.3965569143469864E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210624E+08, -3.2561466099404282E+05};
-    CUFINUFFT_FLT c2[] = {7.6621098001581465E+05, 2.6026568260310274E+08, 6.4524338253008652E+09, 3.3729904113826797E+10, 2.8555202212474010E+10, -6.8998572040731583E+10, -6.8998572040731506E+10, 2.8555202212474064E+10, 3.3729904113826805E+10, 6.4524338253008747E+09, 2.6026568260310277E+08, 7.6621098001583852E+05};
-    CUFINUFFT_FLT c3[] = {1.0657807616803222E+06, 1.8144472126890999E+08, 2.5524827004349856E+09, 5.2112383911371746E+09, -1.0268350564014614E+10, -1.4763245309081245E+10, 1.4763245309081299E+10, 1.0268350564014664E+10, -5.2112383911371031E+09, -2.5524827004349875E+09, -1.8144472126890990E+08, -1.0657807616803090E+06};
-    CUFINUFFT_FLT c4[] = {9.7829638830158743E+05, 8.2222351241519973E+07, 5.5676911894064891E+08, -4.8739037675424922E+08, -2.7153428193077750E+09, 2.5627633609246840E+09, 2.5627633609247112E+09, -2.7153428193078070E+09, -4.8739037675429451E+08, 5.5676911894064677E+08, 8.2222351241519928E+07, 9.7829638830161165E+05};
-    CUFINUFFT_FLT c5[] = {6.2536876825113979E+05, 2.4702814073680263E+07, 4.1488431554845832E+07, -2.9274790542418414E+08, 1.0742154109193267E+08, 6.2185168968029702E+08, -6.2185168968023658E+08, -1.0742154109185636E+08, 2.9274790542422676E+08, -4.1488431554844096E+07, -2.4702814073680244E+07, -6.2536876825112442E+05};
-    CUFINUFFT_FLT c6[] = {2.8527714307528478E+05, 4.6266378435690189E+06, -1.0665598090791209E+07, -2.6048960239906937E+07, 9.1597254427339226E+07, -5.9794495983323507E+07, -5.9794495983287223E+07, 9.1597254427330941E+07, -2.6048960239925586E+07, -1.0665598090793334E+07, 4.6266378435690831E+06, 2.8527714307530422E+05};
-    CUFINUFFT_FLT c7[] = {9.2873647411234240E+04, 3.6630046787428786E+05, -3.1271047224731087E+06, 4.8612412939261831E+06, 3.3820440907802135E+06, -1.6880127953711823E+07, 1.6880127953682471E+07, -3.3820440907974164E+06, -4.8612412939092657E+06, 3.1271047224737639E+06, -3.6630046787430649E+05, -9.2873647411216807E+04};
-    CUFINUFFT_FLT c8[] = {2.0817947751046187E+04, -5.5660303410280452E+04, -1.9519783923293054E+05, 1.0804817251338358E+06, -1.8264985852948832E+06, 9.7602844964432076E+05, 9.7602844962242560E+05, -1.8264985853129351E+06, 1.0804817251129062E+06, -1.9519783923449527E+05, -5.5660303410338929E+04, 2.0817947751063308E+04};
-    CUFINUFFT_FLT c9[] = {2.7986023314784748E+03, -1.9404411093600604E+04, 4.3922624999853564E+04, -7.6450317375817094E+03, -1.5273911976404345E+05, 3.3223441450299282E+05, -3.3223441454103496E+05, 1.5273911977621692E+05, 7.6450317497551932E+03, -4.3922624998426982E+04, 1.9404411093646668E+04, -2.7986023314644040E+03};
-    CUFINUFFT_FLT c10[] = {6.7849020474186844E+01, -1.7921351307934926E+03, 8.4980694693463538E+03, -1.9742624859078383E+04, 2.4620674878200782E+04, -1.1676544885779787E+04, -1.1676544871958942E+04, 2.4620674838120303E+04, -1.9742624835582923E+04, 8.4980694640771490E+03, -1.7921351307934922E+03, 6.7849020488748664E+01};
-    CUFINUFFT_FLT c11[] = {-5.4577020998847871E+01, 1.3637112866755427E+02, 4.5513615487589092E+01, -1.1174001343792290E+03, 3.2018769324922364E+03, -5.0580351333780654E+03, 5.0580351424313239E+03, -3.2018769362383905E+03, 1.1174000937955741E+03, -4.5513610843875405E+01, -1.3637112870657899E+02, 5.4577021011919037E+01};
-    CUFINUFFT_FLT c12[] = {-1.0538365872424132E+01, 4.6577222490846609E+01, -1.2606964180937365E+02, 2.1881091191930210E+02, -2.3273402308837001E+02, 1.0274273857329082E+02, 1.0274268020620094E+02, -2.3273404553726701E+02, 2.1881091276113446E+02, -1.2606964815819696E+02, 4.6577222438230805E+01, -1.0538365860846021E+01};
-    CUFINUFFT_FLT c13[] = {-4.6087004128022252E-01, 2.5969759424153827E+00, -9.6946930749915676E+00, 2.4990050007153755E+01, -4.6013920149683365E+01, 6.2056948047986317E+01, -6.2056981293939970E+01, 4.6013908245461884E+01, -2.4990038356462701E+01, 9.6946952377382889E+00, -2.5969759165384922E+00, 4.6087004737535314E-01};
+    constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08};
+    constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07};
+    constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07};
+    constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06};
+    constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06};
+    constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07};
+    constexpr FLT c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07};
+    constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08};
+    constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08};
+    constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09};
+    constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11};
+    constexpr FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 5.6296594761077256E-11};
+    constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11};
+    constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13};
     for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i])))))))))))));
   } else if (w==13) {
-    CUFINUFFT_FLT c0[] = {9.8715725867495639E+04, 1.9828875496808118E+08, 1.7196758809614998E+10, 3.3083776881353607E+11, 2.2668873993375444E+12, 6.7734720591167598E+12, 9.6695220682534824E+12, 6.7734720591167471E+12, 2.2668873993375439E+12, 3.3083776881353534E+11, 1.7196758809614998E+10, 1.9828875496807906E+08, 9.8715725867495537E+04};
-    CUFINUFFT_FLT c1[] = {5.4491110456935503E+05, 5.4903670125539362E+08, 3.0879465445278172E+10, 3.9588436413399951E+11, 1.6860562536749778E+12, 2.4256447893117881E+12, 3.7318165868693593E-04, -2.4256447893117856E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538874E+08, -5.4491110456935491E+05};
-    CUFINUFFT_FLT c2[] = {1.3504711883426066E+06, 6.9286979077463162E+08, 2.4618123595484562E+10, 1.9493985627722598E+11, 3.9422703517046326E+11, -1.8678883613919931E+11, -8.5538079834550146E+11, -1.8678883613919705E+11, 3.9422703517046338E+11, 1.9493985627722586E+11, 2.4618123595484554E+10, 6.9286979077462578E+08, 1.3504711883426069E+06};
-    CUFINUFFT_FLT c3[] = {1.9937206140846494E+06, 5.2512029493765986E+08, 1.1253303793811754E+10, 4.6205527735932175E+10, -1.1607472377983284E+10, -1.6305241755642325E+11, 1.3350300616010507E-04, 1.6305241755642365E+11, 1.1607472377982744E+10, -4.6205527735932228E+10, -1.1253303793811750E+10, -5.2512029493765610E+08, -1.9937206140846484E+06};
-    CUFINUFFT_FLT c4[] = {1.9607419630386413E+06, 2.6425362558103889E+08, 3.1171259341747255E+09, 2.9839860297840505E+09, -1.9585031917561890E+10, -5.0666917387055302E+09, 3.6568794485482079E+10, -5.0666917387051382E+09, -1.9585031917561581E+10, 2.9839860297839398E+09, 3.1171259341747217E+09, 2.6425362558103737E+08, 1.9607419630386410E+06};
-    CUFINUFFT_FLT c5[] = {1.3593773865640301E+06, 9.1556445104158148E+07, 4.7074012944133490E+08, -1.1192579335657711E+09, -2.1090780087868552E+09, 5.2270306737949314E+09, 1.0058570913473114E-03, -5.2270306737942495E+09, 2.1090780087878082E+09, 1.1192579335658059E+09, -4.7074012944133729E+08, -9.1556445104157895E+07, -1.3593773865640303E+06};
-    CUFINUFFT_FLT c6[] = {6.8417206432039186E+05, 2.1561705510027003E+07, 7.5785249892988410E+06, -2.7456096030230397E+08, 3.4589095671043062E+08, 4.0256106808852541E+08, -1.0074306926606210E+09, 4.0256106809059316E+08, 3.4589095670995283E+08, -2.7456096030234104E+08, 7.5785249893005500E+06, 2.1561705510027427E+07, 6.8417206432039267E+05};
-    CUFINUFFT_FLT c7[] = {2.5248269397037479E+05, 3.0985559672615193E+06, -1.1816517087617906E+07, -8.2958498770340970E+06, 8.0546642347242445E+07, -1.0594657799535300E+08, -4.1868673222825360E-04, 1.0594657799426495E+08, -8.0546642347729877E+07, 8.2958498770339396E+06, 1.1816517087613177E+07, -3.0985559672620757E+06, -2.5248269397037491E+05};
-    CUFINUFFT_FLT c8[] = {6.7530100970876083E+04, 1.2373362326659705E+05, -2.1245597183259744E+06, 5.1047323238916462E+06, -1.4139444405955642E+06, -1.1818267554953648E+07, 2.0121548577168033E+07, -1.1818267556967378E+07, -1.4139444400679788E+06, 5.1047323236808330E+06, -2.1245597183310925E+06, 1.2373362326704434E+05, 6.7530100970875879E+04};
-    CUFINUFFT_FLT c9[] = {1.2421368748960791E+04, -5.0576243646949319E+04, -4.8878193435000605E+04, 6.5307896868984913E+05, -1.5497610128277773E+06, 1.5137725915373438E+06, 2.4159142842753925E-04, -1.5137725925842635E+06, 1.5497610128277773E+06, -6.5307896858028776E+05, 4.8878193437283131E+04, 5.0576243646456518E+04, -1.2421368748960884E+04};
-    CUFINUFFT_FLT c10[] = {1.2904654687546160E+03, -1.1169946055063081E+04, 3.3275109714208906E+04, -3.1765222279764806E+04, -5.9810981980285695E+04, 2.2355863005975721E+05, -3.1083591689740209E+05, 2.2355863472015061E+05, -5.9810982676856896E+04, -3.1765222445615127E+04, 3.3275109711790254E+04, -1.1169946054458416E+04, 1.2904654687550794E+03};
-    CUFINUFFT_FLT c11[] = {-1.9043622268985253E+01, -6.8296542226098870E+02, 4.2702512255472038E+03, -1.2165497337805051E+04, 1.9423733200245264E+04, -1.6010024156865491E+04, -1.8587318864580292E-04, 1.6010021504569266E+04, -1.9423732997327170E+04, 1.2165497443946821E+04, -4.2702512314786209E+03, 6.8296542157807858E+02, 1.9043622268681840E+01};
-    CUFINUFFT_FLT c12[] = {-3.0093984465812213E+01, 9.8972865698526618E+01, -9.7437039087669007E+01, -3.5079927282955276E+02, 1.5699250476860170E+03, -3.1287441993042225E+03, 3.8692185175061472E+03, -3.1287462825609659E+03, 1.5699252631952513E+03, -3.5079945803284346E+02, -9.7437044419281492E+01, 9.8972866145746991E+01, -3.0093984466256714E+01};
-    CUFINUFFT_FLT c13[] = {-4.3050286009571908E+00, 2.1108975820085092E+01, -6.4297196365104938E+01, 1.2922885252832501E+02, -1.6991814421468084E+02, 1.2655005406584399E+02, -2.7552199668252238E-05, -1.2655093214380580E+02, 1.6991796275475141E+02, -1.2922893349406868E+02, 6.4297198822227926E+01, -2.1108976183295965E+01, 4.3050286010617569E+00};
-    CUFINUFFT_FLT c14[] = {-1.0957333744888972E-01, 7.2949316377828033E-01, -3.4300810538238449E+00, 1.0470062030552395E+01, -2.2292087310650142E+01, 3.4570674930666925E+01, -3.9923385381532697E+01, 3.4573472104415345E+01, -2.2292369892227434E+01, 1.0470053799441445E+01, -3.4300825281782954E+00, 7.2949352704193948E-01, -1.0957333730383595E-01};
+    constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08};
+    constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08};
+    constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07};
+    constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07};
+    constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 2.7328509487415384E-05, 2.0277547837406108E-07};
+    constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07};
+    constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08};
+    constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08};
+    constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09};
+    constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, -1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09};
+    constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, -6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10};
+    constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, -1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12};
+    constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12};
+    constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13};
+    constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14};
     for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i]))))))))))))));
   } else if (w==14) {
-    CUFINUFFT_FLT c0[] = {1.5499533202966300E+05, 4.4723032442444748E+08, 5.1495083701694786E+10, 1.2904576022918081E+12, 1.1534950432785512E+13, 4.5650102198520516E+13, 8.8830582190032688E+13, 8.8830582190032672E+13, 4.5650102198520516E+13, 1.1534950432785535E+13, 1.2904576022918081E+12, 5.1495083701695145E+10, 4.4723032442444843E+08, 1.5499533202970150E+05};
-    CUFINUFFT_FLT c1[] = {8.9188339002980455E+05, 1.3065352538728631E+09, 9.9400185225815582E+10, 1.7136059013402410E+12, 1.0144146621675832E+13, 2.3034036018490723E+13, 1.4630967270448867E+13, -1.4630967270448859E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402410E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979524E+05};
-    CUFINUFFT_FLT c2[] = {2.3170473769379673E+06, 1.7532505043698246E+09, 8.6523535958354294E+10, 9.7455289065487329E+11, 3.2977972139362295E+12, 1.7874626001697771E+12, -6.1480918082634004E+12, -6.1480918082633994E+12, 1.7874626001697695E+12, 3.2977972139362256E+12, 9.7455289065487366E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06};
-    CUFINUFFT_FLT c3[] = {3.6089249230396431E+06, 1.4278058213962190E+09, 4.4296625537022438E+10, 2.9466624630419812E+11, 3.1903621584503357E+11, -9.8834691411254529E+11, -1.1072264714919219E+12, 1.1072264714919253E+12, 9.8834691411255261E+11, -3.1903621584503473E+11, -2.9466624630419775E+11, -4.4296625537022629E+10, -1.4278058213962216E+09, -3.6089249230396645E+06};
-    CUFINUFFT_FLT c4[] = {3.7733555140851741E+06, 7.8376718099107432E+08, 1.4443117772349600E+10, 4.3197433307419121E+10, -7.6585042240582489E+10, -1.8569640140761731E+11, 2.0385335192658878E+11, 2.0385335192657968E+11, -1.8569640140762405E+11, -7.6585042240578430E+10, 4.3197433307418945E+10, 1.4443117772349699E+10, 7.8376718099107552E+08, 3.7733555140852556E+06};
-    CUFINUFFT_FLT c5[] = {2.8079157920112349E+06, 3.0340753492383713E+08, 2.9498136661747351E+09, -6.2820200387927818E+08, -2.2372008390622681E+10, 1.5217518660587118E+10, 4.0682590266889229E+10, -4.0682590266876595E+10, -1.5217518660581694E+10, 2.2372008390624306E+10, 6.2820200387922049E+08, -2.9498136661747746E+09, -3.0340753492383796E+08, -2.8079157920112382E+06};
-    CUFINUFFT_FLT c6[] = {1.5361613559533113E+06, 8.3513615594416350E+07, 3.0077547202707732E+08, -1.3749596754069650E+09, -6.6733027297582805E+08, 5.9590333632825184E+09, -4.3025685566887646E+09, -4.3025685566943264E+09, 5.9590333632825480E+09, -6.6733027297550666E+08, -1.3749596754065177E+09, 3.0077547202710402E+08, 8.3513615594416887E+07, 1.5361613559533583E+06};
-    CUFINUFFT_FLT c7[] = {6.2759409419592936E+05, 1.5741723594963074E+07, -1.5632610223404476E+07, -1.9294824907080847E+08, 4.4643806532363749E+08, 1.5178998383416286E+07, -9.6771139892184162E+08, 9.6771139891756535E+08, -1.5178998386503356E+07, -4.4643806533349395E+08, 1.9294824907058707E+08, 1.5632610223392753E+07, -1.5741723594962660E+07, -6.2759409419590654E+05};
-    CUFINUFFT_FLT c8[] = {1.9151404903933575E+05, 1.7156606891565928E+06, -9.7733523156610541E+06, 4.2982266236283993E+06, 5.1660907884816565E+07, -1.1279400211055294E+08, 6.4701089573887214E+07, 6.4701089567399226E+07, -1.1279400211297083E+08, 5.1660907891780980E+07, 4.2982266233826252E+06, -9.7733523156971950E+06, 1.7156606891561027E+06, 1.9151404903936631E+05};
-    CUFINUFFT_FLT c9[] = {4.2715272622844830E+04, -2.2565910608684317E+03, -1.1769776156829668E+06, 4.0078399908543471E+06, -3.8951858064309461E+06, -5.0944610762301283E+06, 1.6765992441460442E+07, -1.6765992436785825E+07, 5.0944610781778852E+06, 3.8951858054570677E+06, -4.0078399907569592E+06, 1.1769776157156830E+06, 2.2565910609040961E+03, -4.2715272622820310E+04};
-    CUFINUFFT_FLT c10[] = {6.4806786522791654E+03, -3.5474227032931303E+04, 1.8237100723206047E+04, 3.0934714627485734E+05, -1.0394703921956274E+06, 1.4743920336239333E+06, -7.3356882129423053E+05, -7.3356882916659222E+05, 1.4743920340662012E+06, -1.0394703928590287E+06, 3.0934714634119731E+05, 1.8237100680361433E+04, -3.5474227032996088E+04, 6.4806786523011797E+03};
-    CUFINUFFT_FLT c11[] = {4.9913632908432180E+02, -5.5416668526903932E+03, 2.0614058707628108E+04, -3.2285139177838235E+04, -5.3099560012237780E+03, 1.1559000312360718E+05, -2.2569743818692098E+05, 2.2569743267254104E+05, -1.1559000606061178E+05, 5.3099530192621614E+03, 3.2285139062955688E+04, -2.0614058671415001E+04, 5.5416668535488525E+03, -4.9913632906175445E+02};
-    CUFINUFFT_FLT c12[] = {-3.3076333188770995E+01, -1.8970588549665433E+02, 1.8160423465108606E+03, -6.3715702906684537E+03, 1.2525623712293716E+04, -1.4199809613604592E+04, 6.4441857815348694E+03, 6.4441852068443368E+03, -1.4199811050333730E+04, 1.2525626046977848E+04, -6.3715705510753096E+03, 1.8160422724294601E+03, -1.8970588700494130E+02, -3.3076333169380085E+01};
-    CUFINUFFT_FLT c13[] = {-1.4394533627757088E+01, 5.7000699312246105E+01, -1.0101141802233408E+02, -3.2954042015367456E+01, 6.1417873351558330E+02, -1.6177281811377129E+03, 2.4593356854220169E+03, -2.4593356782637338E+03, 1.6177289006539679E+03, -6.1417987494681950E+02, 3.2954142200289709E+01, 1.0101142888658896E+02, -5.7000698890466253E+01, 1.4394533639134110E+01};
-    CUFINUFFT_FLT c14[] = {-1.5925952286169334E+00, 8.5113929411519127E+00, -2.8993517494090959E+01, 6.6373419665690747E+01, -1.0329523947888029E+02, 1.0280172537525394E+02, -4.3894765605046906E+01, -4.3897466711581743E+01, 1.0280269421314661E+02, -1.0329529425338121E+02, 6.6373405476301841E+01, -2.8993535416845578E+01, 8.5113925602355138E+00, -1.5925952196632756E+00};
-    CUFINUFFT_FLT c15[] = {1.5984868375087002E-02, 1.2876155307218357E-01, -9.8359379953002779E-01, 3.7711056267887488E+00, -9.4307026856950991E+00, 1.6842022255882348E+01, -2.2310401016395307E+01, 2.2307954998498516E+01, -1.6843279237301534E+01, 9.4308852877255891E+00, -3.7711056267887488E+00, 9.8361025494556609E-01, -1.2876093931172500E-01, -1.5984859319657936E-02};
+    constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 4.6371263117318342E-06, 1.6070755785075502E-09};
+    constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, -1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09};
+    constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08};
+    constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08};
+    constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 3.9124194363164148E-08};
+    constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08};
+    constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, -1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08};
+    constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09};
+    constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09};
+    constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10};
+    constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, -1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11};
+    constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12};
+    constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13};
+    constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13};
+    constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14};
+    constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, -3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16};
     for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i])))))))))))))));
   } else if (w==15) {
-    CUFINUFFT_FLT c0[] = {2.3939707792241831E+05, 9.7700272582690263E+08, 1.4715933396485272E+11, 4.7242424833337188E+12, 5.3987426629953602E+13, 2.7580474290566097E+14, 7.0693378336533425E+14, 9.6196578554477812E+14, 7.0693378336533450E+14, 2.7580474290566138E+14, 5.3987426629953812E+13, 4.7242424833337275E+12, 1.4715933396485272E+11, 9.7700272582690227E+08, 2.3939707792241947E+05};
-    CUFINUFFT_FLT c1[] = {1.4314487885226030E+06, 2.9961416925358467E+09, 3.0273361232748438E+11, 6.8507333793903594E+12, 5.4192702756911008E+13, 1.7551587948105312E+14, 2.1874615668430153E+14, 7.1650878467891699E-02, -2.1874615668430153E+14, -1.7551587948105331E+14, -5.4192702756911164E+13, -6.8507333793903701E+12, -3.0273361232748431E+11, -2.9961416925358462E+09, -1.4314487885226023E+06};
-    CUFINUFFT_FLT c2[] = {3.8829497354762917E+06, 4.2473082696966438E+09, 2.8414312556015527E+11, 4.3688281331121411E+12, 2.1823119508000523E+13, 3.2228098609392012E+13, -2.1833085454691871E+13, -7.3750710225100922E+13, -2.1833085454691941E+13, 3.2228098609392000E+13, 2.1823119508000590E+13, 4.3688281331121475E+12, 2.8414312556015521E+11, 4.2473082696966453E+09, 3.8829497354762908E+06};
-    CUFINUFFT_FLT c3[] = {6.3495763451755773E+06, 3.6841035003733954E+09, 1.5965774278321045E+11, 1.5630338683778196E+12, 3.8749058615819282E+12, -2.7319740087723496E+12, -1.3233342822865416E+13, 1.2094759019991106E-03, 1.3233342822865408E+13, 2.7319740087723706E+12, -3.8749058615819390E+12, -1.5630338683778196E+12, -1.5965774278321036E+11, -3.6841035003733935E+09, -6.3495763451755773E+06};
-    CUFINUFFT_FLT c4[] = {7.0146619045520434E+06, 2.1782897863065772E+09, 5.8897780310148148E+10, 3.1953009601770453E+11, 4.0651527030852091E+08, -1.6379148273275527E+12, -1.1568753136999574E+11, 2.7451653250461855E+12, -1.1568753137002715E+11, -1.6379148273276675E+12, 4.0651527030276263E+08, 3.1953009601770386E+11, 5.8897780310148087E+10, 2.1782897863065767E+09, 7.0146619045520416E+06};
-    CUFINUFFT_FLT c5[] = {5.5580012413990181E+06, 9.2345162185944223E+08, 1.4522950934020067E+10, 2.7025952371212223E+10, -1.2304576967641710E+11, -1.0116752717201025E+11, 3.8517418245457495E+11, 1.1720185410178396E-01, -3.8517418245448737E+11, 1.0116752717220248E+11, 1.2304576967643900E+11, -2.7025952371215157E+10, -1.4522950934020073E+10, -9.2345162185944128E+08, -5.5580012413990190E+06};
-    CUFINUFFT_FLT c6[] = {3.2693972344231787E+06, 2.8610260147425157E+08, 2.2348528403750129E+09, -3.4574515574239435E+09, -1.7480626463586948E+10, 3.1608597465528339E+10, 1.9879262560041798E+10, -6.6148013553832657E+10, 1.9879262560029728E+10, 3.1608597465497307E+10, -1.7480626463581020E+10, -3.4574515574192748E+09, 2.2348528403750839E+09, 2.8610260147425318E+08, 3.2693972344231806E+06};
-    CUFINUFFT_FLT c7[] = {1.4553539959296260E+06, 6.4136842048383795E+07, 1.3622336582061595E+08, -1.2131510424646864E+09, 6.4322366984170294E+08, 4.5078753872136936E+09, -7.1689413747181644E+09, -1.1786171556070136E-02, 7.1689413746620741E+09, -4.5078753875125484E+09, -6.4322366985783029E+08, 1.2131510424602287E+09, -1.3622336582069945E+08, -6.4136842048384361E+07, -1.4553539959296270E+06};
-    CUFINUFFT_FLT c8[] = {4.9358776531681529E+05, 9.7772970960589685E+06, -2.3511574237970300E+07, -1.0142613816602133E+08, 3.9421144218642426E+08, -2.8449115593954617E+08, -5.7549243245203042E+08, 1.1608781631399941E+09, -5.7549243247572994E+08, -2.8449115597919518E+08, 3.9421144214433813E+08, -1.0142613816466759E+08, -2.3511574237996321E+07, 9.7772970960581861E+06, 4.9358776531681448E+05};
-    CUFINUFFT_FLT c9[] = {1.2660319987326673E+05, 7.7519511328176421E+05, -6.5244610661542173E+06, 9.0878257489026226E+06, 2.3116605620370809E+07, -8.7079594480778053E+07, 9.5542733720576629E+07, 4.2723164545317951E-02, -9.5542733670714036E+07, 8.7079594586736053E+07, -2.3116605561938088E+07, -9.0878257517268714E+06, 6.5244610661359569E+06, -7.7519511328043276E+05, -1.2660319987326747E+05};
-    CUFINUFFT_FLT c10[] = {2.3793325531458449E+04, -4.2305332803592217E+04, -5.2884156986641441E+05, 2.5307340140247596E+06, -4.0404175229102052E+06, -1.7519991511035681E+05, 1.0146438775036881E+07, -1.5828545434039038E+07, 1.0146438771144925E+07, -1.7520004460626876E+05, -4.0404175749208611E+06, 2.5307340154400147E+06, -5.2884156982771575E+05, -4.2305332803462676E+04, 2.3793325531458788E+04};
-    CUFINUFFT_FLT c11[] = {2.9741655196842516E+03, -2.0687056404176896E+04, 3.3295507782231041E+04, 1.0661145714339131E+05, -5.6644238113375264E+05, 1.0874811579280477E+06, -9.6561272951275646E+05, -5.1287199081408294E-03, 9.6561272024221742E+05, -1.0874812519522079E+06, 5.6644242684715183E+05, -1.0661145918131116E+05, -3.3295507839673090E+04, 2.0687056403552484E+04, -2.9741655196846054E+03};
-    CUFINUFFT_FLT c12[] = {1.5389176594851995E+02, -2.3864418514303975E+03, 1.0846266940782971E+04, -2.2940053288728755E+04, 1.4780109856545603E+04, 4.2663625334078126E+04, -1.3047651001642903E+05, 1.7468402233671257E+05, -1.3047651921148783E+05, 4.2663543727874072E+04, 1.4780033422571960E+04, -2.2940053360564565E+04, 1.0846266911599001E+04, -2.3864418523423406E+03, 1.5389176594715920E+02};
-    CUFINUFFT_FLT c13[] = {-2.3857631312189291E+01, -1.9651605604649610E+01, 6.4183085202559698E+02, -2.8648428618202479E+03, 6.8249256924540387E+03, -9.7944454945500202E+03, 7.6177717113307281E+03, 1.2047808031005401E-02, -7.6177543637173221E+03, 9.7944303211006554E+03, -6.8249067869823548E+03, 2.8648410033462715E+03, -6.4183084900019139E+02, 1.9651606442715156E+01, 2.3857631312384541E+01};
-    CUFINUFFT_FLT c14[] = {-6.1348505741956316E+00, 2.7872916029950378E+01, -6.5819949282243059E+01, 5.1366943137229264E+01, 1.7214074364107390E+02, -6.9658313160417026E+02, 1.3192072946885612E+03, -1.6053709652649356E+03, 1.3192033489278531E+03, -6.9663899461741221E+02, 1.7211498258980890E+02, 5.1367587332701412E+01, -6.5819942079787495E+01, 2.7872915852722411E+01, -6.1348505745937754E+00};
-    CUFINUFFT_FLT c15[] = {-4.9671584494050897E-01, 3.0617548962871655E+00, -1.1650680501534040E+01, 3.0081518778147480E+01, -5.4027643304315461E+01, 6.6072752684824721E+01, -4.7155420133398515E+01, -5.6540863480770403E-03, 4.7158681490594240E+01, -6.6050534688928863E+01, 5.4059169757207428E+01, -3.0081909461561551E+01, 1.1650669885136919E+01, -3.0617550621683702E+00, 4.9671584460032286E-01};
-    CUFINUFFT_FLT c16[] = {4.3460787769280373E-03, -1.3199805974685097E-02, -1.9413550415167488E-01, 1.1330353009743728E+00, -3.4412627904689330E+00, 7.1628360506506050E+00, -1.1104833360853762E+01, 1.2402582581952625E+01, -1.1114919494696498E+01, 7.0930736249049993E+00, -3.4864402649728556E+00, 1.1323392526753271E+00, -1.9415335680557039E-01, -1.3200242030886846E-02, 4.3460779753541788E-03};
+    constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10};
+    constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09};
+    constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09};
+    constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, -1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09};
+    constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, -1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09};
+    constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09};
+    constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09};
+    constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09};
+    constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, -2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10};
+    constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10};
+    constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11};
+    constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12};
+    constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13};
+    constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 2.0428592180708626E-14, 2.4800914617770965E-14};
+    constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15};
+    constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16};
+    constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18};
     for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i]))))))))))))))));
   } else if (w==16) {
-    CUFINUFFT_FLT c0[] = {3.6434551345571090E+05, 2.0744705928579485E+09, 4.0355760945670044E+11, 1.6364575388763037E+13, 2.3514830376056556E+14, 1.5192201717462535E+15, 4.9956173084674140E+15, 8.9287666945127430E+15, 8.9287666945127430E+15, 4.9956173084674140E+15, 1.5192201717462535E+15, 2.3514830376056556E+14, 1.6364575388763041E+13, 4.0355760945670050E+11, 2.0744705928579490E+09, 3.6434551345570857E+05};
-    CUFINUFFT_FLT c1[] = {2.2576246485480363E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131055E+13, 2.6313738449330159E+14, 1.1495095100701462E+15, 2.1932582707747572E+15, 1.2860244365132600E+15, -1.2860244365132588E+15, -2.1932582707747572E+15, -1.1495095100701462E+15, -2.6313738449330169E+14, -2.5606844387131066E+13, -8.7873753526056323E+11, -6.6499571180086451E+09, -2.2576246485480368E+06};
-    CUFINUFFT_FLT c2[] = {6.3730995546265068E+06, 9.9060026035198040E+09, 8.8097248605448987E+11, 1.7953384130753676E+13, 1.2398425545001648E+14, 3.0749346493041212E+14, 1.0259777520247089E+14, -5.5291976457534288E+14, -5.5291976457534375E+14, 1.0259777520247070E+14, 3.0749346493041225E+14, 1.2398425545001656E+14, 1.7953384130753684E+13, 8.8097248605449011E+11, 9.9060026035198078E+09, 6.3730995546265068E+06};
-    CUFINUFFT_FLT c3[] = {1.0896915393078225E+07, 9.0890343524593887E+09, 5.3565169504010028E+11, 7.3004206720038701E+12, 2.9692333044160082E+13, 1.6051737468109645E+13, -9.1273329108089531E+13, -8.5999306918502797E+13, 8.5999306918501641E+13, 9.1273329108090062E+13, -1.6051737468109594E+13, -2.9692333044160074E+13, -7.3004206720038711E+12, -5.3565169504010034E+11, -9.0890343524593887E+09, -1.0896915393078221E+07};
-    CUFINUFFT_FLT c4[] = {1.2655725616100587E+07, 5.7342804054544201E+09, 2.1822836608899588E+11, 1.8300700858999731E+12, 2.7770431049858564E+12, -8.5034969223847109E+12, -1.2846668467422201E+13, 1.6519076896574611E+13, 1.6519076896573730E+13, -1.2846668467421688E+13, -8.5034969223849521E+12, 2.7770431049858491E+12, 1.8300700858999692E+12, 2.1822836608899588E+11, 5.7342804054544220E+09, 1.2655725616100591E+07};
-    CUFINUFFT_FLT c5[] = {1.0609303958036324E+07, 2.6255609052371726E+09, 6.1673589426039383E+10, 2.6044432099084976E+11, -3.5431628074578320E+11, -1.6077602129636006E+12, 1.5534405614729011E+12, 2.8019935380861670E+12, -2.8019935380844810E+12, -1.5534405614727644E+12, 1.6077602129636335E+12, 3.5431628074576636E+11, -2.6044432099085037E+11, -6.1673589426039368E+10, -2.6255609052371726E+09, -1.0609303958036324E+07};
-    CUFINUFFT_FLT c6[] = {6.6544809363384563E+06, 8.9490403680928385E+08, 1.1882638725190760E+10, 8.1552898137784090E+09, -1.2575562817891687E+11, 2.7074695075842178E+10, 3.9453789461922034E+11, -3.1679644857435541E+11, -3.1679644857440692E+11, 3.9453789461951154E+11, 2.7074695076007500E+10, -1.2575562817885344E+11, 8.1552898137852116E+09, 1.1882638725191153E+10, 8.9490403680928493E+08, 6.6544809363384582E+06};
-    CUFINUFFT_FLT c7[] = {3.1906872142824987E+06, 2.2785946180651781E+08, 1.3744578972809656E+09, -4.3997172592913818E+09, -9.2011130754125404E+09, 3.4690551711826530E+10, -9.4227043395316906E+09, -5.9308465069991577E+10, 5.9308465068943581E+10, 9.4227043392705956E+09, -3.4690551712022408E+10, 9.2011130753675175E+09, 4.3997172592866106E+09, -1.3744578972812984E+09, -2.2785946180652174E+08, -3.1906872142824973E+06};
-    CUFINUFFT_FLT c8[] = {1.1821527096621725E+06, 4.2281234059839047E+07, 2.8723226058821958E+07, -8.3553955857311106E+08, 1.2447304829054153E+09, 2.1955280944846683E+09, -7.0514195725593920E+09, 4.3745141235010500E+09, 4.3745141236655197E+09, -7.0514195727234411E+09, 2.1955280942826533E+09, 1.2447304829048812E+09, -8.3553955857841730E+08, 2.8723226058853466E+07, 4.2281234059838966E+07, 1.1821527096621748E+06};
-    CUFINUFFT_FLT c9[] = {3.3854610744280228E+05, 5.2176984975098642E+06, -2.0677283564981934E+07, -3.5831818966960624E+07, 2.6599346104854527E+08, -3.7992777983589816E+08, -1.3426914439904341E+08, 9.1752051209279442E+08, -9.1752051188087845E+08, 1.3426914452369988E+08, 3.7992777987329507E+08, -2.6599346107659298E+08, 3.5831818968129277E+07, 2.0677283565073237E+07, -5.2176984975084374E+06, -3.3854610744280077E+05};
-    CUFINUFFT_FLT c10[] = {7.3893334077309293E+04, 2.6983804209740972E+05, -3.6415998560880083E+06, 8.4025485863333493E+06, 4.9278860779347531E+06, -5.1437033824108891E+07, 8.7603898602732122E+07, -4.6199497846299231E+07, -4.6199498219926819E+07, 8.7603898832003579E+07, -5.1437033801464774E+07, 4.9278861005788362E+06, 8.4025485870409794E+06, -3.6415998559663831E+06, 2.6983804209585470E+05, 7.3893334077307591E+04};
-    CUFINUFFT_FLT c11[] = {1.1778892113374410E+04, -4.0077190109195144E+04, -1.8372552183899941E+05, 1.3262878359201169E+06, -2.9738540144900386E+06, 1.9493508843214174E+06, 4.1881949043266159E+06, -1.1066749441324197E+07, 1.1066749225224417E+07, -4.1881949989500660E+06, -1.9493509811827433E+06, 2.9738539876374160E+06, -1.3262878392766861E+06, 1.8372552166916840E+05, 4.0077190106541901E+04, -1.1778892113374635E+04};
-    CUFINUFFT_FLT c12[] = {1.2019749667905517E+03, -1.0378455845905968E+04, 2.6333352626226591E+04, 1.7117060824677988E+04, -2.5133287788479996E+05, 6.4713912423136400E+05, -8.1634971996757365E+05, 3.8623850687193515E+05, 3.8623887467457692E+05, -8.1634999581952032E+05, 6.4713888515965885E+05, -2.5133289397614688E+05, 1.7117056658162492E+04, 2.6333352590306949E+04, -1.0378455846607170E+04, 1.2019749667886601E+03};
-    CUFINUFFT_FLT c13[] = {3.1189837633271310E+01, -8.9083493666530228E+02, 4.9454294721013366E+03, -1.3124691362129612E+04, 1.5834782149156119E+04, 6.9607783053915546E+03, -5.9789949050326162E+04, 1.0841720290002371E+05, -1.0841726183381994E+05, 5.9790023686287932E+04, -6.9607416211385053E+03, -1.5834800728954084E+04, 1.3124692508510609E+04, -4.9454294244132070E+03, 8.9083493795553227E+02, -3.1189837630675466E+01};
-    CUFINUFFT_FLT c14[] = {-1.2975319073318561E+01, 1.8283698900397550E+01, 1.7684013462935113E+02, -1.1059907069976271E+03, 3.1998196269059799E+03, -5.5988285845467362E+03, 5.9248624962359208E+03, -2.5987075415506133E+03, -2.5989297031998472E+03, 5.9249309327755627E+03, -5.5988287659129119E+03, 3.1998292347735460E+03, -1.1059914993060199E+03, 1.7684017599586443E+02, 1.8283697951655380E+01, -1.2975319075406015E+01};
-    CUFINUFFT_FLT c15[] = {-2.3155118737567935E+00, 1.1938503501764195E+01, -3.4150613932459848E+01, 4.8896713096147266E+01, 1.5844216816345641E+01, -2.4277080939345015E+02, 6.0146058115394737E+02, -8.8748160721868635E+02, 8.8732832343048744E+02, -6.0146927810646923E+02, 2.4275722040513463E+02, -1.5849652411671842E+01, -4.8897528435446198E+01, 3.4150596946224454E+01, -1.1938504032584051E+01, 2.3155118728820292E+00};
-    CUFINUFFT_FLT c16[] = {-1.5401723736175238E-01, 9.8067757197686212E-01, -4.1901188293318530E+00, 1.2150691895619683E+01, -2.4764820628534302E+01, 3.6081462800085532E+01, -3.4534922277532473E+01, 1.2910251318703700E+01, 1.3098525817101535E+01, -3.4588714991360455E+01, 3.5973877372429698E+01, -2.4775747273530602E+01, 1.2149010873312557E+01, -4.1901467369287460E+00, 9.8067700766883559E-01, -1.5401723876450651E-01};
-    CUFINUFFT_FLT c17[] = {1.1808835457017667E-02, -2.5443945538745794E-02, -1.3157119144786456E-04, 2.5877310634925382E-01, -1.0920774586473376E+00, 2.6473618304294715E+00, -4.4448325935254926E+00, 6.8292491990998831E+00, -6.8300632710034588E+00, 4.4643703192113184E+00, -2.6384070394901351E+00, 1.0890246890089277E+00, -2.5849326913239973E-01, 1.4031610447463365E-04, 2.5444280926035151E-02, -1.1808834729180664E-02};
+    constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11};
+    constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, -1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10};
+    constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10};
+    constexpr FLT c3[] = {1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09};
+    constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09};
+    constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09};
+    constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10};
+    constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, -1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10};
+    constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10};
+    constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11};
+    constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12};
+    constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, -1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, -1.2276300481460517E-12};
+    constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, -8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13};
+    constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15};
+    constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15};
+    constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 2.4132931361645452E-16};
+    constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, -2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17};
+    constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18};
     for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i])))))))))))))))));
   } else
     printf("width not implemented!\n");
diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc
new file mode 100644
index 000000000..e2fa229b7
--- /dev/null
+++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc
@@ -0,0 +1,171 @@
+// Code generated by gen_all_horner_C_code.m in finufft/devel
+// Authors: Alex Barnett & Ludvig af Klinteberg.
+// (C) The Simons Foundation, Inc.
+  if (w==2) {
+    constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01};
+    constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01};
+    constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02};
+    constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01};
+    for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i])));
+  } else if (w==3) {
+    constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01};
+    constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01};
+    constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01};
+    constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02};
+    constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02};
+    for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i]))));
+  } else if (w==4) {
+    constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02};
+    constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01};
+    constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01};
+    constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02};
+    constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02};
+    constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03};
+    for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i])))));
+  } else if (w==5) {
+    constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 2.5811126752233318E-02};
+    constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02};
+    constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02};
+    constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02};
+    constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05};
+    constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03};
+    for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i])))));
+  } else if (w==6) {
+    constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03};
+    constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02};
+    constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02};
+    constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02};
+    constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03};
+    constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04};
+    constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, -1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04};
+    for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i]))))));
+  } else if (w==7) {
+    constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03};
+    constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03};
+    constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03};
+    constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03};
+    constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03};
+    constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04};
+    constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05};
+    constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05};
+    for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i])))))));
+  } else if (w==8) {
+    constexpr FLT c0[] = {5.2827275612461462E-04, 4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04};
+    constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03};
+    constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03};
+    constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03};
+    constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04};
+    constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04};
+    constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07};
+    constexpr FLT c7[] = {-5.1256159860509675E-06, 5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06};
+    for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i])))))));
+  } else if (w==9) {
+    constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04};
+    constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04};
+    constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04};
+    constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04};
+    constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, -2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04};
+    constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05};
+    constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06};
+    constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07};
+    constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, -1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07};
+    for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i]))))))));
+  } else if (w==10) {
+    constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05};
+    constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04};
+    constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04};
+    constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04};
+    constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05};
+    constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 2.9881406711975192E-04, -2.8169545035121488E-05, -3.1258610702677743E-05};
+    constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, -5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06};
+    constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07};
+    constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07};
+    constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08};
+    for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i])))))))));
+  } else if (w==11) {
+    constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06};
+    constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05};
+    constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, -2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 5.4612928019024885E-05};
+    constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05};
+    constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05};
+    constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05};
+    constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06};
+    constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07};
+    constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09};
+    constexpr FLT c9[] = {-9.8984999695252047E-09, 1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09};
+    for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i])))))))));
+  } else if (w==12) {
+    constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06};
+    constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06};
+    constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05};
+    constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05};
+    constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, -9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 9.7900673700247601E-06};
+    constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06};
+    constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06};
+    constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07};
+    constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08};
+    constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09};
+    constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, -6.3791928984134277E-10};
+    for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i]))))))))));
+  } else if (w==13) {
+    constexpr FLT c0[] = {4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07};
+    constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06};
+    constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06};
+    constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06};
+    constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06};
+    constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, -1.3873038503072801E-06};
+    constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07};
+    constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08};
+    constexpr FLT c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08};
+    constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10};
+    constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10};
+    constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, -2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11};
+    for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i])))))))))));
+  } else if (w==14) {
+    constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07};
+    constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, -3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07};
+    constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07};
+    constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06};
+    constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, -9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07};
+    constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07};
+    constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07};
+    constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, -6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08};
+    constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09};
+    constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, -3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10};
+    constexpr FLT c10[] = {-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11};
+    constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11};
+    constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, -4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12};
+    for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i]))))))))))));
+  } else if (w==15) {
+    constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08};
+    constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, -1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07};
+    constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07};
+    constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07};
+    constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07};
+    constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07};
+    constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08};
+    constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, -9.2554419735731158E-08, -1.4373673262756913E-08};
+    constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09};
+    constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10};
+    constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11};
+    constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, -3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 8.3268199995541140E-12, 2.9809392327699276E-12};
+    constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, -6.7275763610714771E-13};
+    for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i]))))))))))));
+  } else if (w==16) {
+    constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 5.2012152104083968E-09};
+    constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08};
+    constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08};
+    constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08};
+    constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08};
+    constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, -1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08};
+    constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08};
+    constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09};
+    constexpr FLT c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09};
+    constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10};
+    constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, -1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11};
+    constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14};
+    constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13};
+    constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14};
+    for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i])))))))))))));
+  } else
+    printf("width not implemented!\n");
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 826319516..3a9fd6877 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -39,39 +39,6 @@ template<typename T>
 int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
                       cufinufft_plan_t<T> *d_plan);
 
-static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) {
-  switch (dim) {
-  case 1: {
-    opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex;
-    opts->gpu_binsizey = 1;
-    opts->gpu_binsizez = 1;
-  } break;
-  case 2: {
-    opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex;
-    opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey;
-    opts->gpu_binsizez = 1;
-  } break;
-  case 3: {
-    switch (opts->gpu_method) {
-    case 1:
-    case 2: {
-      opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex;
-      opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey;
-      opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez;
-    } break;
-    case 4: {
-      opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex;
-      opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey;
-      opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez;
-      opts->gpu_binsizex  = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex;
-      opts->gpu_binsizey  = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey;
-      opts->gpu_binsizez  = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez;
-    } break;
-    }
-  } break;
-  }
-}
-
 template<typename T>
 int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol,
                             cufinufft_plan_t<T> **d_plan_ptr, cufinufft_opts *opts) {
@@ -93,6 +60,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
           Variables and arrays inside the plan struct are set and allocated.
 
       Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21.
+      Marco Barbone 07/26/24. Default to SM when the device has enough shared memory.
   */
   int ier;
   cuDoubleComplex *d_a = nullptr; // fseries temp data
@@ -109,17 +77,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   }
 
   // Mult-GPU support: set the CUDA Device ID:
-  const int device_id = opts == NULL ? 0 : opts->gpu_device_id;
+  const int device_id = opts == nullptr ? 0 : opts->gpu_device_id;
   cufinufft::utils::WithCudaDevice device_swapper(device_id);
 
   /* allocate the plan structure, assign address to user pointer. */
-  cufinufft_plan_t<T> *d_plan = new cufinufft_plan_t<T>;
-  *d_plan_ptr                 = d_plan;
+  auto *d_plan = new cufinufft_plan_t<T>;
+  *d_plan_ptr  = d_plan;
   // Zero out your struct, (sets all pointers to NULL)
   memset(d_plan, 0, sizeof(*d_plan));
-
   /* If a user has not supplied their own options, assign defaults for them. */
-  if (opts == NULL) {     // use default opts
+  if (opts == nullptr) {  // use default opts
     cufinufft_default_opts(&(d_plan->opts));
   } else {                // or read from what's passed in
     d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
@@ -138,26 +105,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   }
 
   auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream;
-
-  /* Automatically set GPU method. */
-  if (d_plan->opts.gpu_method == 0) {
-    /* For type 1, we default to method 2 (SM) since this is generally faster.
-     * However, in the special case of _double precision_ in _three dimensions_
-     * with more than _three digits of precision_, there is note enough shared
-     * memory for this to work. As a result, we will default to method 1 (GM) in
-     * this special case.
-     *
-     * For type 2, we always default to method 1 (GM). */
-    if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3))
-      d_plan->opts.gpu_method = 2;
-    else if (type == 1 && tol < 1e-3)
-      d_plan->opts.gpu_method = 1;
-    else if (type == 2)
-      d_plan->opts.gpu_method = 1;
-  }
-
-  /* Setup Spreader */
   using namespace cufinufft::common;
+  /* Setup Spreader */
+
   // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
   if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
     delete *d_plan_ptr;
@@ -170,7 +120,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   d_plan->mt  = nmodes[1];
   d_plan->mu  = nmodes[2];
 
-  cufinufft_setup_binsize(type, dim, &d_plan->opts);
+  cufinufft_setup_binsize<T>(type, d_plan->spopts.nspread, dim, &d_plan->opts);
+  RETURN_IF_CUDA_ERROR
+
   CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1;
   set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1,
                 d_plan->opts.gpu_obinsizex);
@@ -180,6 +132,37 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   if (dim > 2)
     set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
                   d_plan->opts.gpu_obinsizez);
+
+  /* Automatically set the GPU method when the caller left it at 0 (auto).
+   *
+   * Type 2 always gets method 1 (GM). Every other type prefers method 2 (SM),
+   * which is generally faster, but only if the shared memory the spreader
+   * needs for the current bin sizes fits within the device's opt-in per-block
+   * limit; otherwise it falls back to method 1 (GM).
+   */
+  if (d_plan->opts.gpu_method == 0) {
+    if (type == 2) {
+      d_plan->opts.gpu_method = 1;
+    } else {
+      // opt-in per-block shared memory limit reported for device_id
+      int shared_mem_per_block{};
+      cudaDeviceGetAttribute(&shared_mem_per_block,
+                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
+      RETURN_IF_CUDA_ERROR
+
+      // shared memory needed for this dimension, spread width and bin shape
+      const auto shared_mem_required = shared_memory_required<T>(
+          dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+          d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
+
+      if (shared_mem_required <= shared_mem_per_block) {
+        d_plan->opts.gpu_method = 2; // SM fits
+      } else {
+        d_plan->opts.gpu_method = 1; // fall back to GM
+      }
+    }
+  }
+
   int fftsign = (iflag >= 0) ? 1 : -1;
 
   d_plan->nf1      = nf1;
diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
index da1c59930..2963d381d 100644
--- a/include/cufinufft/spreadinterp.h
+++ b/include/cufinufft/spreadinterp.h
@@ -2,16 +2,54 @@
 #define __CUSPREADINTERP_H__
 
 #include <cmath>
+#include <cuda.h>
 #include <cufinufft/types.h>
 #include <finufft_spread_opts.h>
 
 namespace cufinufft {
 namespace spreadinterp {
 
-template<typename T> static __forceinline__ __device__ T fold_rescale(T x, int N) {
-  static constexpr const auto x2pi = T(0.159154943091895345554011992339482617);
-  const T result                   = x * x2pi + T(0.5);
-  return (result - floor(result)) * T(N);
+// a * b + c as a single fused multiply-add (round to nearest even), dispatched
+// to the CUDA intrinsic for T; only float and double are accepted.
+template<typename T>
+static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) {
+  static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                "Only float and double are supported.");
+  if constexpr (std::is_same_v<T, float>) {
+    return __fmaf_rn(a, b, c);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return __fma_rn(a, b, c);
+  }
+  return T{0}; // not reached for float/double
+}
+
+template<typename T>
+constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
+  constexpr auto x2pi = T(0.159154943091895345554011992339482617);
+  constexpr auto half = T(0.5);
+#if defined(__CUDA_ARCH__)
+  if constexpr (std::is_same_v<T, float>) {
+    // x * x2pi + 1/2 in one fused multiply-add, round to nearest even
+    auto result = __fmaf_rn(x, x2pi, half);
+    // keep the fractional part: subtract the floor, round down
+    result = __fsub_rd(result, floorf(result));
+    // scale by N: multiply, round down
+    return __fmul_rd(result, static_cast<T>(N));
+  } else if constexpr (std::is_same_v<T, double>) {
+    // x * x2pi + 1/2 in one fused multiply-add, round to nearest even
+    auto result = __fma_rn(x, x2pi, half);
+    // keep the fractional part: subtract the floor, round down
+    result = __dsub_rd(result, floor(result));
+    // scale by N: multiply, round down
+    return __dmul_rd(result, static_cast<T>(N));
+  } else {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                  "Only float and double are supported.");
+  }
+#else
+  const auto result = std::fma(x, x2pi, half);
+  return (result - std::floor(result)) * static_cast<T>(N);
+#endif
 }
 
 template<typename T>
@@ -22,11 +60,11 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
    approximation to prolate spheroidal wavefunction (PSWF) of order 0.
    This is the "reference implementation", used by eg common/onedim_* 2/17/17 */
 {
-  if (abs(x) >= opts.ES_halfwidth)
+  if (abs(x) >= T(opts.ES_halfwidth))
     // if spreading/FT careful, shouldn't need this if, but causes no speed hit
     return 0.0;
   else
-    return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x));
+    return exp((T)opts.ES_beta * (sqrt((T)1.0 - (T)opts.ES_c * x * x) - (T)1.0));
 }
 
 template<typename T>
@@ -41,7 +79,9 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int
    This is the "reference implementation", used by eg common/onedim_*
     2/17/17 */
 {
-  return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0;
+  return abs(x) < ns / T(2.0)
+             ? exp((T)es_beta * (sqrt((T)1.0 - (T)es_c * x * x) - (T)1.0))
+             : 0.0;
 }
 
 template<typename T>
@@ -52,13 +92,17 @@ static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, cons
    This is the current evaluation method, since it's faster (except i7 w=16).
    Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
 {
-  T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1]
+  const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1]
+  // i.e. z = 2 * x + (w - 1), evaluated as a single fused multiply-add
   // insert the auto-generated code which expects z, w args, writes to ker...
   if (upsampfac == 2.0) { // floating point equality is fine here
-    using FLT           = T;
-    using CUFINUFFT_FLT = T;
+    using FLT = T;
 #include "cufinufft/contrib/ker_horner_allw_loop.inc"
   }
+  if (upsampfac == 1.25) { // floating point equality is fine here
+    using FLT = T;
+#include "cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc"
+  }
 }
 
 template<typename T>
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index 3455b99c0..4bfaa801d 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -12,6 +12,9 @@
 
 #include <sys/time.h>
 
+#include <cuda.h>
+#include <type_traits>
+
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
 #else
 __inline__ __device__ double atomicAdd(double *address, double val) {
@@ -68,6 +71,81 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
   }
   return sqrt(nrm);
 }
+
+#ifdef __CUDA_ARCH__
+// interval(ns, x): int2{ceil(x - ns / 2), floor(x + ns / 2)}, float and double.
+__forceinline__ __device__ auto interval(const int ns, const float x) {
+  // lower bound: fused multiply-add and int conversion both round up
+  const auto first = __float2int_ru(__fmaf_ru(ns, -.5f, x));
+  // upper bound: fused multiply-add and int conversion both round down
+  const auto last = __float2int_rd(__fmaf_rd(ns, .5f, x));
+  return int2{first, last};
+}
+__forceinline__ __device__ auto interval(const int ns, const double x) {
+  const auto first = __double2int_ru(__fma_ru(ns, -.5, x));
+  const auto last  = __double2int_rd(__fma_rd(ns, .5, x));
+  return int2{first, last};
+}
+#endif
+
+// ALLOCA_SUPPORTED: 1 for nvcc >= 12, or nvcc 11.3+ targeting __CUDA_ARCH__ >= 600; else 0
+#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__)
+#if (__CUDACC_VER_MAJOR__ > 11) || \
+    (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600)
+
+#define ALLOCA_SUPPORTED 1
+// windows compatibility
+#if __has_include(<malloc.h>)
+#include <malloc.h>
+#endif
+#else
+#define ALLOCA_SUPPORTED 0
+#endif
+#else
+#define ALLOCA_SUPPORTED 0
+#endif
+
+#if defined(__CUDA_ARCH__)
+#if __CUDA_ARCH__ >= 900
+#define COMPUTE_CAPABILITY_90_OR_HIGHER 1
+#else
+#define COMPUTE_CAPABILITY_90_OR_HIGHER 0
+#endif
+#else
+#define COMPUTE_CAPABILITY_90_OR_HIGHER 0
+#endif
+
+/**
+ * Atomically accumulates the complex value res into *address, intended for
+ * shared memory. CUDA offers no direct atomic add on complex numbers there,
+ * so the real and imaginary parts are added by two separate scalar
+ * atomicAdd calls.
+ */
+
+template<typename T>
+static __forceinline__ __device__ void atomicAddComplexShared(
+    cuda_complex<T> *address, cuda_complex<T> res) {
+  T *const parts = reinterpret_cast<T *>(address);
+  atomicAdd(&parts[0], res.x); // real part
+  atomicAdd(&parts[1], res.y); // imaginary part
+}
+
+/**
+ * does a complex atomic add on a global memory address
+ * when cuda_complex<T> is float2 and the compute capability is 9.0 or higher
+ * a single vector atomicAdd is used, otherwise it falls back to per-part adds
+ */
+template<typename T>
+static __forceinline__ __device__ void atomicAddComplexGlobal(
+    cuda_complex<T> *address, cuda_complex<T> res) {
+  if constexpr (
+      std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
+    atomicAdd(address, res);
+  } else {
+    atomicAddComplexShared<T>(address, res);
+  }
+}
+
 } // namespace utils
 } // namespace cufinufft
 
diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt
index 5f1079fde..ec95760fb 100644
--- a/perftest/cuda/CMakeLists.txt
+++ b/perftest/cuda/CMakeLists.txt
@@ -1,4 +1,12 @@
 add_executable(cuperftest cuperftest.cu)
 target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
 target_link_libraries(cuperftest PUBLIC cufinufft)
-#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
\ No newline at end of file
+target_compile_features(cuperftest PRIVATE cxx_std_17)
+# Quote the architecture list: it may hold several ;-separated entries, which
+# would otherwise expand into extra arguments and misalign the property pairs.
+set_target_properties(
+  cuperftest
+  PROPERTIES LINKER_LANGUAGE CUDA
+             CUDA_ARCHITECTURES "${FINUFFT_CUDA_ARCHITECTURES}"
+             CUDA_STANDARD 17
+             CUDA_STANDARD_REQUIRED ON)
diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py
new file mode 100644
index 000000000..c22c2af9f
--- /dev/null
+++ b/perftest/cuda/bench.py
@@ -0,0 +1,200 @@
+import matplotlib.pyplot as plt
+import os
+import subprocess
+import pandas as pd
+import numpy as np
+import io
+cwd = os.getcwd()
+
+
+def run_command(command, args):
+    """Run `command` with the argument list `args`; return (stdout, stderr) as text."""
+    cmd = [command] + args
+    print("Running command:", ' '.join(cmd))
+    try:
+        result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    except subprocess.CalledProcessError as e:
+        print('stdout output:\n', e.stdout)
+        print('stderr output:\n', e.stderr)
+        print("Error executing command:", e)
+        # re-raise: callers unpack a 2-tuple, so falling through (None) would crash there
+        raise
+    return result.stdout, result.stderr
+
+
+def build_args(args):
+    """Flatten an options dict into an argv-style list.
+
+    Every key is immediately followed by its value, in the dict's iteration
+    order, e.g. {"--prec": "f", "--M": "1E8"} -> ["--prec", "f", "--M", "1E8"].
+    The result is a list suitable for subprocess, not a single joined string.
+    """
+    return [token for option in args.items() for token in option]
+
+
+# function
+
+# example command to run:
+# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6
+# example arguments
+args = {"--prec": "f",
+        "--n_runs": "5",
+        "--method": "0",
+        "--sort": "1",
+        "--N1": "16777216",
+        # "--N1": "256",
+        # "--N2": "256",
+        # "--N3": "256",
+        "--kerevalmethod": "1",
+        "--M": "1E8",
+        "--tol": "1E-6"}
+# iterate over tol from 1E-6 to 1E-1
+
+warmup = {"--prec": "f",
+        "--n_runs": "1",
+        "--method": "0",
+        "--N1": "256",
+        # "--N2": "256",
+        # "--N3": "256",
+        "--M": "256",
+        "--tol": "1E-1"}
+cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(warmup)
+print("Warmup")
+stdout, stderr = run_command("nsys", cmd)
+if stderr != '':
+    print(stderr)
+    exit(1)  # stderr output is treated as a failed warm-up: exit with a non-zero status
+print("Benchmarking")
+for precision in ['d']:
+    print(f"precision: {precision}")
+    for dim in range(1, 2):
+        if dim == 1:
+            args["--N1"] = "16777216"
+        if dim == 2:
+            args["--N1"] = "256"
+            args["--N2"] = "256"
+        if dim == 3:
+            args["--N1"] = "256"
+            args["--N2"] = "256"
+            args["--N3"] = "256"
+        args["--prec"] = precision
+        max_range = 16 if args["--prec"] == "d" else 7
+        if precision == 'd' and dim == 3:
+            max_range = 6
+        print(f"dimensions {dim}")
+        data = {
+            'method': [],
+            'throughput': [],
+            'tolerance': [],
+            # 'setpts': [],
+            'exec': [],
+        }
+        for i in range(1, max_range):
+            args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i)
+            print("Running with tol = 1E-" + str(i))
+            for method in ['2', '1']:
+                args["--method"] = method
+                if method == '0':
+                    data['method'].append('auto')
+                elif method == '1':
+                    data['method'].append('GM')
+                elif method == '2':
+                    data['method'].append('SM')
+                elif method == '4':
+                    data['method'].append('BLOCK')
+                print("Method " + data['method'][-1])
+                cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args)
+                stdout, stderr = run_command("nsys", cmd)
+                if stderr != '':
+                    print(stderr)
+                    exit(0)
+                # skip all lines starting with # in stdout
+                conf = [x for x in stdout.splitlines() if x.startswith("#")]
+                print('\n'.join(conf))
+                stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7]
+                if stdout[0].startswith("bin"):
+                    print(stdout[0])
+                    stdout = stdout[1:]
+
+                stdout = '\n'.join(stdout)
+                # convert stdout to a dataframe from csv string
+                dt = pd.read_csv(io.StringIO(stdout), sep=',')
+                setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value
+                exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value
+                # print(f'setpts pts/s: {setpts}')
+                # print(f'exec pts/s: {exec}')
+                cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep",
+                       "--format=csv", "--output", "cuperftest"]
+                stdout, _ = run_command("nsys", cmd)
+                # remove format from cmd
+                cmd = cmd[:-3]
+                # print(run_command("nsys", cmd))
+                # print(csv)
+                dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv")
+                # print(dt)
+                # sum the "Total Time" column of the ones that contain "fft" in name
+                # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")])
+                total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum()
+                # print(f'total_fft: {total_fft}')
+                # drop all the rows with spread not in "Name"
+                dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")]
+                # print(dt)
+                # exit(0)
+                # sort dt by column "Time (%)"
+                total_spread = dt['Duration (ns)'].sum() - total_fft
+                # print(f'total_spread: {total_spread}')
+                if total_fft > total_spread:
+                    print("Warning: total_fft > total_spread")
+                    # exit(0)
+                # pt/s
+                throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread
+                print(f'throughput: {throughput}')
+                data['throughput'].append(throughput)
+                data['tolerance'].append(args['--tol'])
+                # data['setpts'].append(setpts)
+                data['exec'].append(exec)
+        df = pd.DataFrame(data)
+        # Pivot the DataFrame
+        pivot_df = df.pivot(index='tolerance', columns='method')
+        # print(pivot_df)
+        # scale the throughput SM by GM
+        # pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM']
+        # pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM']
+        # scale setpts SM by GM
+        # pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM']
+        # pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM']
+        # remove the GM column
+        # pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True)
+        pivot_df.drop(('exec', 'GM'), axis=1, inplace=True)
+        pivot_df.drop(('exec', 'SM'), axis=1, inplace=True)
+        print(pivot_df)
+exit(0)
+# Plot
+pivot_df.plot(kind='bar', figsize=(10, 7))
+# Find the minimum throughput value
+min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min())
+max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max())
+print(min_val, max_val)
+plt.ylim(min_val * .90, max_val * 1.1)
+# plt.ylim(.8, 1.2)
+
+# Calculate the smallest power of 10
+# min_pow_10 = 10 ** np.floor(np.log10(min_throughput))
+
+# Adjust the plot's y-axis limits
+# plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009)  # Adding 10% for upper margin
+
+# plot a horizontal line at 1 with label "GM"
+# plt.axhline(y=1, color='k', linestyle='--', label='GM')
+plt.xlabel('Tolerance')
+plt.ylabel('Throughput')
+plt.title('Throughput by Tolerance and Method')
+plt.legend(title='Method')
+plt.tight_layout()
+plt.show()
+plt.xlabel("Tolerance")
+plt.ylabel("Points/s")
+plt.savefig("bench.png")
+plt.savefig("bench.svg")
+plt.savefig("bench.pdf")
+plt.show()
diff --git a/perftest/cuda/bench.sh b/perftest/cuda/bench.sh
new file mode 100644
index 000000000..9832e1088
--- /dev/null
+++ b/perftest/cuda/bench.sh
@@ -0,0 +1,13 @@
+./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4
+./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4
+./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4
+./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4
+./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4
+#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10
+#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10
diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu
index 26eaff491..a17b6f044 100644
--- a/src/cuda/1d/cufinufft1d.cu
+++ b/src/cuda/1d/cufinufft1d.cu
@@ -1,15 +1,11 @@
 #include <cmath>
 #include <complex>
 #include <cufinufft/contrib/helper_cuda.h>
-#include <iomanip>
-#include <iostream>
-#include <type_traits>
 
-#include <assert.h>
+#include <cassert>
 #include <cufft.h>
 
 #include <cufinufft/cudeconvolve.h>
-#include <cufinufft/memtransfer.h>
 #include <cufinufft/spreadinterp.h>
 #include <cufinufft/types.h>
 
diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu
index cd3637c8b..2bf69f6a2 100644
--- a/src/cuda/1d/interp1d_wrapper.cu
+++ b/src/cuda/1d/interp1d_wrapper.cu
@@ -1,14 +1,10 @@
 #include <cuComplex.h>
 #include <cufinufft/contrib/helper_cuda.h>
-#include <iomanip>
 #include <iostream>
 
-#include <cufinufft/memtransfer.h>
 #include <cufinufft/spreadinterp.h>
 #include <cufinufft/types.h>
 
-using namespace cufinufft::memtransfer;
-
 #include "spreadinterp1d.cuh"
 
 namespace cufinufft {
diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu
index 26fd5024c..1b2afde7d 100644
--- a/src/cuda/1d/spread1d_wrapper.cu
+++ b/src/cuda/1d/spread1d_wrapper.cu
@@ -1,12 +1,12 @@
 #include <cassert>
 #include <cufinufft/contrib/helper_cuda.h>
-#include <iomanip>
 #include <iostream>
 
 #include <cuComplex.h>
 #include <thrust/device_ptr.h>
 #include <thrust/scan.h>
 
+#include <cufinufft/common.h>
 #include <cufinufft/memtransfer.h>
 #include <cufinufft/precision_independent.h>
 #include <cufinufft/spreadinterp.h>
@@ -15,6 +15,7 @@ using namespace cufinufft::common;
 using namespace cufinufft::memtransfer;
 
 #include "spreadinterp1d.cuh"
+#include <thrust/sort.h>
 
 namespace cufinufft {
 namespace spreadinterp {
@@ -50,10 +51,30 @@ int cuspread1d(cufinufft_plan_t<T> *d_plan, int blksize)
   return ier;
 }
 
+// Comparator ordering point indices by fold_rescale(kx[i], 1).
+template<typename T> struct cmp {
+  cmp(const T *kx) : kx(kx) {}
+  // true when point a folds to a smaller coordinate than point b
+  __host__ __device__ bool operator()(const int a, const int b) const {
+    return fold_rescale(kx[a], 1) < fold_rescale(kx[b], 1);
+  }
+
+private:
+  const T *kx;
+};
+
 template<typename T>
 int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan) {
   auto &stream = d_plan->stream;
-
+  if (d_plan->opts.gpu_sort && d_plan->opts.gpu_method == 1) {
+    int *d_idxnupts = d_plan->idxnupts;
+    thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M);
+    RETURN_IF_CUDA_ERROR
+    thrust::sort(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M,
+                 cmp{d_plan->kx});
+    RETURN_IF_CUDA_ERROR
+    return 0;
+  }
   if (d_plan->opts.gpu_sort) {
     int bin_size_x = d_plan->opts.gpu_binsizex;
     if (bin_size_x < 0) {
@@ -83,17 +104,16 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan) {
     thrust::device_ptr<int> d_ptr(d_binsize);
     thrust::device_ptr<int> d_result(d_binstartpts);
     thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
+    RETURN_IF_CUDA_ERROR
 
     calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
         M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1);
     RETURN_IF_CUDA_ERROR
   } else {
     int *d_idxnupts = d_plan->idxnupts;
-    trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M,
-                                                                             d_idxnupts);
+    thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M);
     RETURN_IF_CUDA_ERROR
   }
-
   return 0;
 }
 
@@ -133,7 +153,6 @@ int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blks
       RETURN_IF_CUDA_ERROR
     }
   }
-
   return 0;
 }
 
@@ -145,33 +164,29 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
     which only needs to be done once.
 */
 {
-  auto &stream = d_plan->stream;
-  int ier;
 
-  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
-  int bin_size_x     = d_plan->opts.gpu_binsizex;
+  const auto maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+  const auto bin_size_x     = d_plan->opts.gpu_binsizex;
   if (bin_size_x < 0) {
     std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = ("
               << bin_size_x << ")\n";
     return FINUFFT_ERR_BINSIZE_NOTVALID;
   }
 
-  int numbins = ceil((T)nf1 / bin_size_x);
-
-  T *d_kx = d_plan->kx;
-
-  int *d_binsize         = d_plan->binsize;
-  int *d_binstartpts     = d_plan->binstartpts;
-  int *d_sortidx         = d_plan->sortidx;
-  int *d_numsubprob      = d_plan->numsubprob;
-  int *d_subprobstartpts = d_plan->subprobstartpts;
-  int *d_idxnupts        = d_plan->idxnupts;
+  const auto numbins           = (nf1 + bin_size_x - 1) / bin_size_x;
+  const auto d_kx              = d_plan->kx;
+  const auto d_binsize         = d_plan->binsize;
+  const auto d_binstartpts     = d_plan->binstartpts;
+  const auto d_sortidx         = d_plan->sortidx;
+  const auto d_numsubprob      = d_plan->numsubprob;
+  const auto d_subprobstartpts = d_plan->subprobstartpts;
+  const auto d_idxnupts        = d_plan->idxnupts;
+  const auto stream            = d_plan->stream;
 
   int *d_subprob_to_bin = nullptr;
 
-  if ((ier =
-           checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream))))
-    return ier;
+  cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream);
+  RETURN_IF_CUDA_ERROR
   calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
       M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx);
   RETURN_IF_CUDA_ERROR
@@ -192,30 +207,31 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
   d_ptr    = thrust::device_pointer_cast(d_numsubprob);
   d_result = thrust::device_pointer_cast(d_subprobstartpts + 1);
   thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
+  RETURN_IF_CUDA_ERROR
 
-  if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream))))
-    return ier;
+  cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream);
+  RETURN_IF_CUDA_ERROR
 
-  int totalnumsubprob;
-  if ((ier =
-           checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n],
-                                           sizeof(int), cudaMemcpyDeviceToHost, stream))))
-    return ier;
+  int totalnumsubprob{};
+  cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int),
+                  cudaMemcpyDeviceToHost, stream);
   cudaStreamSynchronize(stream);
-  if ((ier = checkCudaErrors(
-           cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
-                             d_plan->supports_pools))))
-    return ier;
+  RETURN_IF_CUDA_ERROR
+
+  cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream,
+                    d_plan->supports_pools);
+  RETURN_IF_CUDA_ERROR
+
   map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>(
       d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins);
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
-    cudaFree(d_subprob_to_bin);
+    // the new map is not yet owned by the plan: release it before bailing out
+    cudaFreeWrapper(d_subprob_to_bin, stream, d_plan->supports_pools);
     return FINUFFT_ERR_CUDA_FAILURE;
   }
-
-  assert(d_subprob_to_bin != NULL);
+  assert(d_subprob_to_bin != nullptr);
   cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools);
   d_plan->subprob_to_bin  = d_subprob_to_bin;
   d_plan->totalnumsubprob = totalnumsubprob;
@@ -251,15 +267,18 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize)
 
   T sigma = d_plan->opts.upsampfac;
 
-  size_t sharedplanorysize =
-      (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-  if (sharedplanorysize > 49152) {
-    std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n";
-    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-  }
+  const auto sharedplanorysize =
+      shared_memory_required<T>(1, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+                                d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
 
   if (d_plan->opts.gpu_kerevalmeth) {
     for (int t = 0; t < blksize; t++) {
+      // configure shared memory for the <T, 1> kernel before its launch
+      if (cufinufft_set_shared_memory(spread_1d_subprob<T, 1>, 1, *d_plan) != 0) {
+        // any non-zero status is reported as insufficient shared memory
+        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+      }
+      RETURN_IF_CUDA_ERROR
       spread_1d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
           d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts,
@@ -268,6 +281,11 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize)
     }
   } else {
     for (int t = 0; t < blksize; t++) {
+      // configure shared memory for the <T, 0> kernel before its launch
+      if (cufinufft_set_shared_memory(spread_1d_subprob<T, 0>, 1, *d_plan) != 0) {
+        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+      }
+      RETURN_IF_CUDA_ERROR
       spread_1d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
           d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts,
diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh
index 24b4fb9d2..72c776c06 100644
--- a/src/cuda/1d/spreadinterp1d.cuh
+++ b/src/cuda/1d/spreadinterp1d.cuh
@@ -8,6 +8,9 @@
 #include <cufinufft/defs.h>
 #include <cufinufft/spreadinterp.h>
 #include <cufinufft/utils.h>
+
+#include <thrust/sort.h>
+
 using namespace cufinufft::utils;
 
 namespace cufinufft {
@@ -15,164 +18,173 @@ namespace spreadinterp {
 /* ------------------------ 1d Spreading Kernels ----------------------------*/
 /* Kernels for NUptsdriven Method */
 
-template <typename T, int KEREVALMETH>
-__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns, int nf1,
-                                      T es_c, T es_beta, T sigma, const int *idxnupts) {
-    int xx, ix;
-    T ker1[MAX_NSPREAD];
-
-    T x_rescaled;
-    cuda_complex<T> cnow;
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
-        x_rescaled = fold_rescale(x[idxnupts[i]], nf1);
-        cnow = c[idxnupts[i]];
-        int xstart = ceil(x_rescaled - ns / 2.0);
-        int xend = floor(x_rescaled + ns / 2.0);
-
-        T x1 = (T)xstart - x_rescaled;
-        if constexpr (KEREVALMETH == 1)
-            eval_kernel_vec_horner(ker1, x1, ns, sigma);
-        else
-            eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
-
-        for (xx = xstart; xx <= xend; xx++) {
-            ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
-            T kervalue = ker1[xx - xstart];
-            atomicAdd(&fw[ix].x, cnow.x * kervalue);
-            atomicAdd(&fw[ix].y, cnow.y * kervalue);
-        }
+// Point-driven 1D spreader: each point's c value is added onto fw via atomicAdd.
+template<typename T, int KEREVALMETH>
+__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex<T> *c,
+                                      cuda_complex<T> *fw, int M, int ns, int nf1, T es_c,
+                                      T es_beta, T sigma, const int *idxnupts) {
+  // dynamic stack allocation to reduce stack usage
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns);
+  auto *__restrict__ ker1 = ker;
+#else
+  T ker1[MAX_NSPREAD];
+#endif
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M;
+       i += blockDim.x * gridDim.x) { // grid-stride loop over the M points
+    const auto x_rescaled     = fold_rescale(x[idxnupts[i]], nf1);
+    const auto cnow           = c[idxnupts[i]]; // x and c are both read through idxnupts
+    const auto [xstart, xend] = interval(ns, x_rescaled);
+    const T x1                = (T)xstart - x_rescaled;
+    if constexpr (KEREVALMETH == 1) // compile-time choice of the kernel evaluator
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+    else
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+    // accumulate ker1-weighted copies of cnow; ix shifts xx by at most one period nf1
+    for (auto xx = xstart; xx <= xend; xx++) {
+      auto ix          = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
+      const T kervalue = ker1[xx - xstart];
+      atomicAdd(&fw[ix].x, cnow.x * kervalue);
+      atomicAdd(&fw[ix].y, cnow.y * kervalue);
     }
+  }
 }
 
 /* Kernels for SubProb Method */
 // SubProb properties
-template <typename T>
-__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, int *bin_size, const T *x,
-                                         int *sortidx) {
-    int binx;
-    int oldidx;
-    T x_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        x_rescaled = fold_rescale(x[i], nf1);
-        binx = floor(x_rescaled / bin_size_x);
-        binx = binx >= nbinx ? binx - 1 : binx;
-        binx = binx < 0 ? 0 : binx;
-        oldidx = atomicAdd(&bin_size[binx], 1);
-        sortidx[i] = oldidx;
-        if (binx >= nbinx) {
-            sortidx[i] = -binx;
-        }
+template<typename T> // counts points per bin; sortidx[i] gets the count seen by point i
+__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx,
+                                         int *bin_size, const T *x, int *sortidx) {
+  int binx;
+  int oldidx;
+  T x_rescaled;
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
+       i += gridDim.x * blockDim.x) { // grid-stride loop over the M points
+    x_rescaled = fold_rescale(x[i], nf1);
+    binx       = floor(x_rescaled / bin_size_x);
+    binx       = binx >= nbinx ? binx - 1 : binx; // step back one bin if past the last
+    binx       = binx < 0 ? 0 : binx;             // clamp below at bin 0
+    oldidx     = atomicAdd(&bin_size[binx], 1);   // count before this point was added
+    sortidx[i] = oldidx;
+    if (binx >= nbinx) { // NOTE(review): only true if binx was > nbinx before step-back
+      sortidx[i] = -binx; // NOTE(review): bin_size[binx] was already written — confirm
     }
+  }
 }
 
-template <typename T>
-__global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nbinx, const int *bin_startpts,
-                                                   const int *sortidx, const T *x, int *index, int nf1) {
-    int binx;
-    T x_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        x_rescaled = fold_rescale(x[i], nf1);
-        binx = floor(x_rescaled / bin_size_x);
-        binx = binx >= nbinx ? binx - 1 : binx;
-        binx = binx < 0 ? 0 : binx;
-
-        index[bin_startpts[binx] + sortidx[i]] = i;
-    }
+template<typename T> // scatters point ids into index[] grouped by bin
+__global__ void calc_inverse_of_global_sort_idx_1d(
+    int M, int bin_size_x, int nbinx, const int *bin_startpts, const int *sortidx,
+    const T *x, int *index, int nf1) {
+  int binx;
+  T x_rescaled;
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
+       i += gridDim.x * blockDim.x) { // grid-stride loop over the M points
+    x_rescaled = fold_rescale(x[i], nf1);
+    binx       = floor(x_rescaled / bin_size_x);
+    binx       = binx >= nbinx ? binx - 1 : binx; // step back one bin if past the last
+    binx       = binx < 0 ? 0 : binx;             // clamp below at bin 0
+    // destination slot = bin_startpts[binx] + sortidx[i]; presumably a bin-local rank
+    index[bin_startpts[binx] + sortidx[i]] = i;
+  }
 }
 
-template <typename T, int KEREVALMETH>
-__global__ void spread_1d_subprob(const T *x, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns, int nf1,
-                                  T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size,
-                                  int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts,
-                                  const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts) {
-    extern __shared__ char sharedbuf[];
-    cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
-
-    int xstart, xend;
-    int subpidx = blockIdx.x;
-    int bidx = subprob_to_bin[subpidx];
-    int binsubp_idx = subpidx - subprobstartpts[bidx];
-    int ix;
-    int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
-    int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
-
-    int xoffset = (bidx % nbinx) * bin_size_x;
-
-    int N = (bin_size_x + 2 * ceil(ns / 2.0));
-    T ker1[MAX_NSPREAD];
-
-    for (int i = threadIdx.x; i < N; i += blockDim.x) {
-        fwshared[i].x = 0.0;
-        fwshared[i].y = 0.0;
-    }
-    __syncthreads();
-
-    T x_rescaled;
-    cuda_complex<T> cnow;
-    for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
-        int idx = ptstart + i;
-        x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
-        cnow = c[idxnupts[idx]];
-
-        xstart = ceil(x_rescaled - ns / 2.0) - xoffset;
-        xend = floor(x_rescaled + ns / 2.0) - xoffset;
-
-        T x1 = (T)xstart + xoffset - x_rescaled;
-        if constexpr (KEREVALMETH == 1)
-            eval_kernel_vec_horner(ker1, x1, ns, sigma);
-        else
-            eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
-
-        for (int xx = xstart; xx <= xend; xx++) {
-            ix = xx + ceil(ns / 2.0);
-            if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0)
-                break;
-            atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]);
-            atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]);
-        }
+// Shared-memory ("subproblem") 1D spreader. Block b handles subproblem b: up to
+// maxsubprobsize points of bin subprob_to_bin[b] are accumulated in a shared tile of
+// N = bin_size_x + 2*ceil(ns/2) cells (bin plus one halo per side), which is then added
+// into fw with periodic wrapping. Needs N*sizeof(cuda_complex<T>) dynamic shared bytes.
+template<typename T, int KEREVALMETH>
+__global__ void spread_1d_subprob(
+    const T *x, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, uint8_t ns, int nf1,
+    T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size,
+    int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts,
+    const int *numsubprob, int maxsubprobsize, int nbinx, int *idxnupts) {
+  extern __shared__ char sharedbuf[];
+  auto *__restrict__ fwshared = (cuda_complex<T> *)sharedbuf;
+
+  const int subpidx     = blockIdx.x;
+  const int bidx        = subprob_to_bin[subpidx];
+  const int binsubp_idx = subpidx - subprobstartpts[bidx];
+  const int ptstart     = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
+  const int nupts   = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
+  const int xoffset = (bidx % nbinx) * bin_size_x; // first grid index of this bin
+  const auto ns_2   = (ns + 1) / 2;                // ceil(ns / 2): halo width per side
+  const int N       = bin_size_x + 2 * ns_2;       // number of cells in the shared tile
+
+  // dynamic stack allocation
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns);
+  auto *__restrict__ ker1 = ker;
+#else
+  T ker1[MAX_NSPREAD];
+#endif
+
+  for (int i = threadIdx.x; i < N; i += blockDim.x) fwshared[i] = {0, 0};
+  __syncthreads(); // the tile must be fully zeroed before any thread accumulates
+
+  for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
+    const auto idx            = ptstart + i;
+    const auto x_rescaled     = fold_rescale(x[idxnupts[idx]], nf1);
+    const auto cnow           = c[idxnupts[idx]];
+    const auto [xstart, xend] = interval(ns, x_rescaled); // global grid indices
+    const T x1                = T(xstart) - x_rescaled;
+    if constexpr (KEREVALMETH == 1)
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+    else
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+    for (int xx = xstart; xx <= xend; xx++) {
+      // tile index: make xx bin-relative, then shift past the left halo
+      const auto ix = xx - xoffset + ns_2;
+      if (ix >= N || ix < 0) break;
+      const cuda_complex<T> result{cnow.x * ker1[xx - xstart],
+                                   cnow.y * ker1[xx - xstart]};
+      atomicAddComplexShared<T>(fwshared + ix, result);
     }
-    __syncthreads();
-    /* write to global memory */
-    for (int k = threadIdx.x; k < N; k += blockDim.x) {
-        ix = xoffset - ceil(ns / 2.0) + k;
-        if (ix < (nf1 + ceil(ns / 2.0))) {
-            ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix);
-            atomicAdd(&fw[ix].x, fwshared[k].x);
-            atomicAdd(&fw[ix].y, fwshared[k].y);
-        }
+  }
+  __syncthreads(); // all shared accumulation must finish before the tile is read back
+  /* write to global memory: tile cell k maps to grid index xoffset - ns_2 + k */
+  for (int k = threadIdx.x; k < N; k += blockDim.x) {
+    auto ix = xoffset - ns_2 + k;
+    if (ix < (nf1 + ns_2)) {
+      ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); // periodic wrap
+      atomicAddComplexGlobal<T>(fw + ix, fwshared[k]);
     }
+  }
 }
 
 /* --------------------- 1d Interpolation Kernels ----------------------------*/
 /* Kernels for NUptsdriven Method */
-template <typename T, int KEREVALMETH>
-__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex<T> *c, const cuda_complex<T> *fw, int M, int ns, int nf1,
+// Point-driven 1D interpolation: c at each point = kernel-weighted sum of nearby fw.
+template<typename T, int KEREVALMETH>
+__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex<T> *c,
+                                      const cuda_complex<T> *fw, int M, int ns, int nf1,
                                       T es_c, T es_beta, T sigma, const int *idxnupts) {
-    T ker1[MAX_NSPREAD];
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
-        T x_rescaled = fold_rescale(x[idxnupts[i]], nf1);
-
-        int xstart = ceil(x_rescaled - ns / 2.0);
-        int xend = floor(x_rescaled + ns / 2.0);
-        cuda_complex<T> cnow;
-        cnow.x = 0.0;
-        cnow.y = 0.0;
-
-        T x1 = (T)xstart - x_rescaled;
-        if constexpr (KEREVALMETH == 1)
-            eval_kernel_vec_horner(ker1, x1, ns, sigma);
-        else
-            eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
-
-        for (int xx = xstart; xx <= xend; xx++) {
-            int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
-            T kervalue1 = ker1[xx - xstart];
-            cnow.x += fw[ix].x * kervalue1;
-            cnow.y += fw[ix].y * kervalue1;
-        }
-        c[idxnupts[i]].x = cnow.x;
-        c[idxnupts[i]].y = cnow.y;
+  // dynamic stack allocation
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns);
+  auto *__restrict__ ker1 = ker;
+#else
+  T ker1[MAX_NSPREAD];
+#endif
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M;
+       i += blockDim.x * gridDim.x) { // grid-stride loop over the M points
+    const T x_rescaled        = fold_rescale(x[idxnupts[i]], nf1);
+    const auto [xstart, xend] = interval(ns, x_rescaled);
+    cuda_complex<T> cnow{0, 0}; // per-point accumulator, written to c once at the end
+
+    const T x1 = (T)xstart - x_rescaled;
+    if constexpr (KEREVALMETH == 1) // compile-time choice of the kernel evaluator
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+    else
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+    for (int xx = xstart; xx <= xend; xx++) {
+      int ix            = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); // wrap
+      const T kervalue1 = ker1[xx - xstart];
+      cnow.x += fw[ix].x * kervalue1;
+      cnow.y += fw[ix].y * kervalue1;
     }
+    c[idxnupts[i]] = cnow;
+  }
 }
 
 } // namespace spreadinterp
diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu
index afc801b7f..f7f7b1559 100644
--- a/src/cuda/2d/cufinufft2d.cu
+++ b/src/cuda/2d/cufinufft2d.cu
@@ -1,14 +1,10 @@
-#include <assert.h>
+#include <cassert>
 #include <cmath>
 #include <complex>
-#include <iomanip>
-#include <iostream>
-
 #include <cufft.h>
 #include <cufinufft/contrib/helper_cuda.h>
 
 #include <cufinufft/cudeconvolve.h>
-#include <cufinufft/memtransfer.h>
 #include <cufinufft/spreadinterp.h>
 
 using namespace cufinufft::deconvolve;
diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu
index 533788482..0d3d3ff9b 100644
--- a/src/cuda/2d/interp2d_wrapper.cu
+++ b/src/cuda/2d/interp2d_wrapper.cu
@@ -1,13 +1,12 @@
-#include <iomanip>
 #include <iostream>
 
 #include <cuComplex.h>
 #include <cufinufft/contrib/helper_cuda.h>
 
-#include <cufinufft/memtransfer.h>
+#include <cufinufft/common.h>
 #include <cufinufft/spreadinterp.h>
 
-using namespace cufinufft::memtransfer;
+using namespace cufinufft::common;
 
 #include "spreadinterp2d.cuh"
 
@@ -120,17 +119,14 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
   int *d_subprob_to_bin  = d_plan->subprob_to_bin;
   int totalnumsubprob    = d_plan->totalnumsubprob;
 
-  T sigma                  = d_plan->opts.upsampfac;
-  size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) *
-                             (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-
-  if (sharedplanorysize > 49152) {
-    std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n";
-    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-  }
+  T sigma = d_plan->opts.upsampfac;
+  const auto sharedplanorysize =
+      shared_memory_required<T>(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+                                d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
 
   if (d_plan->opts.gpu_kerevalmeth) {
     for (int t = 0; t < blksize; t++) {
+      cufinufft_set_shared_memory(interp_2d_subprob<T, 1>, 2, *d_plan);
       interp_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
           sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
@@ -140,6 +136,7 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
     }
   } else {
     for (int t = 0; t < blksize; t++) {
+      cufinufft_set_shared_memory(interp_2d_subprob<T, 0>, 2, *d_plan);
       interp_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
           sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu
index 69b2ba956..80cf9f8e9 100644
--- a/src/cuda/2d/spread2d_wrapper.cu
+++ b/src/cuda/2d/spread2d_wrapper.cu
@@ -1,5 +1,4 @@
 #include <cassert>
-#include <iomanip>
 #include <iostream>
 
 #include <cuComplex.h>
@@ -7,14 +6,13 @@
 #include <thrust/device_ptr.h>
 #include <thrust/scan.h>
 
-#include <cufinufft/memtransfer.h>
+#include <cufinufft/common.h>
 #include <cufinufft/precision_independent.h>
 #include <cufinufft/spreadinterp.h>
 
 #include "spreadinterp2d.cuh"
 
 using namespace cufinufft::common;
-using namespace cufinufft::memtransfer;
 
 namespace cufinufft {
 namespace spreadinterp {
@@ -273,16 +271,17 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
 
   T sigma = d_plan->opts.upsampfac;
 
-  size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) *
-                             (bin_size_y + 2 * (int)ceil(ns / 2.0)) *
-                             sizeof(cuda_complex<T>);
-  if (sharedplanorysize > 49152) {
-    std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n";
-    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-  }
+  const auto sharedplanorysize =
+      shared_memory_required<T>(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+                                d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
 
   if (d_plan->opts.gpu_kerevalmeth) {
     for (int t = 0; t < blksize; t++) {
+      if (const auto finufft_err =
+              cufinufft_set_shared_memory(spread_2d_subprob<T, 1>, 2, *d_plan) != 0) {
+        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+      }
+      RETURN_IF_CUDA_ERROR
       spread_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
           sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
@@ -292,6 +291,11 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
     }
   } else {
     for (int t = 0; t < blksize; t++) {
+      if (const auto finufft_err =
+              cufinufft_set_shared_memory(spread_2d_subprob<T, 0>, 2, *d_plan) != 0) {
+        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+      }
+      RETURN_IF_CUDA_ERROR
       spread_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
           sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh
index 558984ea1..53a243e7e 100644
--- a/src/cuda/2d/spreadinterp2d.cuh
+++ b/src/cuda/2d/spreadinterp2d.cuh
@@ -15,314 +15,330 @@ namespace spreadinterp {
 /* ------------------------ 2d Spreading Kernels ----------------------------*/
 /* Kernels for NUptsdriven Method */
 
-template <typename T, int KEREVALMETH>
-__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex<T> *c, cuda_complex<T> *fw, int M,
-                                       int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) {
-    int xstart, ystart, xend, yend;
-    int xx, yy, ix, iy;
-    int outidx;
-    T ker1[MAX_NSPREAD];
-    T ker2[MAX_NSPREAD];
-
-    T x_rescaled, y_rescaled;
-    T kervalue1, kervalue2;
-    cuda_complex<T> cnow;
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
-      x_rescaled = fold_rescale(x[idxnupts[i]], nf1);
-      y_rescaled = fold_rescale(y[idxnupts[i]], nf2);
-        cnow = c[idxnupts[i]];
-
-        xstart = ceil(x_rescaled - ns / 2.0);
-        ystart = ceil(y_rescaled - ns / 2.0);
-        xend = floor(x_rescaled + ns / 2.0);
-        yend = floor(y_rescaled + ns / 2.0);
-
-        T x1 = (T)xstart - x_rescaled;
-        T y1 = (T)ystart - y_rescaled;
-
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, x1, ns, sigma);
-            eval_kernel_vec_horner(ker2, y1, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
-        }
-
-        for (yy = ystart; yy <= yend; yy++) {
-            for (xx = xstart; xx <= xend; xx++) {
-                ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
-                iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy);
-                outidx = ix + iy * nf1;
-                kervalue1 = ker1[xx - xstart];
-                kervalue2 = ker2[yy - ystart];
-                atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2);
-                atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2);
-            }
-        }
+// Point-driven 2D spreader: each point's c value is added onto fw (row length nf1).
+template<typename T, int KEREVALMETH>
+__global__ void spread_2d_nupts_driven(
+    const T *x, const T *y, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns,
+    int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) {
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 2); // one buffer, two halves
+  auto *__restrict__ ker1 = ker;
+  auto *__restrict__ ker2 = ker + ns;
+#else
+  T ker1[MAX_NSPREAD];
+  T ker2[MAX_NSPREAD];
+#endif
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M;
+       i += blockDim.x * gridDim.x) { // grid-stride loop over the M points
+    const auto x_rescaled     = fold_rescale(x[idxnupts[i]], nf1);
+    const auto y_rescaled     = fold_rescale(y[idxnupts[i]], nf2);
+    const auto cnow           = c[idxnupts[i]];
+    const auto [xstart, xend] = interval(ns, x_rescaled);
+    const auto [ystart, yend] = interval(ns, y_rescaled);
+    const auto x1 = (T)xstart - x_rescaled;
+    const auto y1 = (T)ystart - y_rescaled;
+
+    if constexpr (KEREVALMETH == 1) { // compile-time choice of the kernel evaluator
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
     }
+    // tensor-product accumulate: weight is ker1[x] * ker2[y]; indices wrap per dimension
+    for (auto yy = ystart; yy <= yend; yy++) {
+      for (auto xx = xstart; xx <= xend; xx++) {
+        const auto ix        = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
+        const auto iy        = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy);
+        const auto outidx    = ix + iy * nf1;
+        const auto kervalue1 = ker1[xx - xstart];
+        const auto kervalue2 = ker2[yy - ystart];
+        const cuda_complex<T> res{cnow.x * kervalue1 * kervalue2,
+                                  cnow.y * kervalue1 * kervalue2};
+        atomicAddComplexGlobal<T>(fw + outidx, res);
+      }
+    }
+  }
 }
 
 /* Kernels for SubProb Method */
 // SubProb properties
-template <typename T>
-__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny,
+template<typename T> // counts points per 2D bin; sortidx[i] gets the count point i saw
+__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x,
+                                         int bin_size_y, int nbinx, int nbiny,
                                          int *bin_size, T *x, T *y, int *sortidx) {
-    int binidx, binx, biny;
-    int oldidx;
-    T x_rescaled, y_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-      x_rescaled = fold_rescale(x[i], nf1);
-      y_rescaled = fold_rescale(y[i], nf2);
-        binx = floor(x_rescaled / bin_size_x);
-        binx = binx >= nbinx ? binx - 1 : binx;
-        binx = binx < 0 ? 0 : binx;
-        biny = floor(y_rescaled / bin_size_y);
-        biny = biny >= nbiny ? biny - 1 : biny;
-        biny = biny < 0 ? 0 : biny;
-        binidx = binx + biny * nbinx;
-        oldidx = atomicAdd(&bin_size[binidx], 1);
-        sortidx[i] = oldidx;
-        if (binx >= nbinx || biny >= nbiny) {
-            sortidx[i] = -biny;
-        }
+  int binidx, binx, biny;
+  int oldidx;
+  T x_rescaled, y_rescaled;
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
+       i += gridDim.x * blockDim.x) { // grid-stride loop over the M points
+    x_rescaled = fold_rescale(x[i], nf1);
+    y_rescaled = fold_rescale(y[i], nf2);
+    binx       = floor(x_rescaled / bin_size_x);
+    binx       = binx >= nbinx ? binx - 1 : binx; // step back one bin if past the last
+    binx       = binx < 0 ? 0 : binx;             // clamp below at bin 0
+    biny       = floor(y_rescaled / bin_size_y);
+    biny       = biny >= nbiny ? biny - 1 : biny;
+    biny       = biny < 0 ? 0 : biny;
+    binidx     = binx + biny * nbinx;             // bins are numbered x-fastest
+    oldidx     = atomicAdd(&bin_size[binidx], 1); // count before this point was added
+    sortidx[i] = oldidx;
+    if (binx >= nbinx || biny >= nbiny) { // NOTE(review): tested after the clamps above
+      sortidx[i] = -biny; // NOTE(review): bin_size[binidx] was already written — confirm
     }
+  }
 }
 
-template <typename T>
-__global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny,
-                                                     const int *bin_startpts, const int *sortidx, const T *x,
-                                                     const T *y, int *index, int nf1, int nf2) {
-    int binx, biny;
-    int binidx;
-    T x_rescaled, y_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-      x_rescaled = fold_rescale(x[i], nf1);
-      y_rescaled = fold_rescale(y[i], nf2);
-        binx = floor(x_rescaled / bin_size_x);
-        binx = binx >= nbinx ? binx - 1 : binx;
-        binx = binx < 0 ? 0 : binx;
-        biny = floor(y_rescaled / bin_size_y);
-        biny = biny >= nbiny ? biny - 1 : biny;
-        biny = biny < 0 ? 0 : biny;
-        binidx = binx + biny * nbinx;
-
-        index[bin_startpts[binidx] + sortidx[i]] = i;
-    }
+template<typename T> // scatters point ids into index[] grouped by 2D bin
+__global__ void calc_inverse_of_global_sort_index_2d(
+    int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, const int *bin_startpts,
+    const int *sortidx, const T *x, const T *y, int *index, int nf1, int nf2) {
+  int binx, biny;
+  int binidx;
+  T x_rescaled, y_rescaled;
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
+       i += gridDim.x * blockDim.x) { // grid-stride loop over the M points
+    x_rescaled = fold_rescale(x[i], nf1);
+    y_rescaled = fold_rescale(y[i], nf2);
+    binx       = floor(x_rescaled / bin_size_x);
+    binx       = binx >= nbinx ? binx - 1 : binx; // step back one bin if past the last
+    binx       = binx < 0 ? 0 : binx;             // clamp below at bin 0
+    biny       = floor(y_rescaled / bin_size_y);
+    biny       = biny >= nbiny ? biny - 1 : biny;
+    biny       = biny < 0 ? 0 : biny;
+    binidx     = binx + biny * nbinx; // bins are numbered x-fastest
+    // destination slot = bin_startpts[binidx] + sortidx[i]; presumably a bin-local rank
+    index[bin_startpts[binidx] + sortidx[i]] = i;
+  }
 }
 
-template <typename T, int KEREVALMETH>
-__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns,
-                                  int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size,
-                                  int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts,
-                                  const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts) {
-    extern __shared__ char sharedbuf[];
-    cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
-
-    int xstart, ystart, xend, yend;
-    int subpidx = blockIdx.x;
-    int bidx = subprob_to_bin[subpidx];
-    int binsubp_idx = subpidx - subprobstartpts[bidx];
-    int ix, iy;
-    int outidx;
-    int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
-    int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
-
-    int xoffset = (bidx % nbinx) * bin_size_x;
-    int yoffset = (bidx / nbinx) * bin_size_y;
-
-    int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0));
-    T ker1[MAX_NSPREAD];
-    T ker2[MAX_NSPREAD];
-
-    for (int i = threadIdx.x; i < N; i += blockDim.x) {
-        fwshared[i].x = 0.0;
-        fwshared[i].y = 0.0;
-    }
-    __syncthreads();
-
-    T x_rescaled, y_rescaled;
-    cuda_complex<T> cnow;
-    for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
-        int idx = ptstart + i;
-      x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
-      y_rescaled = fold_rescale(y[idxnupts[idx]], nf2);
-        cnow = c[idxnupts[idx]];
-
-        xstart = ceil(x_rescaled - ns / 2.0) - xoffset;
-        ystart = ceil(y_rescaled - ns / 2.0) - yoffset;
-        xend = floor(x_rescaled + ns / 2.0) - xoffset;
-        yend = floor(y_rescaled + ns / 2.0) - yoffset;
-
-        T x1 = (T)xstart + xoffset - x_rescaled;
-        T y1 = (T)ystart + yoffset - y_rescaled;
-
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, x1, ns, sigma);
-            eval_kernel_vec_horner(ker2, y1, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
-        }
-
-        for (int yy = ystart; yy <= yend; yy++) {
-            iy = yy + ceil(ns / 2.0);
-            if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0)
-                break;
-            for (int xx = xstart; xx <= xend; xx++) {
-                ix = xx + ceil(ns / 2.0);
-                if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0)
-                    break;
-                outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2);
-                T kervalue1 = ker1[xx - xstart];
-                T kervalue2 = ker2[yy - ystart];
-                atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2);
-                atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2);
-            }
-        }
+// Shared-memory ("subproblem") 2D spreader: block b accumulates the points of subproblem
+// b into a shared (bin_size_x + 2*ceil(ns/2)) x (bin_size_y + 2*ceil(ns/2)) tile, then
+// adds the tile into fw with periodic wrapping in both dimensions.
+template<typename T, int KEREVALMETH>
+__global__ void spread_2d_subprob(
+    const T *x, const T *y, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns,
+    int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size,
+    int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts,
+    const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny,
+    const int *idxnupts) {
+  extern __shared__ char sharedbuf[];
+  cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
+
+  const int subpidx      = blockIdx.x;
+  const auto bidx        = subprob_to_bin[subpidx];
+  const auto binsubp_idx = subpidx - subprobstartpts[bidx];
+  const auto ptstart     = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
+  const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
+
+  const int xoffset = (bidx % nbinx) * bin_size_x; // first grid column of this bin
+  const int yoffset = (bidx / nbinx) * bin_size_y; // first grid row of this bin
+
+  const auto ns_2       = (ns + 1) / 2; // ceil(ns / 2): halo width per side
+  const auto rounded_ns = ns_2 * 2;     // total halo per dimension
+  const int N           = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns);
+
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 2); // one buffer, two halves
+  auto *__restrict__ ker1 = ker;
+  auto *__restrict__ ker2 = ker + ns;
+#else
+  T ker1[MAX_NSPREAD];
+  T ker2[MAX_NSPREAD];
+#endif
+
+  for (int i = threadIdx.x; i < N; i += blockDim.x) {
+    fwshared[i] = {0, 0};
+  }
+  __syncthreads(); // the tile must be fully zeroed before any thread accumulates
+
+  for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
+    const int idx         = ptstart + i;
+    const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
+    const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2);
+    const auto cnow       = c[idxnupts[idx]];
+    auto [xstart, xend]   = interval(ns, x_rescaled); // global grid indices
+    auto [ystart, yend]   = interval(ns, y_rescaled);
+    const T x1 = T(xstart) - x_rescaled; // kernel arguments use the global start index
+    const T y1 = T(ystart) - y_rescaled;
+    xstart -= xoffset; // from here on the ranges are bin-relative
+    ystart -= yoffset;
+    xend -= xoffset;
+    yend -= yoffset;
+    if constexpr (KEREVALMETH == 1) {
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
+    }
 
-    __syncthreads();
-    /* write to global memory */
-    for (int k = threadIdx.x; k < N; k += blockDim.x) {
-        int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0));
-        int j = k / (bin_size_x + 2 * ceil(ns / 2.0));
-        ix = xoffset - ceil(ns / 2.0) + i;
-        iy = yoffset - ceil(ns / 2.0) + j;
-        if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) {
-            ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix);
-            iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy);
-            outidx = ix + iy * nf1;
-            int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2);
-            atomicAdd(&fw[outidx].x, fwshared[sharedidx].x);
-            atomicAdd(&fw[outidx].y, fwshared[sharedidx].y);
-        }
+    for (int yy = ystart; yy <= yend; yy++) {
+      const auto iy = yy + ns_2; // tile row: shift past the lower halo
+      if (iy >= (bin_size_y + rounded_ns) || iy < 0) break;
+      for (int xx = xstart; xx <= xend; xx++) {
+        const auto ix = xx + ns_2; // tile column: shift past the left halo
+        if (ix >= (bin_size_x + rounded_ns) || ix < 0) break;
+        const auto outidx   = ix + iy * (bin_size_x + rounded_ns);
+        const auto kervalue = ker1[xx - xstart] * ker2[yy - ystart];
+        const cuda_complex<T> res{cnow.x * kervalue, cnow.y * kervalue};
+        atomicAddComplexShared<T>(fwshared + outidx, res);
+      }
+    }
+  }
+
+  __syncthreads(); // all shared accumulation must finish before the tile is read back
+  /* write to global memory: tile cell (i, j) maps to grid (xoffset-ns_2+i, yoffset-ns_2+j) */
+  for (int k = threadIdx.x; k < N; k += blockDim.x) {
+    const auto i = k % (bin_size_x + rounded_ns);
+    const auto j = k / (bin_size_x + rounded_ns);
+    auto ix      = xoffset - ns_2 + i;
+    auto iy      = yoffset - ns_2 + j;
+    if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) {
+      ix                   = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix);
+      iy                   = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy);
+      const auto outidx    = ix + iy * nf1;
+      const auto sharedidx = i + j * (bin_size_x + rounded_ns);
+      atomicAddComplexGlobal<T>(fw + outidx, fwshared[sharedidx]);
     }
+  }
 }
 
 /* --------------------- 2d Interpolation Kernels ----------------------------*/
 /* Kernels for NUptsdriven Method */
-template <typename T, int KEREVALMETH>
-__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex<T> *c, const cuda_complex<T> *fw, int M,
-                                       int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
-      T x_rescaled = fold_rescale(x[idxnupts[i]], nf1);
-      T y_rescaled = fold_rescale(y[idxnupts[i]], nf2);
-
-        int xstart = ceil(x_rescaled - ns / 2.0);
-        int ystart = ceil(y_rescaled - ns / 2.0);
-        int xend = floor(x_rescaled + ns / 2.0);
-        int yend = floor(y_rescaled + ns / 2.0);
-        cuda_complex<T> cnow;
-        cnow.x = 0.0;
-        cnow.y = 0.0;
-        T ker1[MAX_NSPREAD];
-        T ker2[MAX_NSPREAD];
-
-        T x1 = (T)xstart - x_rescaled;
-        T y1 = (T)ystart - y_rescaled;
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, x1, ns, sigma);
-            eval_kernel_vec_horner(ker2, y1, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
-        }
-
-        for (int yy = ystart; yy <= yend; yy++) {
-            T kervalue2 = ker2[yy - ystart];
-            for (int xx = xstart; xx <= xend; xx++) {
-                int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
-                int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy);
-                int inidx = ix + iy * nf1;
-                T kervalue1 = ker1[xx - xstart];
-                cnow.x += fw[inidx].x * kervalue1 * kervalue2;
-                cnow.y += fw[inidx].y * kervalue1 * kervalue2;
-            }
-        }
-        c[idxnupts[i]].x = cnow.x;
-        c[idxnupts[i]].y = cnow.y;
+template<typename T, int KEREVALMETH> // KEREVALMETH == 1 uses eval_kernel_vec_horner, any other value eval_kernel_vec
+__global__ void interp_2d_nupts_driven(
+    const T *x, const T *y, cuda_complex<T> *c, const cuda_complex<T> *fw, int M, int ns,
+    int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) {
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 2); // one buffer holding 2 * ns values
+  auto *__restrict__ ker1 = ker;                             // first ns values, used for x
+  auto *__restrict__ ker2 = ker + ns;                        // next ns values, used for y
+#else
+  T ker1[MAX_NSPREAD]; // fallback: fixed MAX_NSPREAD entries (assumes ns <= MAX_NSPREAD - TODO confirm)
+  T ker2[MAX_NSPREAD];
+#endif
+  // Each thread starts at its global thread index and advances by the total thread count.
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M;
+       i += blockDim.x * gridDim.x) {
+    const auto x_rescaled     = fold_rescale(x[idxnupts[i]], nf1); // point is looked up through idxnupts
+    const auto y_rescaled     = fold_rescale(y[idxnupts[i]], nf2);
+    const auto [xstart, xend] = interval(ns, x_rescaled); // inclusive index range walked by the loops below
+    const auto [ystart, yend] = interval(ns, y_rescaled);
+    // Signed offset from the point to the first index of its range, per dimension.
+    T x1 = (T)xstart - x_rescaled;
+    T y1 = (T)ystart - y_rescaled;
+    // The evaluator is selected at compile time from KEREVALMETH.
+    if constexpr (KEREVALMETH == 1) {
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
     }
+    // Accumulate fw over the index range, each sample weighted by kervalue1 * kervalue2.
+    cuda_complex<T> cnow{0, 0};
+    for (int yy = ystart; yy <= yend; yy++) {
+      const T kervalue2 = ker2[yy - ystart];
+      for (int xx = xstart; xx <= xend; xx++) {
+        const auto ix        = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); // shift by +/- nf1 when outside [0, nf1 - 1]
+        const auto iy        = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); // same for y with nf2
+        const auto inidx     = ix + iy * nf1; // x varies fastest in fw
+        const auto kervalue1 = ker1[xx - xstart];
+        cnow.x += fw[inidx].x * kervalue1 * kervalue2;
+        cnow.y += fw[inidx].y * kervalue1 * kervalue2;
+      }
+    }
+    c[idxnupts[i]] = cnow; // written back through the same idxnupts lookup
+  }
 }
 
 /* Kernels for Subprob Method */
-template <typename T, int KEREVALMETH>
-__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex<T> *c, const cuda_complex<T> *fw, int M, int ns,
-                                  int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size,
-                                  int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts,
-                                  const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny,
-                                  const int *idxnupts) {
-    extern __shared__ char sharedbuf[];
-    cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
-
-    int xstart, ystart, xend, yend;
-    int subpidx = blockIdx.x;
-    int bidx = subprob_to_bin[subpidx];
-    int binsubp_idx = subpidx - subprobstartpts[bidx];
-    int ix, iy;
-    int outidx;
-    int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
-    int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
-
-    int xoffset = (bidx % nbinx) * bin_size_x;
-    int yoffset = (bidx / nbinx) * bin_size_y;
-    int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0));
-
-    for (int k = threadIdx.x; k < N; k += blockDim.x) {
-        int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0));
-        int j = k / (bin_size_x + 2 * ceil(ns / 2.0));
-        ix = xoffset - ceil(ns / 2.0) + i;
-        iy = yoffset - ceil(ns / 2.0) + j;
-        if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) {
-            ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix);
-            iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy);
-            outidx = ix + iy * nf1;
-            int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2);
-            fwshared[sharedidx].x = fw[outidx].x;
-            fwshared[sharedidx].y = fw[outidx].y;
-        }
+template<typename T, int KEREVALMETH> // KEREVALMETH == 1 uses eval_kernel_vec_horner, any other value eval_kernel_vec
+__global__ void interp_2d_subprob(
+    const T *x, const T *y, cuda_complex<T> *c, const cuda_complex<T> *fw, int M, int ns,
+    int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size,
+    int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts,
+    const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny,
+    const int *idxnupts) {
+  extern __shared__ char sharedbuf[]; // dynamic shared memory, sized by the launch
+  cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
+  // Per-thread kernel values: ker1 is used for x, ker2 for y.
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 2);
+  auto *__restrict__ ker1 = ker;
+  auto *__restrict__ ker2 = ker + ns;
+#else
+  T ker1[MAX_NSPREAD];
+  T ker2[MAX_NSPREAD];
+#endif
+  // One block per subproblem. int (not the unsigned blockIdx.x) keeps the index math below signed.
+  const int subpidx      = blockIdx.x;
+  const auto bidx        = subprob_to_bin[subpidx];
+  const auto binsubp_idx = subpidx - subprobstartpts[bidx];
+  const auto ptstart     = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
+  const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
+  // Grid coordinates of the first cell of this block's bin.
+  const auto xoffset = (bidx % nbinx) * bin_size_x;
+  const auto yoffset = (bidx / nbinx) * bin_size_y;
+  // Tile geometry: the bin plus ns_2 extra cells on each side in both dimensions.
+  // ns_2 is ns / 2 rounded up, so rounded_ns is ns rounded up to an even number.
+  const auto ns_2       = (ns + 1) / 2;
+  const auto rounded_ns = ns_2 * 2;
+  const int N           = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns);
+  // Copy the tile from fw into shared memory, shifting indices by +/- nf1 / nf2 at the edges.
+  for (int k = threadIdx.x; k < N; k += blockDim.x) {
+    int i   = k % (bin_size_x + rounded_ns);
+    int j   = k / (bin_size_x + rounded_ns);
+    auto ix = xoffset - ns_2 + i;
+    auto iy = yoffset - ns_2 + j;
+    if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { // cells beyond this bound are not loaded
+      ix                   = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix);
+      iy                   = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy);
+      const auto outidx    = ix + iy * nf1;
+      const auto sharedidx = i + j * (bin_size_x + rounded_ns);
+      fwshared[sharedidx]  = fw[outidx];
     }
-    __syncthreads();
-
-    T ker1[MAX_NSPREAD];
-    T ker2[MAX_NSPREAD];
-
-    T x_rescaled, y_rescaled;
-    cuda_complex<T> cnow;
-    for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
-        int idx = ptstart + i;
-      x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
-      y_rescaled = fold_rescale(y[idxnupts[idx]], nf2);
-        cnow.x = 0.0;
-        cnow.y = 0.0;
-
-        xstart = ceil(x_rescaled - ns / 2.0) - xoffset;
-        ystart = ceil(y_rescaled - ns / 2.0) - yoffset;
-        xend = floor(x_rescaled + ns / 2.0) - xoffset;
-        yend = floor(y_rescaled + ns / 2.0) - yoffset;
-
-        T x1 = (T)xstart + xoffset - x_rescaled;
-        T y1 = (T)ystart + yoffset - y_rescaled;
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, x1, ns, sigma);
-            eval_kernel_vec_horner(ker2, y1, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
-        }
-
-        for (int yy = ystart; yy <= yend; yy++) {
-            T kervalue2 = ker2[yy - ystart];
-            for (int xx = xstart; xx <= xend; xx++) {
-                ix = xx + ceil(ns / 2.0);
-                iy = yy + ceil(ns / 2.0);
-                outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2);
-                T kervalue1 = ker1[xx - xstart];
-                cnow.x += fwshared[outidx].x * kervalue1 * kervalue2;
-                cnow.y += fwshared[outidx].y * kervalue1 * kervalue2;
-            }
-        }
-        c[idxnupts[idx]] = cnow;
+  }
+  __syncthreads(); // the tile must be complete before any thread reads it
+  for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
+    int idx               = ptstart + i;
+    const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
+    const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2);
+    cuda_complex<T> cnow{0, 0};
+    // Inclusive index range for this point; shifted by the bin origin further down.
+    auto [xstart, xend] = interval(ns, x_rescaled);
+    auto [ystart, yend] = interval(ns, y_rescaled);
+    // Offsets are taken before the shift, so they are relative to the unshifted start.
+    const T x1 = T(xstart) - x_rescaled;
+    const T y1 = T(ystart) - y_rescaled;
+    // Shift the range by the bin origin.
+    xstart -= xoffset;
+    ystart -= yoffset;
+    xend -= xoffset;
+    yend -= yoffset;
+    // The evaluator is selected at compile time from KEREVALMETH.
+    if constexpr (KEREVALMETH == 1) {
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
+    }
+    // Accumulate from the shared tile; + ns_2 turns the shifted index into a tile index.
+    for (int yy = ystart; yy <= yend; yy++) {
+      const auto kervalue2 = ker2[yy - ystart];
+      for (int xx = xstart; xx <= xend; xx++) {
+        const auto ix        = xx + ns_2;
+        const auto iy        = yy + ns_2;
+        const auto outidx    = ix + iy * (bin_size_x + rounded_ns);
+        const auto kervalue1 = ker1[xx - xstart];
+        cnow.x += fwshared[outidx].x * kervalue1 * kervalue2;
+        cnow.y += fwshared[outidx].y * kervalue1 * kervalue2;
+      }
     }
+    c[idxnupts[idx]] = cnow; // result is stored at the point's idxnupts entry
+  }
 }
 
 } // namespace spreadinterp
diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu
index ea0ef4a86..5977e6d5f 100644
--- a/src/cuda/3d/cufinufft3d.cu
+++ b/src/cuda/3d/cufinufft3d.cu
@@ -1,13 +1,10 @@
 #include <cmath>
 #include <complex>
-#include <iomanip>
-#include <iostream>
 
 #include <cufft.h>
 #include <cufinufft/contrib/helper_cuda.h>
 
 #include <cufinufft/cudeconvolve.h>
-#include <cufinufft/memtransfer.h>
 #include <cufinufft/spreadinterp.h>
 #include <cufinufft/types.h>
 
diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu
index b42231d86..91379d3ae 100644
--- a/src/cuda/3d/interp3d_wrapper.cu
+++ b/src/cuda/3d/interp3d_wrapper.cu
@@ -1,15 +1,15 @@
-#include <iomanip>
 #include <iostream>
 
 #include <cuComplex.h>
 #include <cufinufft/contrib/helper_cuda.h>
 
+#include "spreadinterp3d.cuh"
+#include <cufinufft/common.h>
 #include <cufinufft/memtransfer.h>
 #include <cufinufft/spreadinterp.h>
 
-#include "spreadinterp3d.cuh"
-
 using namespace cufinufft::memtransfer;
+using namespace cufinufft::common;
 
 namespace cufinufft {
 namespace spreadinterp {
@@ -123,19 +123,20 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_
   int *d_subprob_to_bin  = d_plan->subprob_to_bin;
   int totalnumsubprob    = d_plan->totalnumsubprob;
 
-  T sigma                  = d_plan->spopts.upsampfac;
-  T es_c                   = d_plan->spopts.ES_c;
-  T es_beta                = d_plan->spopts.ES_beta;
-  size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) *
-                             (bin_size_y + 2 * ceil(ns / 2.0)) *
-                             (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-  if (sharedplanorysize > 49152) {
-    std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n";
-    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-  }
+  T sigma   = d_plan->spopts.upsampfac;
+  T es_c    = d_plan->spopts.ES_c;
+  T es_beta = d_plan->spopts.ES_beta;
+  // Dynamic shared memory size handed to each kernel launch below.
+  const auto sharedplanorysize =
+      shared_memory_required<T>(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+                                d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
 
   for (int t = 0; t < blksize; t++) {
     if (d_plan->opts.gpu_kerevalmeth == 1) {
+      // Report a failed shared-memory setup instead of launching the kernel anyway.
+      if (cufinufft_set_shared_memory(interp_3d_subprob<T, 1>, 3, *d_plan) != 0) {
+        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+      }
       interp_3d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
           es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
@@ -143,6 +144,10 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_
           numbins[0], numbins[1], numbins[2], d_idxnupts);
       RETURN_IF_CUDA_ERROR
     } else {
+      // Report a failed shared-memory setup instead of launching the kernel anyway.
+      if (cufinufft_set_shared_memory(interp_3d_subprob<T, 0>, 3, *d_plan) != 0) {
+        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+      }
       interp_3d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
           es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu
index fa67f95f8..475a888ac 100644
--- a/src/cuda/3d/spread3d_wrapper.cu
+++ b/src/cuda/3d/spread3d_wrapper.cu
@@ -1,5 +1,4 @@
 #include <cassert>
-#include <iomanip>
 #include <iostream>
 
 #include <cuComplex.h>
@@ -7,11 +6,11 @@
 #include <thrust/device_ptr.h>
 #include <thrust/scan.h>
 
-#include <cufinufft/memtransfer.h>
+#include <cufinufft/common.h>
 #include <cufinufft/precision_independent.h>
 #include <cufinufft/spreadinterp.h>
+
 using namespace cufinufft::common;
-using namespace cufinufft::memtransfer;
 
 #include "spreadinterp3d.cuh"
 
@@ -530,20 +529,19 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_
   int totalnumsubprob   = d_plan->totalnumsubprob;
   int *d_subprob_to_bin = d_plan->subprob_to_bin;
 
-  T sigma                  = d_plan->spopts.upsampfac;
-  T es_c                   = d_plan->spopts.ES_c;
-  T es_beta                = d_plan->spopts.ES_beta;
-  size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) *
-                             (bin_size_y + 2 * ceil(ns / 2.0)) *
-                             (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-  if (sharedplanorysize > 49152) {
-    std::cerr << "[cuspread3d_subprob] error: not enough shared memory ("
-              << sharedplanorysize << ")" << std::endl;
-    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-  }
-
+  T sigma   = d_plan->spopts.upsampfac;
+  T es_c    = d_plan->spopts.ES_c;
+  T es_beta = d_plan->spopts.ES_beta;
+  const auto sharedplanorysize =
+      shared_memory_required<T>(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+                                d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); // dynamic shared memory size for the launches below
   for (int t = 0; t < blksize; t++) {
     if (d_plan->opts.gpu_kerevalmeth) {
+      if (const auto finufft_err = // holds the returned code itself; any nonzero value takes the branch
+              cufinufft_set_shared_memory(spread_3d_subprob<T, 1>, 3, *d_plan)) {
+        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+      }
+      RETURN_IF_CUDA_ERROR
       spread_3d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
           sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
@@ -551,6 +549,11 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_
           numbins[0], numbins[1], numbins[2], d_idxnupts);
       RETURN_IF_CUDA_ERROR
     } else {
+      if (const auto finufft_err = // holds the returned code itself; any nonzero value takes the branch
+              cufinufft_set_shared_memory(spread_3d_subprob<T, 0>, 3, *d_plan)) {
+        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+      }
+      RETURN_IF_CUDA_ERROR
       spread_3d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
           d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
           sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh
index 838816a56..59b4661ff 100644
--- a/src/cuda/3d/spreadinterp3d.cuh
+++ b/src/cuda/3d/spreadinterp3d.cuh
@@ -4,554 +4,602 @@
 #include <cuda.h>
 #include <cufinufft/contrib/helper_cuda.h>
 
+#include <cuda_runtime.h>
 #include <cufinufft/defs.h>
 #include <cufinufft/precision_independent.h>
 #include <cufinufft/spreadinterp.h>
 #include <cufinufft/types.h>
 #include <cufinufft/utils.h>
 
+using namespace cufinufft::utils;
+
 namespace cufinufft {
 namespace spreadinterp {
 /* ---------------------- 3d Spreading Kernels -------------------------------*/
 /* Kernels for bin sort NUpts */
 
-template <typename T>
-__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, int bin_size_y,
-                                         int bin_size_z, int nbinx, int nbiny, int nbinz, int *bin_size, const T *x,
+template<typename T> // Counts points per bin in bin_size and stores each point's pre-increment count in sortidx.
+__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x,
+                                         int bin_size_y, int bin_size_z, int nbinx,
+                                         int nbiny, int nbinz, int *bin_size, const T *x,
                                          const T *y, const T *z, int *sortidx) {
-    int binidx, binx, biny, binz;
-    int oldidx;
-    T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        x_rescaled = fold_rescale(x[i], nf1);
-        y_rescaled = fold_rescale(y[i], nf2);
-        z_rescaled = fold_rescale(z[i], nf3);
-        binx = floor(x_rescaled / bin_size_x);
-        binx = binx >= nbinx ? binx - 1 : binx;
-        binx = binx < 0 ? 0 : binx;
-
-        biny = floor(y_rescaled / bin_size_y);
-        biny = biny >= nbiny ? biny - 1 : biny;
-        biny = biny < 0 ? 0 : biny;
-
-        binz = floor(z_rescaled / bin_size_z);
-        binz = binz >= nbinz ? binz - 1 : binz;
-        binz = binz < 0 ? 0 : binz;
-        binidx = binx + biny * nbinx + binz * nbinx * nbiny;
-        oldidx = atomicAdd(&bin_size[binidx], 1);
-        sortidx[i] = oldidx;
-    }
+  int binidx, binx, biny, binz;
+  int oldidx;
+  T x_rescaled, y_rescaled, z_rescaled;
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; // start at the global thread index
+       i += gridDim.x * blockDim.x) {                        // advance by the total thread count
+    x_rescaled = fold_rescale(x[i], nf1);
+    y_rescaled = fold_rescale(y[i], nf2);
+    z_rescaled = fold_rescale(z[i], nf3);
+    binx       = floor(x_rescaled / bin_size_x);       // bin index along x
+    binx       = binx >= nbinx ? binx - 1 : binx;      // step back one bin when at or past nbinx
+    binx       = binx < 0 ? 0 : binx;                  // negative indices go to bin 0
+    // Same three steps for y.
+    biny = floor(y_rescaled / bin_size_y);
+    biny = biny >= nbiny ? biny - 1 : biny;
+    biny = biny < 0 ? 0 : biny;
+    // Same three steps for z, then combine into one linear bin index.
+    binz       = floor(z_rescaled / bin_size_z);
+    binz       = binz >= nbinz ? binz - 1 : binz;
+    binz       = binz < 0 ? 0 : binz;
+    binidx     = binx + biny * nbinx + binz * nbinx * nbiny; // x varies fastest, then y, then z
+    oldidx     = atomicAdd(&bin_size[binidx], 1);            // atomicAdd returns the count before the increment
+    sortidx[i] = oldidx;                                     // position of point i among the points counted into its bin
+  }
 }
 
-template <typename T>
-__global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx,
-                                                     int nbiny, int nbinz, const int *bin_startpts, const int *sortidx,
-                                                     const T *x, const T *y, const T *z, int *index,
-                                                     int nf1, int nf2, int nf3) {
-    int binx, biny, binz;
-    int binidx;
-    T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        x_rescaled = fold_rescale(x[i], nf1);
-        y_rescaled = fold_rescale(y[i], nf2);
-        z_rescaled = fold_rescale(z[i], nf3);
-        binx = floor(x_rescaled / bin_size_x);
-        binx = binx >= nbinx ? binx - 1 : binx;
-        binx = binx < 0 ? 0 : binx;
-        biny = floor(y_rescaled / bin_size_y);
-        biny = biny >= nbiny ? biny - 1 : biny;
-        biny = biny < 0 ? 0 : biny;
-        binz = floor(z_rescaled / bin_size_z);
-        binz = binz >= nbinz ? binz - 1 : binz;
-        binz = binz < 0 ? 0 : binz;
-        binidx = common::calc_global_index_v2(binx, biny, binz, nbinx, nbiny, nbinz);
-
-        index[bin_startpts[binidx] + sortidx[i]] = i;
-    }
+template<typename T> // For every point i, writes i into index[] at bin_startpts[bin of i] + sortidx[i].
+__global__ void calc_inverse_of_global_sort_index_3d(
+    int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, int nbiny,
+    int nbinz, const int *bin_startpts, const int *sortidx, const T *x, const T *y,
+    const T *z, int *index, int nf1, int nf2, int nf3) {
+  int binx, biny, binz;
+  int binidx;
+  T x_rescaled, y_rescaled, z_rescaled;
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; // start at the global thread index
+       i += gridDim.x * blockDim.x) {                        // advance by the total thread count
+    x_rescaled = fold_rescale(x[i], nf1);
+    y_rescaled = fold_rescale(y[i], nf2);
+    z_rescaled = fold_rescale(z[i], nf3);
+    binx       = floor(x_rescaled / bin_size_x);  // bin index along x
+    binx       = binx >= nbinx ? binx - 1 : binx; // step back one bin when at or past nbinx
+    binx       = binx < 0 ? 0 : binx;             // negative indices go to bin 0
+    biny       = floor(y_rescaled / bin_size_y);  // same three steps for y
+    biny       = biny >= nbiny ? biny - 1 : biny;
+    biny       = biny < 0 ? 0 : biny;
+    binz       = floor(z_rescaled / bin_size_z);  // same three steps for z
+    binz       = binz >= nbinz ? binz - 1 : binz;
+    binz       = binz < 0 ? 0 : binz;
+    binidx     = common::calc_global_index_v2(binx, biny, binz, nbinx, nbiny, nbinz); // presumably linearizes the three bin indices - confirm against its definition
+    // Slot = start of the bin in bin_startpts plus this point's sortidx entry.
+    index[bin_startpts[binidx] + sortidx[i]] = i;
+  }
 }
 
 /* Kernels for NUptsdriven method */
-template <typename T, int KEREVALMETH>
-__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex<T> *c,
-                                       cuda_complex<T> *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta,
-                                       T sigma, const int *idxnupts) {
-    int xx, yy, zz, ix, iy, iz;
-    int outidx;
-    T ker1[MAX_NSPREAD];
-    T ker2[MAX_NSPREAD];
-    T ker3[MAX_NSPREAD];
-
-    T ker1val, ker2val, ker3val;
-
-    T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
-        x_rescaled = fold_rescale(x[idxnupts[i]], nf1);
-        y_rescaled = fold_rescale(y[idxnupts[i]], nf2);
-        z_rescaled = fold_rescale(z[idxnupts[i]], nf3);
-
-        int xstart = ceil(x_rescaled - ns / 2.0);
-        int ystart = ceil(y_rescaled - ns / 2.0);
-        int zstart = ceil(z_rescaled - ns / 2.0);
-        int xend = floor(x_rescaled + ns / 2.0);
-        int yend = floor(y_rescaled + ns / 2.0);
-        int zend = floor(z_rescaled + ns / 2.0);
-
-        T x1 = (T)xstart - x_rescaled;
-        T y1 = (T)ystart - y_rescaled;
-        T z1 = (T)zstart - z_rescaled;
-
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, x1, ns, sigma);
-            eval_kernel_vec_horner(ker2, y1, ns, sigma);
-            eval_kernel_vec_horner(ker3, z1, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
-            eval_kernel_vec(ker3, z1, ns, es_c, es_beta);
-        }
+template<typename T, int KEREVALMETH> // KEREVALMETH == 1 uses eval_kernel_vec_horner, any other value eval_kernel_vec
+__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z,
+                                       const cuda_complex<T> *c, cuda_complex<T> *fw,
+                                       int M, int ns, int nf1, int nf2, int nf3, T es_c,
+                                       T es_beta, T sigma, const int *idxnupts) {
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 3); // one buffer holding 3 * ns values
+  auto *__restrict__ ker1 = ker;                             // used for x
+  auto *__restrict__ ker2 = ker + ns;                        // used for y
+  auto *__restrict__ ker3 = ker + ns + ns;                   // used for z
+#else
+  T ker1[MAX_NSPREAD];
+  T ker2[MAX_NSPREAD];
+  T ker3[MAX_NSPREAD];
+#endif
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; // start at the global thread index
+       i += blockDim.x * gridDim.x) {                        // advance by the total thread count
+    const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1);
+    const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2);
+    const auto z_rescaled = fold_rescale(z[idxnupts[i]], nf3);
+    const auto cnow       = c[idxnupts[i]]; // read the strength once per point, not once per grid cell
+    // Inclusive index range touched by this point in each dimension.
+    const auto [xstart, xend] = interval(ns, x_rescaled);
+    const auto [ystart, yend] = interval(ns, y_rescaled);
+    const auto [zstart, zend] = interval(ns, z_rescaled);
+    // Signed offset from the point to the first index of its range, per dimension.
+    const auto x1 = T(xstart) - x_rescaled;
+    const auto y1 = T(ystart) - y_rescaled;
+    const auto z1 = T(zstart) - z_rescaled;
+    // The evaluator is selected at compile time from KEREVALMETH.
+    if constexpr (KEREVALMETH == 1) {
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+      eval_kernel_vec_horner(ker3, z1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
+      eval_kernel_vec(ker3, z1, ns, es_c, es_beta);
+    }
 
-        for (zz = zstart; zz <= zend; zz++) {
-            ker3val = ker3[zz - zstart];
-            for (yy = ystart; yy <= yend; yy++) {
-                ker2val = ker2[yy - ystart];
-                for (xx = xstart; xx <= xend; xx++) {
-                    ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
-                    iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy);
-                    iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz);
-                    outidx = ix + iy * nf1 + iz * nf1 * nf2;
-                    ker1val = ker1[xx - xstart];
-                    T kervalue = ker1val * ker2val * ker3val;
-                    atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue);
-                    atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue);
-                }
-            }
+    for (int zz = zstart; zz <= zend; zz++) {
+      const auto ker3val = ker3[zz - zstart];
+      for (int yy = ystart; yy <= yend; yy++) {
+        const auto ker2val = ker2[yy - ystart];
+        for (int xx = xstart; xx <= xend; xx++) {
+          const int ix        = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); // shift by +/- nf1 when outside [0, nf1 - 1]
+          const int iy        = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy);
+          const int iz        = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz);
+          const int outidx    = ix + iy * nf1 + iz * nf1 * nf2; // x varies fastest, then y, then z
+          const auto ker1val  = ker1[xx - xstart];
+          const auto kervalue = ker1val * ker2val * ker3val;
+          const cuda_complex<T> res{cnow.x * kervalue, cnow.y * kervalue};
+          atomicAddComplexGlobal<T>(fw + outidx, res); // different points can target the same fw cell
         }
+      }
     }
+  }
 }
 
 /* Kernels for Subprob method */
-template <typename T, int KEREVALMETH>
-__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns, int nf1,
-                                  int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size,
-                                  int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin,
-                                  int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny,
-                                  int nbinz, int *idxnupts) {
-    extern __shared__ char sharedbuf[];
-    cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
-
-    const int bidx = subprob_to_bin[blockIdx.x];
-    const int binsubp_idx = blockIdx.x - subprobstartpts[bidx];
-    const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
-    const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
-
-    const int xoffset = (bidx % nbinx) * bin_size_x;
-    const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y;
-    const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z;
-
-    int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0));
-
-    for (int i = threadIdx.x; i < N; i += blockDim.x) {
-        fwshared[i].x = 0.0;
-        fwshared[i].y = 0.0;
+template<typename T, int KEREVALMETH> // KEREVALMETH == 1 selects eval_kernel_vec_horner, any other value eval_kernel_vec
+__global__ void spread_3d_subprob( // each block handles one subproblem: accumulate into a shared tile, then add the tile into fw
+    T *x, T *y, T *z, cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns, int nf1,
+    int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size,
+    int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin,
+    int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny,
+    int nbinz, int *idxnupts) {
+  extern __shared__ char sharedbuf[]; // dynamic shared memory; must hold at least N cuda_complex<T> values
+  auto fwshared = (cuda_complex<T> *)sharedbuf;
+
+  const int bidx        = subprob_to_bin[blockIdx.x]; // bin this block works on
+  const int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; // index of this subproblem within its bin
+  const int ptstart     = binstartpts[bidx] + binsubp_idx * maxsubprobsize; // first idxnupts entry of this subproblem
+  const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); // capped at maxsubprobsize; the remainder of the bin if smaller
+
+  const int xoffset = (bidx % nbinx) * bin_size_x; // bidx is decoded x-fastest, then y, then z
+  const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y;
+  const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z;
+
+  const T ns_2f         = ns * T(.5); // NOTE(review): not referenced anywhere else in this kernel
+  const auto ns_2       = (ns + 1) / 2; // integer form of the removed ceil(ns / 2.0)
+  const auto rounded_ns = ns_2 * 2; // extra cells per tile dimension; indices below are shifted by ns_2
+
+  const int N = // number of cells in the padded shared tile
+      (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns) * (bin_size_z + rounded_ns);
+
+  for (int i = threadIdx.x; i < N; i += blockDim.x) { // threads of the block zero the tile together
+    fwshared[i] = {0, 0};
+  }
+  __syncthreads(); // the tile must be fully zeroed before any thread accumulates into it
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 3); // one allocation split into three arrays of ns values
+  auto *__restrict__ ker1 = ker;
+  auto *__restrict__ ker2 = ker + ns;
+  auto *__restrict__ ker3 = ker + ns + ns;
+#else
+  T ker1[MAX_NSPREAD]; // fallback: fixed-size per-axis arrays
+  T ker2[MAX_NSPREAD];
+  T ker3[MAX_NSPREAD];
+#endif
+
+  for (int i = threadIdx.x; i < nupts; i += blockDim.x) { // thread t handles points t, t + blockDim.x, ...
+    const int nuptsidx    = idxnupts[ptstart + i]; // index used to read x, y, z and c for this point
+    const auto x_rescaled = fold_rescale(x[nuptsidx], nf1);
+    const auto y_rescaled = fold_rescale(y[nuptsidx], nf2);
+    const auto z_rescaled = fold_rescale(z[nuptsidx], nf3);
+    const auto cnow       = c[nuptsidx];
+    auto [xstart, xend]   = interval(ns, x_rescaled); // presumably first/last grid index under the kernel (replaces the removed ceil/floor code) - verify in interval()
+    auto [ystart, yend]   = interval(ns, y_rescaled);
+    auto [zstart, zend]   = interval(ns, z_rescaled);
+
+    const T x1 = T(xstart) - x_rescaled; // kernel argument, taken before the shift to bin-local indices
+    const T y1 = T(ystart) - y_rescaled;
+    const T z1 = T(zstart) - z_rescaled;
+
+    xstart -= xoffset; // from here on start/end are relative to the bin origin
+    ystart -= yoffset;
+    zstart -= zoffset;
+
+    xend -= xoffset;
+    yend -= yoffset;
+    zend -= zoffset;
+
+    if constexpr (KEREVALMETH == 1) {
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+      eval_kernel_vec_horner(ker3, z1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
+      eval_kernel_vec(ker3, z1, ns, es_c, es_beta);
     }
-    __syncthreads();
-
-    for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
-        T ker1[MAX_NSPREAD];
-        T ker2[MAX_NSPREAD];
-        T ker3[MAX_NSPREAD];
-
-        const int nuptsidx = idxnupts[ptstart + i];
-        const T x_rescaled = fold_rescale(x[nuptsidx], nf1);
-        const T y_rescaled = fold_rescale(y[nuptsidx], nf2);
-        const T z_rescaled = fold_rescale(z[nuptsidx], nf3);
-        cuda_complex<T> cnow = c[nuptsidx];
-
-        const int xstart = ceil(x_rescaled - ns / 2.0) - xoffset;
-        const int ystart = ceil(y_rescaled - ns / 2.0) - yoffset;
-        const int zstart = ceil(z_rescaled - ns / 2.0) - zoffset;
-
-        const int xend = floor(x_rescaled + ns / 2.0) - xoffset;
-        const int yend = floor(y_rescaled + ns / 2.0) - yoffset;
-        const int zend = floor(z_rescaled + ns / 2.0) - zoffset;
-
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma);
-            eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma);
-            eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta);
-            eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta);
-        }
 
-        for (int zz = zstart; zz <= zend; zz++) {
-            const T kervalue3 = ker3[zz - zstart];
-            const int iz = zz + ceil(ns / 2.0);
-            if (iz >= (bin_size_z + (int)ceil(ns / 2.0) * 2) || iz < 0)
-                break;
-            for (int yy = ystart; yy <= yend; yy++) {
-                const T kervalue2 = ker2[yy - ystart];
-                const int iy = yy + ceil(ns / 2.0);
-                if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0)
-                    break;
-                for (int xx = xstart; xx <= xend; xx++) {
-                    const int ix = xx + ceil(ns / 2.0);
-                    if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0)
-                        break;
-                    const int outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) +
-                                       iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2);
-                    const T kervalue1 = ker1[xx - xstart];
-                    atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3);
-                    atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3);
-                }
-            }
+    for (int zz = zstart; zz <= zend; zz++) {
+      const T kervalue3 = ker3[zz - zstart];
+      const int iz      = zz + ns_2; // tile index: bin-local index shifted by ns_2
+      if (iz >= (bin_size_z + (int)rounded_ns) || iz < 0) break; // an index outside the padded tile ends the z loop
+      for (int yy = ystart; yy <= yend; yy++) {
+        const T kervalue2 = ker2[yy - ystart];
+        const int iy      = yy + ns_2;
+        if (iy >= (bin_size_y + (int)rounded_ns) || iy < 0) break; // same bound check along y
+        for (int xx = xstart; xx <= xend; xx++) {
+          const int ix = xx + ns_2;
+          if (ix >= (bin_size_x + (int)rounded_ns) || ix < 0) break; // same bound check along x
+          const int outidx = ix + iy * (bin_size_x + rounded_ns) + // flat tile index, x fastest
+                             iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns);
+          const auto kervalue = ker1[xx - xstart] * kervalue2 * kervalue3; // product of the three 1D kernel values
+          const cuda_complex<T> res{cnow.x * kervalue, cnow.y * kervalue};
+          atomicAddComplexShared<T>(fwshared + outidx, res); // accumulate into the shared tile
         }
+      }
     }
-    __syncthreads();
-
-    /* write to global memory */
-    for (int n = threadIdx.x; n < N; n += blockDim.x) {
-        const int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0));
-        const int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0));
-        const int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)));
-
-        int ix = xoffset - ceil(ns / 2.0) + i;
-        int iy = yoffset - ceil(ns / 2.0) + j;
-        int iz = zoffset - ceil(ns / 2.0) + k;
-
-        if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) {
-            ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix);
-            iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy);
-            iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz);
-            const int outidx = ix + iy * nf1 + iz * nf1 * nf2;
-            const int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) +
-                                  k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2);
-            atomicAdd(&fw[outidx].x, fwshared[sharedidx].x);
-            atomicAdd(&fw[outidx].y, fwshared[sharedidx].y);
-        }
+  }
+  __syncthreads(); // every contribution must be in the shared tile before it is written out
+
+  /* write to global memory */
+  for (int n = threadIdx.x; n < N; n += blockDim.x) {
+    const int i = n % (bin_size_x + rounded_ns); // decode n into tile coordinates (i, j, k)
+    const int j = (n / (bin_size_x + rounded_ns)) % (bin_size_y + rounded_ns);
+    const int k = n / ((bin_size_x + rounded_ns) * (bin_size_y + rounded_ns));
+
+    int ix = xoffset - ns_2 + i; // grid index before wrapping; can be negative or >= nf1
+    int iy = yoffset - ns_2 + j;
+    int iz = zoffset - ns_2 + k;
+
+    if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2) && iz < (nf3 + ns_2)) { // cells at or beyond nf + ns_2 on any axis are skipped
+      ix                  = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); // periodic wrap: nf1 is added or subtracted at most once
+      iy                  = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy);
+      iz                  = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz);
+      const int outidx    = ix + iy * nf1 + iz * nf1 * nf2; // flat index into fw, x fastest
+      const int sharedidx = i + j * (bin_size_x + rounded_ns) +
+                            k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns);
+      atomicAddComplexGlobal<T>(fw + outidx, fwshared[sharedidx]); // add the tile cell into the global grid
     }
+  }
 }
 
 /* Kernels for BlockGather Method */
-template <typename T>
-__global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx,
-                                           int nobiny, int nobinz, int binsperobinx, int binsperobiny, int binsperobinz,
-                                           int *bin_size, const T *x, const T *y, const T *z, int *sortidx,
-                                           int nf1, int nf2, int nf3) {
-    int binidx, binx, biny, binz;
-    int oldidx;
-    T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        x_rescaled = fold_rescale(x[i], nf1);
-        y_rescaled = fold_rescale(y[i], nf2);
-        z_rescaled = fold_rescale(z[i], nf3);
-        binx = floor(x_rescaled / bin_size_x);
-        biny = floor(y_rescaled / bin_size_y);
-        binz = floor(z_rescaled / bin_size_z);
-        binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1);
-        biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1);
-        binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1);
-
-        binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny,
-                                           binsperobinz);
-        oldidx = atomicAdd(&bin_size[binidx], 1);
-        sortidx[i] = oldidx;
-    }
+template<typename T>
+__global__ void locate_nupts_to_bins_ghost( // counts the points falling in each bin and records each point's arrival order within its bin
+    int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny,
+    int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size,
+    const T *x, const T *y, const T *z, int *sortidx, int nf1, int nf2, int nf3) {
+  int binidx, binx, biny, binz;
+  int oldidx;
+  T x_rescaled, y_rescaled, z_rescaled;
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; // grid-stride loop over the M points
+       i += gridDim.x * blockDim.x) {
+    x_rescaled = fold_rescale(x[i], nf1);
+    y_rescaled = fold_rescale(y[i], nf2);
+    z_rescaled = fold_rescale(z[i], nf3);
+    binx       = floor(x_rescaled / bin_size_x); // bin coordinate along x before the remap below
+    biny       = floor(y_rescaled / bin_size_y);
+    binz       = floor(z_rescaled / bin_size_z);
+    binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); // remap into groups of binsperobinx slots using only slots 1 .. binsperobinx - 2; presumably slot 0 and the last slot are ghost bins - TODO confirm
+    biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1);
+    binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1);
+
+    binidx     = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, // flat bin index; helper is defined outside this chunk
+                                           binsperobinx, binsperobiny, binsperobinz);
+    oldidx     = atomicAdd(&bin_size[binidx], 1); // atomicAdd returns the count before the increment
+    sortidx[i] = oldidx; // arrival order of point i within its bin
+  }
 }
 
-template <typename T>
-__global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z,
-                                                        int nobinx, int nobiny, int nobinz, int binsperobinx,
-                                                        int binsperobiny, int binsperobinz, int *bin_startpts,
-                                                        const int *sortidx, const T *x, const T *y, const T *z,
-                                                        int *index, int nf1, int nf2, int nf3) {
-    int binx, biny, binz;
-    int binidx;
-    T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        x_rescaled = fold_rescale(x[i], nf1);
-        y_rescaled = fold_rescale(y[i], nf2);
-        z_rescaled = fold_rescale(z[i], nf3);
-        binx = floor(x_rescaled / bin_size_x);
-        biny = floor(y_rescaled / bin_size_y);
-        binz = floor(z_rescaled / bin_size_z);
-        binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1);
-        biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1);
-        binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1);
-
-        binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny,
-                                           binsperobinz);
-
-        index[bin_startpts[binidx] + sortidx[i]] = i;
-    }
+template<typename T>
+__global__ void calc_inverse_of_global_sort_index_ghost( // writes each point index i into index[] at the slot given by its bin start plus sortidx[i]
+    int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny,
+    int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_startpts,
+    const int *sortidx, const T *x, const T *y, const T *z, int *index, int nf1, int nf2,
+    int nf3) {
+  int binx, biny, binz;
+  int binidx;
+  T x_rescaled, y_rescaled, z_rescaled;
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; // grid-stride loop over the M points
+       i += gridDim.x * blockDim.x) {
+    x_rescaled = fold_rescale(x[i], nf1);
+    y_rescaled = fold_rescale(y[i], nf2);
+    z_rescaled = fold_rescale(z[i], nf3);
+    binx       = floor(x_rescaled / bin_size_x); // bin coordinate along x before the remap below
+    biny       = floor(y_rescaled / bin_size_y);
+    binz       = floor(z_rescaled / bin_size_z);
+    binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); // same remap as locate_nupts_to_bins_ghost, so both kernels agree on binidx
+    biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1);
+    binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1);
+
+    binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, // flat bin index; helper is defined outside this chunk
+                                       binsperobinx, binsperobiny, binsperobinz);
+
+    index[bin_startpts[binidx] + sortidx[i]] = i; // slot = start of the bin + position of the point within the bin
+  }
 }
 
-template <typename T, int KEREVALMETH>
-__global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const cuda_complex<T> *c,
-                                       cuda_complex<T> *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta,
-                                       T sigma, const int *binstartpts, int obin_size_x, int obin_size_y,
-                                       int obin_size_z, int binsperobin, int *subprob_to_bin,
-                                       const int *subprobstartpts, int maxsubprobsize, int nobinx, int nobiny,
-                                       int nobinz, const int *idxnupts) {
-    extern __shared__ char sharedbuf[];
-    cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
-
-    int xstart, ystart, zstart, xend, yend, zend;
-    int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew;
-    int subpidx = blockIdx.x;
-    int obidx = subprob_to_bin[subpidx];
-    int bidx = obidx * binsperobin;
-
-    int obinsubp_idx = subpidx - subprobstartpts[obidx];
-    int ix, iy, iz;
-    int outidx;
-    int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize;
-    int nupts =
-        min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - obinsubp_idx * maxsubprobsize);
-
-    int xoffset = (obidx % nobinx) * obin_size_x;
-    int yoffset = (obidx / nobinx) % nobiny * obin_size_y;
-    int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z;
-
-    int N = obin_size_x * obin_size_y * obin_size_z;
-
-    T ker1[MAX_NSPREAD];
-    T ker2[MAX_NSPREAD];
-    T ker3[MAX_NSPREAD];
-
-    for (int i = threadIdx.x; i < N; i += blockDim.x) {
-        fwshared[i].x = 0.0;
-        fwshared[i].y = 0.0;
+template<typename T, int KEREVALMETH> // KEREVALMETH == 1 selects eval_kernel_vec_horner, any other value eval_kernel_vec
+__global__ void spread_3d_block_gather( // each block handles one subproblem of an outer bin: accumulate into a shared tile, then add the tile into fw
+    const T *x, const T *y, const T *z, const cuda_complex<T> *c, cuda_complex<T> *fw,
+    int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma,
+    const int *binstartpts, int obin_size_x, int obin_size_y, int obin_size_z,
+    int binsperobin, int *subprob_to_bin, const int *subprobstartpts, int maxsubprobsize,
+    int nobinx, int nobiny, int nobinz, const int *idxnupts) {
+  extern __shared__ char sharedbuf[]; // dynamic shared memory; must hold at least N cuda_complex<T> values
+  cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
+  const int subpidx         = blockIdx.x;
+  const int obidx           = subprob_to_bin[subpidx]; // outer bin this block works on
+  const int bidx            = obidx * binsperobin; // first bin belonging to that outer bin
+
+  const int obinsubp_idx = subpidx - subprobstartpts[obidx]; // index of this subproblem within its outer bin
+  const int ptstart      = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; // first idxnupts entry of this subproblem
+  const int nupts = // capped at maxsubprobsize; the outer bin's points span binstartpts[bidx] .. binstartpts[bidx + binsperobin]
+      min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] -
+                              obinsubp_idx * maxsubprobsize);
+
+  const int xoffset = (obidx % nobinx) * obin_size_x; // obidx is decoded x-fastest, then y, then z
+  const int yoffset = (obidx / nobinx) % nobiny * obin_size_y;
+  const int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z;
+
+  const int N = obin_size_x * obin_size_y * obin_size_z; // tile has no padding: exactly one outer bin
+
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 3); // one allocation split into three arrays of ns values
+  auto *__restrict__ ker1 = ker;
+  auto *__restrict__ ker2 = ker + ns;
+  auto *__restrict__ ker3 = ker + ns + ns;
+#else
+  T ker1[MAX_NSPREAD]; // fallback: fixed-size per-axis arrays
+  T ker2[MAX_NSPREAD];
+  T ker3[MAX_NSPREAD];
+#endif
+  for (int i = threadIdx.x; i < N; i += blockDim.x) { // threads of the block zero the tile together
+    fwshared[i] = {0, 0};
+  }
+
+  __syncthreads(); // the tile must be fully zeroed before any thread accumulates into it
+
+  for (int i = threadIdx.x; i < nupts; i += blockDim.x) { // thread t handles points t, t + blockDim.x, ...
+    int nidx = idxnupts[ptstart + i]; // packs a point index (nidx % M) and a shift code (nidx / M)
+    int b    = nidx / M;
+    int box[3];
+    for (int &d : box) { // decode b as three base-3 digits, mapped 0 -> 0, 1 -> -1, 2 -> +1
+      d = b % 3;
+      if (d == 1) d = -1;
+      if (d == 2) d = 1;
+      b = b / 3;
+    }
+    const int ii          = nidx % M; // index used to read x, y, z and c
+    const auto x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; // shifted by -nf1, 0 or +nf1 according to box[0]
+    const auto y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2;
+    const auto z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3;
+    const auto cnow       = c[ii];
+    auto [xstart, xend]   = interval(ns, x_rescaled); // presumably first/last grid index under the kernel (replaces the removed ceil/floor code) - verify in interval()
+    auto [ystart, yend]   = interval(ns, y_rescaled);
+    auto [zstart, zend]   = interval(ns, z_rescaled);
+
+    const T x1 = T(xstart) - x_rescaled; // kernel argument, taken before the shift to tile-local indices
+    const T y1 = T(ystart) - y_rescaled;
+    const T z1 = T(zstart) - z_rescaled;
+
+    xstart -= xoffset; // from here on start/end are relative to the outer bin origin
+    ystart -= yoffset;
+    zstart -= zoffset;
+
+    xend -= xoffset;
+    yend -= yoffset;
+    zend -= zoffset;
+
+    if constexpr (KEREVALMETH == 1) {
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+      eval_kernel_vec_horner(ker3, z1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
+      eval_kernel_vec(ker3, z1, ns, es_c, es_beta);
     }
-    __syncthreads();
-
-    T x_rescaled, y_rescaled, z_rescaled;
-    cuda_complex<T> cnow;
-    for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
-        int nidx = idxnupts[ptstart + i];
-        int b = nidx / M;
-        int box[3];
-        for (int d = 0; d < 3; d++) {
-            box[d] = b % 3;
-            if (box[d] == 1)
-                box[d] = -1;
-            if (box[d] == 2)
-                box[d] = 1;
-            b = b / 3;
-        }
-        int ii = nidx % M;
-        x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1;
-        y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2;
-        z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3;
-        cnow = c[ii];
-
-        xstart = ceil(x_rescaled - ns / 2.0) - xoffset;
-        ystart = ceil(y_rescaled - ns / 2.0) - yoffset;
-        zstart = ceil(z_rescaled - ns / 2.0) - zoffset;
-        xend = floor(x_rescaled + ns / 2.0) - xoffset;
-        yend = floor(y_rescaled + ns / 2.0) - yoffset;
-        zend = floor(z_rescaled + ns / 2.0) - zoffset;
-
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma);
-            eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma);
-            eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta);
-            eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta);
-        }
 
-        xstartnew = xstart < 0 ? 0 : xstart;
-        ystartnew = ystart < 0 ? 0 : ystart;
-        zstartnew = zstart < 0 ? 0 : zstart;
-        xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend;
-        yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend;
-        zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend;
-
-        for (int zz = zstartnew; zz <= zendnew; zz++) {
-            T kervalue3 = ker3[zz - zstart];
-            for (int yy = ystartnew; yy <= yendnew; yy++) {
-                T kervalue2 = ker2[yy - ystart];
-                for (int xx = xstartnew; xx <= xendnew; xx++) {
-                    outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x;
-                    T kervalue1 = ker1[xx - xstart];
-                    atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3);
-                    atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3);
-                }
-            }
+    const auto xstartnew = xstart < 0 ? 0 : xstart; // clamp the index range to the tile [0, obin_size - 1]
+    const auto ystartnew = ystart < 0 ? 0 : ystart;
+    const auto zstartnew = zstart < 0 ? 0 : zstart;
+    const auto xendnew   = xend >= obin_size_x ? obin_size_x - 1 : xend;
+    const auto yendnew   = yend >= obin_size_y ? obin_size_y - 1 : yend;
+    const auto zendnew   = zend >= obin_size_z ? obin_size_z - 1 : zend;
+
+    for (int zz = zstartnew; zz <= zendnew; zz++) {
+      const T kervalue3 = ker3[zz - zstart]; // kernel arrays are indexed relative to the unclamped start
+      for (int yy = ystartnew; yy <= yendnew; yy++) {
+        const T kervalue2 = ker2[yy - ystart];
+        for (int xx = xstartnew; xx <= xendnew; xx++) {
+          const auto outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; // flat tile index, x fastest
+          const T kervalue1 = ker1[xx - xstart];
+          const cuda_complex<T> res{cnow.x * kervalue1 * kervalue2 * kervalue3,
+                                    cnow.y * kervalue1 * kervalue2 * kervalue3};
+          atomicAddComplexShared<T>(fwshared + outidx, res); // accumulate into the shared tile
         }
+      }
     }
-    __syncthreads();
-    /* write to global memory */
-    for (int n = threadIdx.x; n < N; n += blockDim.x) {
-        int i = n % obin_size_x;
-        int j = (n / obin_size_x) % obin_size_y;
-        int k = n / (obin_size_x * obin_size_y);
-
-        ix = xoffset + i;
-        iy = yoffset + j;
-        iz = zoffset + k;
-        outidx = ix + iy * nf1 + iz * nf1 * nf2;
-        atomicAdd(&fw[outidx].x, fwshared[n].x);
-        atomicAdd(&fw[outidx].y, fwshared[n].y);
-    }
+  }
+  __syncthreads(); // every contribution must be in the shared tile before it is written out
+  /* write to global memory */
+  for (int n = threadIdx.x; n < N; n += blockDim.x) {
+    int i = n % obin_size_x; // decode n into tile coordinates (i, j, k)
+    int j = (n / obin_size_x) % obin_size_y;
+    int k = n / (obin_size_x * obin_size_y);
+
+    const auto ix     = xoffset + i;
+    const auto iy     = yoffset + j;
+    const auto iz     = zoffset + k;
+    const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; // no wrap or bounds check here; assumes the outer bin lies inside the nf1 x nf2 x nf3 grid - TODO confirm
+    atomicAdd(&fw[outidx].x, fwshared[n].x); // NOTE(review): two scalar atomics here, whereas spread_3d_subprob uses atomicAddComplexGlobal
+    atomicAdd(&fw[outidx].y, fwshared[n].y);
+  }
 }
 
 /* ---------------------- 3d Interpolation Kernels ---------------------------*/
 /* Kernels for NUptsdriven Method */
-template <typename T, int KEREVALMETH>
-__global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_complex<T> *c,
-                                       const cuda_complex<T> *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c,
-                                       T es_beta, T sigma, int *idxnupts) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
-        T x_rescaled = fold_rescale(x[idxnupts[i]], nf1);
-        T y_rescaled = fold_rescale(y[idxnupts[i]], nf2);
-        T z_rescaled = fold_rescale(z[idxnupts[i]], nf3);
-
-        int xstart = ceil(x_rescaled - ns / 2.0);
-        int ystart = ceil(y_rescaled - ns / 2.0);
-        int zstart = ceil(z_rescaled - ns / 2.0);
-
-        int xend = floor(x_rescaled + ns / 2.0);
-        int yend = floor(y_rescaled + ns / 2.0);
-        int zend = floor(z_rescaled + ns / 2.0);
-
-        cuda_complex<T> cnow;
-        cnow.x = 0.0;
-        cnow.y = 0.0;
-
-        T ker1[MAX_NSPREAD];
-        T ker2[MAX_NSPREAD];
-        T ker3[MAX_NSPREAD];
-
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, xstart - x_rescaled, ns, sigma);
-            eval_kernel_vec_horner(ker2, ystart - y_rescaled, ns, sigma);
-            eval_kernel_vec_horner(ker3, zstart - z_rescaled, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, xstart - x_rescaled, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, ystart - y_rescaled, ns, es_c, es_beta);
-            eval_kernel_vec(ker3, zstart - z_rescaled, ns, es_c, es_beta);
-        }
+template<typename T, int KEREVALMETH> // KEREVALMETH == 1 selects eval_kernel_vec_horner, any other value eval_kernel_vec
+__global__ void interp_3d_nupts_driven( // for each point, sums kernel-weighted fw values around it and stores the result in c
+    const T *x, const T *y, const T *z, cuda_complex<T> *c, const cuda_complex<T> *fw,
+    int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts) {
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 3); // one allocation split into three arrays of ns values, made once before the loop
+  auto *__restrict__ ker1 = ker;
+  auto *__restrict__ ker2 = ker + ns;
+  auto *__restrict__ ker3 = ker + ns + ns;
+#else
+  T ker1[MAX_NSPREAD]; // fallback: fixed-size per-axis arrays
+  T ker2[MAX_NSPREAD];
+  T ker3[MAX_NSPREAD];
+#endif
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; // grid-stride loop over the M points
+       i += blockDim.x * gridDim.x) {
+    const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); // points are read through the idxnupts permutation
+    const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2);
+    const auto z_rescaled = fold_rescale(z[idxnupts[i]], nf3);
+
+    const auto [xstart, xend] = interval(ns, x_rescaled); // presumably first/last grid index under the kernel (replaces the removed ceil/floor code) - verify in interval()
+    const auto [ystart, yend] = interval(ns, y_rescaled);
+    const auto [zstart, zend] = interval(ns, z_rescaled);
+
+    const auto x1 = T(xstart) - x_rescaled; // kernel argument: first grid index minus the point coordinate
+    const auto y1 = T(ystart) - y_rescaled;
+    const auto z1 = T(zstart) - z_rescaled;
+
+    cuda_complex<T> cnow{0, 0}; // accumulator for this point
+
+    if constexpr (KEREVALMETH == 1) {
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+      eval_kernel_vec_horner(ker3, z1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
+      eval_kernel_vec(ker3, z1, ns, es_c, es_beta);
+    }
 
-        for (int zz = zstart; zz <= zend; zz++) {
-            T kervalue3 = ker3[zz - zstart];
-            int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz);
-            for (int yy = ystart; yy <= yend; yy++) {
-                T kervalue2 = ker2[yy - ystart];
-                int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy);
-                for (int xx = xstart; xx <= xend; xx++) {
-                    int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
-                    int inidx = ix + iy * nf1 + iz * nf2 * nf1;
-                    T kervalue1 = ker1[xx - xstart];
-                    cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3;
-                    cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3;
-                }
-            }
+    for (int zz = zstart; zz <= zend; zz++) {
+      const auto kervalue3 = ker3[zz - zstart];
+      int iz               = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); // periodic wrap: nf3 is added or subtracted at most once
+      for (int yy = ystart; yy <= yend; yy++) {
+        const auto kervalue2 = ker2[yy - ystart];
+        int iy               = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy);
+        for (int xx = xstart; xx <= xend; xx++) {
+          const int ix         = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx);
+          const int inidx      = ix + iy * nf1 + iz * nf2 * nf1; // flat index into fw, x fastest
+          const auto kervalue1 = ker1[xx - xstart];
+          cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; // weight is the product of the three 1D kernel values
+          cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3;
         }
-        c[idxnupts[i]].x = cnow.x;
-        c[idxnupts[i]].y = cnow.y;
+      }
     }
+    c[idxnupts[i]] = cnow; // store the accumulated value at the point's own index
+  }
 }
 
 /* Kernels for SubProb Method */
-template <typename T, int KEREVALMETH>
-__global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_complex<T> *c, const cuda_complex<T> *fw,
-                                  int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma,
-                                  const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y,
-                                  int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts,
-                                  const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz,
-                                  const int *idxnupts) {
-    extern __shared__ char sharedbuf[];
-    cuda_complex<T> *fwshared = (cuda_complex<T> *)sharedbuf;
-
-    int xstart, ystart, xend, yend, zstart, zend;
-    int subpidx = blockIdx.x;
-    int bidx = subprob_to_bin[subpidx];
-    int binsubp_idx = subpidx - subprobstartpts[bidx];
-    int ix, iy, iz;
-    int outidx;
-    int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
-    int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
-
-    int xoffset = (bidx % nbinx) * bin_size_x;
-    int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y;
-    int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z;
-
-    int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0));
-
-    for (int n = threadIdx.x; n < N; n += blockDim.x) {
-        int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0));
-        int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0));
-        int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)));
-
-        ix = xoffset - ceil(ns / 2.0) + i;
-        iy = yoffset - ceil(ns / 2.0) + j;
-        iz = zoffset - ceil(ns / 2.0) + k;
-        if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) {
-            ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix);
-            iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy);
-            iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz);
-            outidx = ix + iy * nf1 + iz * nf1 * nf2;
-            int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) +
-                            k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2);
-            fwshared[sharedidx].x = fw[outidx].x;
-            fwshared[sharedidx].y = fw[outidx].y;
-        }
+template<typename T, int KEREVALMETH>
+__global__ void interp_3d_subprob(
+    const T *x, const T *y, const T *z, cuda_complex<T> *c, const cuda_complex<T> *fw,
+    int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma,
+    const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y,
+    int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts,
+    const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz,
+    const int *idxnupts) {
+  extern __shared__ char sharedbuf[];
+  auto fwshared = (cuda_complex<T> *)sharedbuf;
+
+#if ALLOCA_SUPPORTED
+  auto ker                = (T *)alloca(sizeof(T) * ns * 3);
+  auto *__restrict__ ker1 = ker;
+  auto *__restrict__ ker2 = ker + ns;
+  auto *__restrict__ ker3 = ker + ns + ns;
+#else
+  T ker1[MAX_NSPREAD];
+  T ker2[MAX_NSPREAD];
+  T ker3[MAX_NSPREAD];
+#endif
+
+  const auto subpidx     = blockIdx.x;
+  const auto bidx        = subprob_to_bin[subpidx];
+  const auto binsubp_idx = subpidx - subprobstartpts[bidx];
+  const auto ptstart     = binstartpts[bidx] + binsubp_idx * maxsubprobsize;
+  const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize);
+
+  const auto xoffset = (bidx % nbinx) * bin_size_x;
+  const auto yoffset = ((bidx / nbinx) % nbiny) * bin_size_y;
+  const auto zoffset = (bidx / (nbinx * nbiny)) * bin_size_z;
+
+  const T ns_2f         = ns * T(.5);
+  const auto ns_2       = (ns + 1) / 2;
+  const auto rounded_ns = ns_2 * 2;
+
+  const int N =
+      (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns) * (bin_size_z + rounded_ns);
+
+  for (int n = threadIdx.x; n < N; n += blockDim.x) {
+    int i   = n % (bin_size_x + rounded_ns);
+    int j   = (n / (bin_size_x + rounded_ns)) % (bin_size_y + rounded_ns);
+    int k   = n / ((bin_size_x + rounded_ns) * (bin_size_y + rounded_ns));
+    auto ix = xoffset - ns_2 + i;
+    auto iy = yoffset - ns_2 + j;
+    auto iz = zoffset - ns_2 + k;
+    if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2) && iz < (nf3 + ns_2)) {
+      ix                = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix);
+      iy                = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy);
+      iz                = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz);
+      const auto outidx = ix + iy * nf1 + iz * nf1 * nf2;
+      int sharedidx     = i + j * (bin_size_x + rounded_ns) +
+                      k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns);
+      fwshared[sharedidx] = fw[outidx];
+    }
+  }
+  __syncthreads();
+
+  for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
+    const int idx         = ptstart + i;
+    const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
+    const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2);
+    const auto z_rescaled = fold_rescale(z[idxnupts[idx]], nf3);
+    cuda_complex<T> cnow{0, 0};
+
+    auto [xstart, xend] = interval(ns, x_rescaled);
+    auto [ystart, yend] = interval(ns, y_rescaled);
+    auto [zstart, zend] = interval(ns, z_rescaled);
+
+    const T x1 = T(xstart) - x_rescaled;
+    const T y1 = T(ystart) - y_rescaled;
+    const T z1 = T(zstart) - z_rescaled;
+
+    xstart -= xoffset;
+    ystart -= yoffset;
+    zstart -= zoffset;
+
+    xend -= xoffset;
+    yend -= yoffset;
+    zend -= zoffset;
+
+    if constexpr (KEREVALMETH == 1) {
+      eval_kernel_vec_horner(ker1, x1, ns, sigma);
+      eval_kernel_vec_horner(ker2, y1, ns, sigma);
+      eval_kernel_vec_horner(ker3, z1, ns, sigma);
+    } else {
+      eval_kernel_vec(ker1, x1, ns, es_c, es_beta);
+      eval_kernel_vec(ker2, y1, ns, es_c, es_beta);
+      eval_kernel_vec(ker3, z1, ns, es_c, es_beta);
     }
-    __syncthreads();
-    T ker1[MAX_NSPREAD];
-    T ker2[MAX_NSPREAD];
-    T ker3[MAX_NSPREAD];
-    T x_rescaled, y_rescaled, z_rescaled;
-    cuda_complex<T> cnow;
-    for (int i = threadIdx.x; i < nupts; i += blockDim.x) {
-        int idx = ptstart + i;
-        x_rescaled = fold_rescale(x[idxnupts[idx]], nf1);
-        y_rescaled = fold_rescale(y[idxnupts[idx]], nf2);
-        z_rescaled = fold_rescale(z[idxnupts[idx]], nf3);
-        cnow.x = 0.0;
-        cnow.y = 0.0;
-
-        xstart = ceil(x_rescaled - ns / 2.0) - xoffset;
-        ystart = ceil(y_rescaled - ns / 2.0) - yoffset;
-        zstart = ceil(z_rescaled - ns / 2.0) - zoffset;
-
-        xend = floor(x_rescaled + ns / 2.0) - xoffset;
-        yend = floor(y_rescaled + ns / 2.0) - yoffset;
-        zend = floor(z_rescaled + ns / 2.0) - zoffset;
-
-        if constexpr (KEREVALMETH == 1) {
-            eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma);
-            eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma);
-            eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma);
-        } else {
-            eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta);
-            eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta);
-            eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta);
-        }
 
-        for (int zz = zstart; zz <= zend; zz++) {
-            T kervalue3 = ker3[zz - zstart];
-            iz = zz + ceil(ns / 2.0);
-            for (int yy = ystart; yy <= yend; yy++) {
-                T kervalue2 = ker2[yy - ystart];
-                iy = yy + ceil(ns / 2.0);
-                for (int xx = xstart; xx <= xend; xx++) {
-                    ix = xx + ceil(ns / 2.0);
-                    outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) +
-                             iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2);
-                    T kervalue1 = ker1[xx - xstart];
-                    cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3;
-                    cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3;
-                }
-            }
+    for (int zz = zstart; zz <= zend; zz++) {
+      const auto kervalue3 = ker3[zz - zstart];
+      const auto iz        = zz + ns_2;
+      for (int yy = ystart; yy <= yend; yy++) {
+        const auto kervalue2 = ker2[yy - ystart];
+        const auto iy        = yy + ns_2;
+        for (int xx = xstart; xx <= xend; xx++) {
+          const auto ix     = xx + ns_2;
+          const auto outidx = ix + iy * (bin_size_x + rounded_ns) +
+                              iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns);
+          const auto kervalue1 = ker1[xx - xstart];
+          cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3;
+          cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3;
         }
-        c[idxnupts[idx]].x = cnow.x;
-        c[idxnupts[idx]].y = cnow.y;
+      }
     }
+    c[idxnupts[idx]] = cnow;
+  }
 }
 
 } // namespace spreadinterp
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index 77b86ae77..ae9431c31 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -1,7 +1,3 @@
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-
 set(PRECISION_INDEPENDENT_SRC
     precision_independent.cu utils.cpp
     ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp)
@@ -47,8 +43,12 @@ target_include_directories(cufinufft_common_objects
 set_target_properties(
   cufinufft_common_objects
   PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING}
-             CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES})
-
+             CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
+             CUDA_SEPARABLE_COMPILATION ON
+             CUDA_STANDARD 17
+             CUDA_STANDARD_REQUIRED ON)
+# C++ sources of this target are built as C++17 too (CUDA_STANDARD covers .cu).
+target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17)
 target_compile_options(
   cufinufft_common_objects
   PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${FINUFFT_CUDA_FLAGS}>)
@@ -58,24 +58,32 @@ target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
 set_target_properties(
   cufinufft_objects
   PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING}
-             CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES})
+             CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
+             CUDA_SEPARABLE_COMPILATION ON
+             CUDA_STANDARD 17
+             CUDA_STANDARD_REQUIRED ON)
+target_compile_features(cufinufft_objects PRIVATE cxx_std_17)
 target_compile_options(
   cufinufft_objects PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${FINUFFT_CUDA_FLAGS}>)
 
 if(FINUFFT_SHARED_LINKING)
   add_library(cufinufft SHARED $<TARGET_OBJECTS:cufinufft_common_objects>
                                $<TARGET_OBJECTS:cufinufft_objects>)
-  set_target_properties(
-    cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-                         CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES})
 else()
   add_library(cufinufft STATIC $<TARGET_OBJECTS:cufinufft_common_objects>
                                $<TARGET_OBJECTS:cufinufft_objects>)
-  set_target_properties(
-    cufinufft PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-                         CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES})
 endif()
 
+set_target_properties(
+  cufinufft
+  PROPERTIES CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
+             CUDA_SEPARABLE_COMPILATION ON
+             CUDA_STANDARD 17
+             CUDA_STANDARD_REQUIRED ON
+             ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" # static archives
+             LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") # shared libraries
+target_compile_features(cufinufft PRIVATE cxx_std_17)
+
 if(WIN32)
   target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft
                                          CUDA::nvToolsExt)
diff --git a/src/cuda/common.cu b/src/cuda/common.cu
index c6bf8315d..b19986520 100644
--- a/src/cuda/common.cu
+++ b/src/cuda/common.cu
@@ -199,6 +199,119 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<doubl
   }
 }
 
+template<typename T>
+std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
+                                   int bin_size_z) {
+  // Bytes of shared memory for one padded bin in the SM spreader: every used axis
+  // is its bin size plus ns rounded up to even. Accumulated in std::size_t so the
+  // product of the extents cannot overflow int before the sizeof multiplication.
+  const std::size_t pad = static_cast<std::size_t>((ns + 1) / 2) * 2;
+  std::size_t cells     = bin_size_x + pad;
+  if (dim == 1) {
+    return cells * sizeof(cuda_complex<T>);
+  }
+  cells *= bin_size_y + pad;
+  if (dim == 2) {
+    return cells * sizeof(cuda_complex<T>);
+  }
+
+  // any other dim value is treated as 3D
+  cells *= bin_size_z + pad;
+  return cells * sizeof(cuda_complex<T>);
+}
+
+// Largest binsize (used for every active dimension) whose padded bin still fits
+// in mem_size bytes according to shared_memory_required; 0 if even binsize 1
+// does not fit. TODO: this can be done without a loop by using a direct formula
+template<typename T> int find_bin_size(std::size_t mem_size, int dim, int ns) {
+  int binsize = 1; // Start with the smallest possible bin size
+  while (true) {
+    // Shared memory needed when all bin dimensions equal the current binsize
+    std::size_t required_memory =
+        shared_memory_required<T>(dim, ns, binsize, binsize, binsize);
+
+    // First size that no longer fits in the available memory...
+    if (required_memory > mem_size) {
+      // ...so the previous candidate was the largest one that did fit
+      return binsize - 1;
+    }
+
+    // Try the next larger bin size
+    binsize++;
+  }
+}
+
+template<typename T>
+void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
+  // Marco Barbone 07/26/24. Fills in the bin sizes in opts that are still 0, using
+  // the device's opt-in shared memory per block in 1D and in 2D with gpu_method 1,
+  // and hardcoded sizes otherwise (TODO: improve these). `type` is not used here.
+  // WARNING: This function does not check for CUDA errors, the caller should check
+  // and handle them.
+  int shared_mem_per_block{}, device_id{};
+  switch (dim) {
+  case 1: {
+    if (opts->gpu_binsizex == 0) {
+      cudaGetDevice(&device_id);
+      cudaDeviceGetAttribute(&shared_mem_per_block,
+                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
+      // CUDA errors are not checked here; they are left for the caller to handle.
+      // use 1/6 of the shared memory for the binsize
+      // From experiments on multiple GPUs this gives the best tradeoff.
+      // It is within 90% of the maximum performance for all GPUs tested.
+      shared_mem_per_block /= 6;
+      const int bin_size =
+          shared_mem_per_block / sizeof(cuda_complex<T>) - ((ns + 1) / 2) * 2;
+      opts->gpu_binsizex = bin_size;
+    }
+    opts->gpu_binsizey = 1;
+    opts->gpu_binsizez = 1;
+  } break;
+  case 2: {
+    if (opts->gpu_binsizex == 0 || opts->gpu_binsizey == 0) {
+      switch (opts->gpu_method) {
+      case 0:
+      case 2: {
+        opts->gpu_binsizex = 32;
+        opts->gpu_binsizey = 32;
+      } break;
+      case 1: {
+        cudaGetDevice(&device_id);
+        cudaDeviceGetAttribute(&shared_mem_per_block,
+                               cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
+        const auto binsize = find_bin_size<T>(shared_mem_per_block, dim, ns);
+        // in 2D 1/6 is too small, it gets slower because of the excessive padding
+        opts->gpu_binsizex = binsize;
+        opts->gpu_binsizey = binsize;
+      } break;
+      }
+    }
+    opts->gpu_binsizez = 1;
+  } break;
+  case 3: {
+    switch (opts->gpu_method) {
+    case 0:
+    case 1:
+    case 2: {
+      if (opts->gpu_binsizex == 0 || opts->gpu_binsizey == 0 || opts->gpu_binsizez == 0) {
+        opts->gpu_binsizex = 16;
+        opts->gpu_binsizey = 16;
+        opts->gpu_binsizez = 2;
+      }
+    } break;
+    case 4: {
+      opts->gpu_obinsizex = (opts->gpu_obinsizex == 0) ? 8 : opts->gpu_obinsizex;
+      opts->gpu_obinsizey = (opts->gpu_obinsizey == 0) ? 8 : opts->gpu_obinsizey;
+      opts->gpu_obinsizez = (opts->gpu_obinsizez == 0) ? 8 : opts->gpu_obinsizez;
+      opts->gpu_binsizex  = (opts->gpu_binsizex == 0) ? 4 : opts->gpu_binsizex;
+      opts->gpu_binsizey  = (opts->gpu_binsizey == 0) ? 4 : opts->gpu_binsizey;
+      opts->gpu_binsizez  = (opts->gpu_binsizez == 0) ? 4 : opts->gpu_binsizez;
+    } break;
+    }
+  } break;
+  }
+}
+
 template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f,
                                             std::complex<double> *a, float *fwkerhalf,
                                             finufft_spread_opts opts);
@@ -227,5 +340,15 @@ template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf,
                                     finufft_spread_opts opts);
 template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf,
                                     finufft_spread_opts opts);
+
+template std::size_t shared_memory_required<float>(int dim, int ns, int bin_size_x,
+                                                   int bin_size_y, int bin_size_z);
+template std::size_t shared_memory_required<double>(int dim, int ns, int bin_size_x,
+                                                    int bin_size_y, int bin_size_z);
+
+template void cufinufft_setup_binsize<float>(int type, int ns, int dim,
+                                             cufinufft_opts *opts);
+template void cufinufft_setup_binsize<double>(int type, int ns, int dim,
+                                              cufinufft_opts *opts);
 } // namespace common
 } // namespace cufinufft
diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu
index c0066d049..c00bf8eba 100644
--- a/src/cuda/cufinufft.cu
+++ b/src/cuda/cufinufft.cu
@@ -102,26 +102,26 @@ void cufinufft_default_opts(cufinufft_opts *opts)
 {
   // sphinx tag (don't remove): @gpu_defopts_start
   // data handling opts...
-  opts->modeord = 0;
+  opts->modeord       = 0;
   opts->gpu_device_id = 0;
 
   // diagnostic opts...
   opts->gpu_spreadinterponly = 0;
 
   // algorithm performance opts...
-  opts->gpu_method = 0;
-  opts->gpu_sort = 1;
-  opts->gpu_kerevalmeth = 1;
-  opts->upsampfac = 2.0;
+  opts->gpu_method         = 0;
+  opts->gpu_sort           = 1;
+  opts->gpu_kerevalmeth    = 1;
+  opts->upsampfac          = 2.0;
   opts->gpu_maxsubprobsize = 1024;
-  opts->gpu_obinsizex      = -1;
-  opts->gpu_obinsizey      = -1;
-  opts->gpu_obinsizez      = -1;
-  opts->gpu_binsizex = -1;
-  opts->gpu_binsizey = -1;
-  opts->gpu_binsizez = -1;
-  opts->gpu_maxbatchsize = 0;
-  opts->gpu_stream       = cudaStreamDefault;
+  opts->gpu_obinsizex      = 0;
+  opts->gpu_obinsizey      = 0;
+  opts->gpu_obinsizez      = 0;
+  opts->gpu_binsizex       = 0;
+  opts->gpu_binsizey       = 0;
+  opts->gpu_binsizez       = 0;
+  opts->gpu_maxbatchsize   = 0;
+  opts->gpu_stream         = cudaStreamDefault;
   // sphinx tag (don't remove): @gpu_defopts_end
 }
 }
diff --git a/src/cuda/precision_independent.cu b/src/cuda/precision_independent.cu
index 66cc5ca69..b2c0c292f 100644
--- a/src/cuda/precision_independent.cu
+++ b/src/cuda/precision_independent.cu
@@ -52,13 +52,6 @@ __global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstart
   }
 }
 
-__global__ void trivial_global_sort_index_1d(int M, int *index) {
-  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
-       i += gridDim.x * blockDim.x) {
-    index[i] = i;
-  }
-}
-
 /* spreadinterp 2d */
 __global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize,
                                 int numbins) {
diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp
index 6ff91f8ca..98b5382bc 100644
--- a/src/cuda/spreadinterp.cpp
+++ b/src/cuda/spreadinterp.cpp
@@ -22,7 +22,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet
 // Must call before any kernel evals done.
 // Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h)
 {
-  if (upsampfac != 2.0) { // nonstandard sigma
+  if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma
     if (kerevalmeth == 1) {
       fprintf(stderr,
               "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n",
@@ -69,7 +69,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet
     ier = FINUFFT_WARN_EPS_TOO_SMALL;
   }
   opts.nspread      = ns;
-  opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner)
+  opts.ES_halfwidth = T(ns * .5); // constants to help ker eval (except Horner)
   opts.ES_c         = 4.0 / (T)(ns * ns);
 
   T betaoverns = 2.30;            // gives decent betas for default sigma=2.0
diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt
index 8d77d9fdc..d9c5d312b 100644
--- a/test/cuda/CMakeLists.txt
+++ b/test/cuda/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 file(GLOB test_src "*.c*")
 
 foreach(srcfile ${test_src})
@@ -6,79 +5,83 @@ foreach(srcfile ${test_src})
   get_filename_component(executable ${executable} NAME)
   add_executable(${executable} ${srcfile})
   target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-  target_link_libraries(${executable} PUBLIC cufinufft m)
-  set_target_properties(${executable} PROPERTIES
-          LINKER_LANGUAGE CUDA
-          CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
-  )
+  # Always link cufinufft; libm is added only when find_library located it.
+  find_library(MathLib m)
+  target_link_libraries(${executable} PUBLIC cufinufft
+                                             $<$<BOOL:${MathLib}>:${MathLib}>)
+  target_compile_features(${executable} PUBLIC cxx_std_17)
+  set_target_properties(
+    ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES
+                                                  ${FINUFFT_CUDA_ARCHITECTURES})
   message(STATUS "Adding test ${executable}"
-          " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}"
-          " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}"
-  )
+                 " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}"
+                 " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}")
 endforeach()
 
-function(add_tests PREC REQ_TOL CHECK_TOL)
-  add_test(
-    NAME cufinufft1d1_test_GM_${PREC}
-    COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
+  add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
 
-  add_test(
-    NAME cufinufft1d1_test_SM_${PREC}
-    COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft1d1_test_SM_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
 
-  add_test(
-    NAME cufinufft1d2_test_GM_${PREC}
-    COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
 
-  add_test(
-    NAME cufinufft2d1_test_GM_${PREC}
-    COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP}
+           COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
 
-  add_test(
-    NAME cufinufft2d1_test_SM_${PREC}
-    COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft2d1_test_SM_${PREC}_${UPSAMP}
+           COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
 
-  add_test(
-    NAME cufinufft2d1many_test_GM_${PREC}
-    COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft2d1many_test_GM_${PREC}_${UPSAMP}
+           COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL}
+                   ${CHECK_TOL} ${PREC} ${UPSAMP})
 
-  add_test(
-    NAME cufinufft2d1many_test_SM_${PREC}
-    COMMAND cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft2d1many_test_SM_${PREC}_${UPSAMP}
+           COMMAND cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL}
+                   ${CHECK_TOL} ${PREC} ${UPSAMP})
 
-  add_test(
-    NAME cufinufft2d2many_test_GM_${PREC}
-    COMMAND cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft2d2many_test_GM_${PREC}_${UPSAMP}
+           COMMAND cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL}
+                   ${CHECK_TOL} ${PREC} ${UPSAMP})
 
-  add_test(
-    NAME cufinufft2d2many_test_SM_${PREC}
-    COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft2d2many_test_SM_${PREC}_${UPSAMP}
+           COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL}
+                   ${CHECK_TOL} ${PREC} ${UPSAMP})
 
-  add_test(
-    NAME cufinufft3d1_test_GM_${PREC}
-    COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP}
+           COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
 
-  if (${PREC} STREQUAL "float")
-    add_test(
-      NAME cufinufft3d1_test_SM_${PREC}
-      COMMAND cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  if(${PREC} STREQUAL "float")
+    add_test(NAME cufinufft3d1_test_SM_${PREC}_${UPSAMP}
+             COMMAND cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                     ${PREC} ${UPSAMP})
 
-    add_test(
-      NAME cufinufft3d1_test_block_${PREC}
-      COMMAND cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+    add_test(NAME cufinufft3d1_test_block_${PREC}_${UPSAMP}
+             COMMAND cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                     ${PREC} ${UPSAMP})
 
-    add_test(
-      NAME cufinufft3d2_test_SM_${PREC}
-      COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+    add_test(NAME cufinufft3d2_test_SM_${PREC}_${UPSAMP}
+             COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                     ${PREC} ${UPSAMP})
   endif()
 
-  add_test(
-    NAME cufinufft3d2_test_GM_${PREC}
-    COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} ${PREC})
+  add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP}
+           COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
 endfunction()
 
-add_tests(float 1e-5 2e-4)
-add_tests(double 1e-12 1e-11)
+add_tests(float 1e-5 2e-4 2.0)
+add_tests(double 1e-12 1e-11 2.0)
+add_tests(float 1e-5 2e-4 1.25)
+add_tests(double 1e-8 1e-7 1.25)
 
 add_test(NAME cufinufft_public_api COMMAND public_api_test)
 add_test(NAME cufinufft_makeplan COMMAND test_makeplan)
diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu
index 05b62025e..dbd6260ac 100644
--- a/test/cuda/cufinufft1d_test.cu
+++ b/test/cuda/cufinufft1d_test.cu
@@ -17,7 +17,8 @@
 using cufinufft::utils::infnorm;
 
 template<typename T>
-int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) {
+int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag,
+             double upsampfac) {
   std::cout << std::scientific << std::setprecision(3);
   int ier;
 
@@ -88,6 +89,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag)
 
   opts.gpu_method       = method;
   opts.gpu_maxbatchsize = 1;
+  opts.upsampfac        = upsampfac;
 
   int nmodes[3] = {N1, 1, 1};
   int ntransf   = 1;
@@ -178,7 +180,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag)
 }
 
 int main(int argc, char *argv[]) {
-  if (argc != 8) {
+  if (argc != 9) {
     fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n"
                     "Arguments:\n"
                     "  method: One of\n"
@@ -188,21 +190,23 @@ int main(int argc, char *argv[]) {
                     "  M: The number of non-uniform points\n"
                     "  tol: NUFFT tolerance\n"
                     "  checktol:  relative error to pass test\n"
-                    "  precision: f or d\n");
+                    "  precision: f or d\n"
+                    "  upsampfac: upsampling factor\n");
     return 1;
   }
-  const int method      = atoi(argv[1]);
-  const int type        = atoi(argv[2]);
-  const int N1          = atof(argv[3]);
-  const int M           = atof(argv[4]);
-  const double tol      = atof(argv[5]);
-  const double checktol = atof(argv[6]);
-  const int iflag       = 1;
-  const char prec       = argv[7][0];
+  const int method       = atoi(argv[1]);
+  const int type         = atoi(argv[2]);
+  const int N1           = atof(argv[3]);
+  const int M            = atof(argv[4]);
+  const double tol       = atof(argv[5]);
+  const double checktol  = atof(argv[6]);
+  const int iflag        = 1;
+  const char prec        = argv[7][0];
+  const double upsampfac = atof(argv[8]);
   if (prec == 'f')
-    return run_test<float>(method, type, N1, M, tol, checktol, iflag);
+    return run_test<float>(method, type, N1, M, tol, checktol, iflag, upsampfac);
   else if (prec == 'd')
-    return run_test<double>(method, type, N1, M, tol, checktol, iflag);
+    return run_test<double>(method, type, N1, M, tol, checktol, iflag, upsampfac);
   else
     return -1;
 }
diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu
index 4157f6230..f3b767f2e 100644
--- a/test/cuda/cufinufft2d_test.cu
+++ b/test/cuda/cufinufft2d_test.cu
@@ -18,7 +18,8 @@
 using cufinufft::utils::infnorm;
 
 template<typename T>
-int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag) {
+int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag,
+             double upsampfac) {
   std::cout << std::scientific << std::setprecision(3);
 
   thrust::host_vector<T> x(M), y(M);
@@ -88,9 +89,9 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int
 
   opts.gpu_method       = method;
   opts.gpu_maxbatchsize = 1;
-
-  int nmodes[3] = {N1, N2, 1};
-  int ntransf   = 1;
+  opts.upsampfac        = upsampfac;
+  int nmodes[3]         = {N1, N2, 1};
+  int ntransf           = 1;
   cudaEventRecord(start);
   int ier =
       cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
@@ -178,7 +179,7 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int
 }
 
 int main(int argc, char *argv[]) {
-  if (argc != 9) {
+  if (argc != 10) {
     fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n"
                     "Arguments:\n"
                     "  method: One of\n"
@@ -189,23 +190,25 @@ int main(int argc, char *argv[]) {
                     "  M: The number of non-uniform points\n"
                     "  tol: NUFFT tolerance\n"
                     "  checktol: relative error to pass test\n"
-                    "  prec:  'f' or 'd' (float/double)\n");
+                    "  prec:  'f' or 'd' (float/double)\n"
+                    "  upsampfac: upsampling factor\n");
     return 1;
   }
-  const int method      = atoi(argv[1]);
-  const int type        = atoi(argv[2]);
-  const int N1          = atof(argv[3]);
-  const int N2          = atof(argv[4]);
-  const int M           = atof(argv[5]);
-  const double tol      = atof(argv[6]);
-  const double checktol = atof(argv[7]);
-  const char prec       = argv[8][0];
-  const int iflag       = 1;
+  const int method       = atoi(argv[1]);
+  const int type         = atoi(argv[2]);
+  const int N1           = atof(argv[3]);
+  const int N2           = atof(argv[4]);
+  const int M            = atof(argv[5]);
+  const double tol       = atof(argv[6]);
+  const double checktol  = atof(argv[7]);
+  const char prec        = argv[8][0];
+  const double upsampfac = atof(argv[9]);
+  const int iflag        = 1;
 
   if (prec == 'f')
-    return run_test<float>(method, type, N1, N2, M, tol, checktol, iflag);
+    return run_test<float>(method, type, N1, N2, M, tol, checktol, iflag, upsampfac);
   else if (prec == 'd')
-    return run_test<double>(method, type, N1, N2, M, tol, checktol, iflag);
+    return run_test<double>(method, type, N1, N2, M, tol, checktol, iflag, upsampfac);
   else
     return -1;
 }
diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu
index b4f3529e1..4afcd97dd 100644
--- a/test/cuda/cufinufft2dmany_test.cu
+++ b/test/cuda/cufinufft2dmany_test.cu
@@ -19,7 +19,7 @@ using cufinufft::utils::infnorm;
 
 template<typename T>
 int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M,
-             T tol, T checktol, int iflag) {
+             T tol, T checktol, int iflag, double upsampfac) {
   std::cout << std::scientific << std::setprecision(3);
 
   int ier;
@@ -93,6 +93,7 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
 
   opts.gpu_method       = method;
   opts.gpu_maxbatchsize = maxbatchsize;
+  opts.upsampfac        = upsampfac;
 
   int nmodes[3] = {N1, N2, 1};
   cudaEventRecord(start);
@@ -184,7 +185,7 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize
 }
 
 int main(int argc, char *argv[]) {
-  if (argc != 11) {
+  if (argc != 12) {
     fprintf(stderr,
             "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M "
             "tol checktol prec\n"
@@ -199,7 +200,8 @@ int main(int argc, char *argv[]) {
             "  M: The number of non-uniform points\n"
             "  tol: NUFFT tolerance\n"
             "  checktol: relative error to pass test\n"
-            "  prec:  'f' or 'd' (float/double)\n");
+            "  prec:  'f' or 'd' (float/double)\n"
+            "  upsampfac: upsampling factor\n");
     return 1;
   }
   const int method       = atoi(argv[1]);
@@ -212,14 +214,15 @@ int main(int argc, char *argv[]) {
   const double tol       = atof(argv[8]);
   const double checktol  = atof(argv[9]);
   const char prec        = argv[10][0];
+  const double upsampfac = atof(argv[11]);
   const int iflag        = 1;
 
   if (prec == 'f')
     return run_test<float>(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol,
-                           iflag);
+                           iflag, upsampfac);
   else if (prec == 'd')
     return run_test<double>(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol,
-                            iflag);
+                            iflag, upsampfac);
   else
     return -1;
 }
diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu
index 933dda36d..67818c2b2 100644
--- a/test/cuda/cufinufft3d_test.cu
+++ b/test/cuda/cufinufft3d_test.cu
@@ -19,7 +19,7 @@ using cufinufft::utils::infnorm;
 
 template<typename T>
 int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol,
-             int iflag) {
+             int iflag, double upsampfac) {
   std::cout << std::scientific << std::setprecision(3);
   int ier;
 
@@ -94,9 +94,9 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check
   opts.gpu_method       = method;
   opts.gpu_kerevalmeth  = 1;
   opts.gpu_maxbatchsize = 1;
-
-  int nmodes[3] = {N1, N2, N3};
-  int ntransf   = 1;
+  opts.upsampfac        = upsampfac;
+  int nmodes[3]         = {N1, N2, N3};
+  int ntransf           = 1;
 
   cudaEventRecord(start);
   ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
@@ -190,7 +190,7 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check
 }
 
 int main(int argc, char *argv[]) {
-  if (argc < 10) {
+  if (argc != 11) {
     fprintf(stderr,
             "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n"
             "Arguments:\n"
@@ -203,24 +203,26 @@ int main(int argc, char *argv[]) {
             "  M: The number of non-uniform points\n"
             "  tol: NUFFT tolerance\n"
             "  checktol:  relative error to pass test\n"
-            "  prec:  'f' or 'd' (float/double)\n");
+            "  prec:  'f' or 'd' (float/double)\n"
+            "  upsampfac: upsampling factor\n");
     return 1;
   }
-  const int method      = atoi(argv[1]);
-  const int type        = atoi(argv[2]);
-  const int N1          = atof(argv[3]);
-  const int N2          = atof(argv[4]);
-  const int N3          = atof(argv[5]);
-  const int M           = atof(argv[6]);
-  const double tol      = atof(argv[7]);
-  const double checktol = atof(argv[8]);
-  const char prec       = argv[9][0];
-  const int iflag       = 1;
+  const int method       = atoi(argv[1]);
+  const int type         = atoi(argv[2]);
+  const int N1           = atof(argv[3]);
+  const int N2           = atof(argv[4]);
+  const int N3           = atof(argv[5]);
+  const int M            = atof(argv[6]);
+  const double tol       = atof(argv[7]);
+  const double checktol  = atof(argv[8]);
+  const char prec        = argv[9][0];
+  const double upsampfac = atof(argv[10]);
+  const int iflag        = 1;
 
   if (prec == 'f')
-    return run_test<float>(method, type, N1, N2, N3, M, tol, checktol, iflag);
+    return run_test<float>(method, type, N1, N2, N3, M, tol, checktol, iflag, upsampfac);
   else if (prec == 'd')
-    return run_test<double>(method, type, N1, N2, N3, M, tol, checktol, iflag);
+    return run_test<double>(method, type, N1, N2, N3, M, tol, checktol, iflag, upsampfac);
   else
     return -1;
 }